diff --git "a/training_log.txt" "b/training_log.txt"
deleted file mode 100644--- "a/training_log.txt"
+++ /dev/null
@@ -1,62276 +0,0 @@
-[2025-01-25 08:44:40,912] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[2025-01-25 08:44:47,654] [INFO] [comm.py:637:init_distributed] cdb=None
-[2025-01-25 08:44:47,654] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
-01/25/2025 08:44:47 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False
-01/25/2025 08:44:47 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
-_n_gpu=1,
-accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
-adafactor=False,
-adam_beta1=0.9,
-adam_beta2=0.999,
-adam_epsilon=1e-08,
-auto_find_batch_size=False,
-batch_eval_metrics=False,
-bf16=True,
-bf16_full_eval=False,
-data_seed=None,
-dataloader_drop_last=False,
-dataloader_num_workers=4,
-dataloader_persistent_workers=False,
-dataloader_pin_memory=True,
-dataloader_prefetch_factor=None,
-ddp_backend=None,
-ddp_broadcast_buffers=None,
-ddp_bucket_cap_mb=None,
-ddp_find_unused_parameters=None,
-ddp_timeout=1800,
-debug=[],
-deepspeed=zero_stage1_config.json,
-disable_tqdm=False,
-dispatch_batches=None,
-do_eval=True,
-do_predict=False,
-do_train=True,
-eval_accumulation_steps=None,
-eval_delay=0,
-eval_do_concat_batches=True,
-eval_on_start=False,
-eval_steps=230,
-eval_strategy=steps,
-eval_use_gather_object=False,
-evaluation_strategy=steps,
-fp16=False,
-fp16_backend=auto,
-fp16_full_eval=False,
-fp16_opt_level=O1,
-fsdp=[],
-fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
-fsdp_min_num_params=0,
-fsdp_transformer_layer_cls_to_wrap=None,
-full_determinism=False,
-gradient_accumulation_steps=1,
-gradient_checkpointing=False,
-gradient_checkpointing_kwargs=None,
-greater_is_better=None,
-group_by_length=True,
-half_precision_backend=auto,
-hub_always_push=False,
-hub_model_id=None,
-hub_private_repo=False,
-hub_strategy=every_save,
-hub_token=<HUB_TOKEN>,
-ignore_data_skip=False,
-include_inputs_for_metrics=False,
-include_num_input_tokens_seen=False,
-include_tokens_per_second=False,
-jit_mode_eval=False,
-label_names=None,
-label_smoothing_factor=0.0,
-learning_rate=4e-05,
-length_column_name=length,
-load_best_model_at_end=False,
-local_rank=0,
-log_level=passive,
-log_level_replica=warning,
-log_on_each_node=True,
-logging_dir=/DATA/env/wjr/newtrain/stage2/mos3/runs/Jan25_08-44-47_ps,
-logging_first_step=False,
-logging_nan_inf_filter=True,
-logging_steps=1.0,
-logging_strategy=steps,
-lr_scheduler_kwargs={},
-lr_scheduler_type=cosine,
-max_grad_norm=1.0,
-max_steps=-1,
-metric_for_best_model=None,
-mp_parameters=,
-neftune_noise_alpha=None,
-no_cuda=False,
-num_train_epochs=50.0,
-optim=adamw_torch,
-optim_args=None,
-optim_target_modules=None,
-output_dir=/DATA/env/wjr/newtrain/stage2/mos3,
-overwrite_output_dir=True,
-past_index=-1,
-per_device_eval_batch_size=1,
-per_device_train_batch_size=4,
-prediction_loss_only=False,
-push_to_hub=False,
-push_to_hub_model_id=None,
-push_to_hub_organization=None,
-push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
-ray_scope=last,
-remove_unused_columns=True,
-report_to=['tensorboard'],
-restore_callback_states_from_checkpoint=False,
-resume_from_checkpoint=None,
-run_name=/DATA/env/wjr/newtrain/stage2/mos3,
-save_on_each_node=False,
-save_only_model=False,
-save_safetensors=True,
-save_steps=40000,
-save_strategy=steps,
-save_total_limit=1,
-seed=42,
-skip_memory_metrics=True,
-split_batches=None,
-tf32=None,
-torch_compile=False,
-torch_compile_backend=None,
-torch_compile_mode=None,
-torch_empty_cache_steps=None,
-torchdynamo=None,
-tpu_metrics_debug=False,
-tpu_num_cores=None,
-use_cpu=False,
-use_ipex=False,
-use_legacy_prediction_loop=False,
-use_liger_kernel=False,
-use_mps_device=False,
-warmup_ratio=0.03,
-warmup_steps=0,
-weight_decay=0.01,
-)
-01/25/2025 08:44:47 - INFO - __main__ - Loading Tokenizer: /DATA/env/wjr/newtrain/stage1/mos3
-[INFO|tokenization_utils_base.py:2204] 2025-01-25 08:44:47,858 >> loading file ./tokenizer.model
-[INFO|tokenization_utils_base.py:2204] 2025-01-25 08:44:47,858 >> loading file added_tokens.json
-[INFO|tokenization_utils_base.py:2204] 2025-01-25 08:44:47,858 >> loading file special_tokens_map.json
-[INFO|tokenization_utils_base.py:2204] 2025-01-25 08:44:47,858 >> loading file tokenizer_config.json
-[INFO|tokenization_utils_base.py:2204] 2025-01-25 08:44:47,858 >> loading file tokenizer.json
-[INFO|tokenization_utils_base.py:2470] 2025-01-25 08:44:48,457 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-01/25/2025 08:44:48 - INFO - __main__ - Loading InternVLChatModel...
-[INFO|configuration_utils.py:673] 2025-01-25 08:44:49,000 >> loading configuration file /DATA/env/wjr/newtrain/stage1/mos3/config.json
-[INFO|configuration_utils.py:742] 2025-01-25 08:44:49,002 >> Model config InternVLChatConfig {
-  "_commit_hash": null,
-  "_name_or_path": "/DATA/DATA1/wjr/intern/InternVL/internvl_chat/InternVL2-8B",
-  "architectures": [
-    "InternVLChatModel"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
-    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
-    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
-  },
-  "downsample_ratio": 0.5,
-  "dynamic_image_size": true,
-  "force_image_size": 448,
-  "llm_config": {
-    "_name_or_path": "internlm/internlm2_5-7b-chat",
-    "add_cross_attention": false,
-    "architectures": [
-      "InternLM2ForCausalLM"
-    ],
-    "attn_implementation": "flash_attention_2",
-    "auto_map": {
-      "AutoConfig": "configuration_internlm2.InternLM2Config",
-      "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
-      "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
-    },
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bias": false,
-    "bos_token_id": 1,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "silu",
-    "hidden_size": 4096,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_range": 0.02,
-    "intermediate_size": 14336,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "max_position_embeddings": 32768,
-    "min_length": 0,
-    "model_type": "internlm2",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 32,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 32,
-    "num_key_value_heads": 8,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 2,
-    "prefix": null,
-    "pretraining_tp": 1,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "rms_norm_eps": 1e-05,
-    "rope_scaling": {
-      "factor": 2.0,
-      "type": "dynamic"
-    },
-    "rope_theta": 1000000,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": false,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": "bfloat16",
-    "torchscript": false,
-    "transformers_version": "4.45.2",
-    "typical_p": 1.0,
-    "use_bfloat16": true,
-    "use_cache": false,
-    "vocab_size": 92553
-  },
-  "max_dynamic_patch": 6,
-  "min_dynamic_patch": 1,
-  "model_type": "internvl_chat",
-  "pad2square": false,
-  "ps_version": "v2",
-  "select_layer": -1,
-  "template": "internlm2-chat",
-  "torch_dtype": "bfloat16",
-  "transformers_version": null,
-  "use_backbone_lora": 0,
-  "use_llm_lora": 0,
-  "use_thumbnail": true,
-  "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": [
-      "InternVisionModel"
-    ],
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "drop_path_rate": 0.1,
-    "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 448,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-06,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "intern_vit_6b",
-    "no_repeat_ngram_size": 0,
-    "norm_type": "layer_norm",
-    "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
-    "num_hidden_layers": 24,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "qk_normalization": false,
-    "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": "bfloat16",
-    "torchscript": false,
-    "transformers_version": "4.45.2",
-    "typical_p": 1.0,
-    "use_bfloat16": true,
-    "use_flash_attn": true
-  }
-}
-
-01/25/2025 08:44:49 - INFO - __main__ - Using flash_attention_2 for InternLM
-[INFO|modeling_utils.py:3729] 2025-01-25 08:44:49,004 >> loading weights file /DATA/env/wjr/newtrain/stage1/mos3/model.safetensors.index.json
-[INFO|modeling_utils.py:1622] 2025-01-25 08:44:49,005 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16.
-[INFO|configuration_utils.py:1099] 2025-01-25 08:44:49,007 >> Generate config GenerationConfig {}
-
-this model
-[WARNING|logging.py:328] 2025-01-25 08:44:49,080 >> InternLM2ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
-  - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes
-  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
-  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
-[INFO|configuration_utils.py:1099] 2025-01-25 08:44:49,081 >> Generate config GenerationConfig {
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "pad_token_id": 2,
-  "use_cache": false
-}
-
-motion_mlp.weight1 Parameter containing:
-tensor([[0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        ...,
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)
-motion_mlp.weight2 Parameter containing:
-tensor([[-0.0250,  0.0525, -0.0933,  ...,  0.0835, -0.0165,  0.0962],
-        [-0.0337,  0.0618,  0.0094,  ...,  0.0776, -0.0078,  0.0728],
-        [ 0.0063, -0.0234,  0.0991,  ..., -0.0688,  0.0063,  0.0649],
-        ...,
-        [-0.0728, -0.0703, -0.0273,  ...,  0.0179, -0.0728,  0.0757],
-        [ 0.0156, -0.0320,  0.0243,  ...,  0.0830, -0.0947, -0.0586],
-        [ 0.0835, -0.0352, -0.0962,  ..., -0.0211, -0.0195,  0.0086]],
-       requires_grad=True)
-motion_mlp.bias Parameter containing:
-tensor([0., 0., 0.,  ..., 0., 0., 0.], requires_grad=True)
-motion_mlp.weight1 Parameter containing:
-tensor([[0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        ...,
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.],
-        [0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)
-motion_mlp.weight2 Parameter containing:
-tensor([[ 0.0742, -0.0767,  0.0894,  ...,  0.0117,  0.0078, -0.0102],
-        [-0.0398, -0.0352, -0.0806,  ..., -0.0913,  0.0586, -0.0894],
-        [-0.0312,  0.0000,  0.0500,  ..., -0.0938,  0.0102,  0.0376],
-        ...,
-        [-0.0593, -0.0437,  0.0649,  ..., -0.0454, -0.0359,  0.0297],
-        [-0.0273, -0.0117,  0.0447,  ..., -0.0282,  0.0586, -0.0742],
-        [ 0.0698,  0.0454,  0.0728,  ...,  0.0258, -0.1001, -0.0625]],
-       requires_grad=True)
-motion_mlp.bias Parameter containing:
-tensor([0., 0., 0.,  ..., 0., 0., 0.], requires_grad=True)
-m.weight1 Parameter containing:
-tensor([[ 7.4506e-09,  2.1750e+01,  0.0000e+00,  ...,  4.7684e-05,
-          7.6702e+15,  3.8385e-05],
-        [ 0.0000e+00,  3.2000e+01,  2.2675e+07,  ...,  9.6798e-05,
-         -4.6586e+31,  7.7724e-05],
-        [ 0.0000e+00,  6.4000e+01,  2.2675e+07,  ...,  1.4591e-04,
-          2.8229e-03,  1.1778e-04],
-        ...,
-        [ 2.0000e+00,  3.2640e+04, -1.7753e-09,  ...,  5.0293e-02,
-          3.5095e-04,  4.0527e-02],
-        [-0.0000e+00,  3.2640e+04, -1.9922e+22,  ...,  5.0293e-02,
-          4.0398e+21,  4.0527e-02],
-        [-2.0000e+00,  3.2640e+04,  1.9904e-24,  ...,  5.0293e-02,
-         -3.9289e-31,  4.0527e-02]], requires_grad=True)
-m.weight2 Parameter containing:
-tensor([[ 0.0703,  0.0713,  0.0564,  ..., -0.0776,  0.0508,  0.0830],
-        [ 0.0806, -0.0282,  0.0476,  ...,  0.0547,  0.0718,  0.0845],
-        [ 0.0500,  0.0854, -0.0859,  ..., -0.0070,  0.0938, -0.0603],
-        ...,
-        [ 0.0986, -0.0757, -0.0796,  ...,  0.0564, -0.0991,  0.0234],
-        [ 0.0047,  0.0469,  0.0664,  ..., -0.0141, -0.0476, -0.0070],
-        [ 0.0039, -0.0148,  0.0008,  ...,  0.0070, -0.0688,  0.0148]],
-       requires_grad=True)
-m.weight1 Parameter containing:
-tensor([[-2.4414e-04,  9.7711e+10,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00],
-        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00],
-        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00],
-        ...,
-        [ 1.0646e+30, -1.2598e-01, -1.1946e+29,  ...,  9.9609e-01,
-         -2.8937e+28,  9.9609e-01],
-        [ 1.4505e-33, -6.6797e-01,  4.0035e-29,  ...,  9.9609e-01,
-         -2.8627e+28,  9.9609e-01],
-        [-3.3751e+31,  9.9609e-01,  7.0059e-36,  ...,  9.9609e-01,
-         -2.8318e+28,  9.9609e-01]], requires_grad=True)
-m.weight2 Parameter containing:
-tensor([[-0.0461, -0.0469,  0.0023,  ...,  0.0532, -0.0078,  0.0195],
-        [-0.0532,  0.0469, -0.0547,  ..., -0.0688, -0.0791, -0.0737],
-        [ 0.0078, -0.0070,  0.0070,  ..., -0.0493, -0.0078,  0.0454],
-        ...,
-        [ 0.0125, -0.0408, -0.0952,  ...,  0.0282, -0.0376, -0.0133],
-        [ 0.0820, -0.0148, -0.0008,  ..., -0.0649,  0.0894, -0.0102],
-        [-0.0109, -0.0776, -0.0713,  ..., -0.0469,  0.0273,  0.0688]],
-       requires_grad=True)
-m.weight1 Parameter containing:
-tensor([[ 3.3327e+17,  5.8984e-01, -6.3334e+08,  ..., -9.4922e-01,
-          7.6022e+06, -8.1250e-01],
-        [-6.1500e+01,  5.6250e-01,  8.7311e-10,  ..., -9.6484e-01,
-          8.0609e+06, -7.8906e-01],
-        [ 6.0244e-37,  5.3906e-01,  6.5282e+26,  ..., -9.7656e-01,
-         -1.0125e+36, -7.6172e-01],
-        ...,
-        [ 2.0189e+26, -9.6094e-01, -2.4298e-24,  ...,  8.8672e-01,
-         -1.1799e+27,  9.8828e-01],
-        [ 1.2628e+37, -9.6875e-01, -7.0134e-19,  ...,  9.0625e-01,
-          3.4346e-38,  9.9609e-01],
-        [ 4.1104e+09, -9.7656e-01, -8.1852e-34,  ...,  9.2578e-01,
-         -3.6234e-09,  9.9609e-01]], requires_grad=True)
-m.weight2 Parameter containing:
-tensor([[ 0.0282, -0.0938,  0.0859,  ..., -0.0854,  0.0728,  0.0718],
-        [-0.0972, -0.0781, -0.0430,  ...,  0.0415, -0.0947, -0.0737],
-        [-0.0312,  0.0767,  0.0532,  ...,  0.0156, -0.0055,  0.0047],
-        ...,
-        [ 0.0586, -0.0898, -0.0962,  ..., -0.0610, -0.0718, -0.0796],
-        [-0.0610, -0.0039,  0.0947,  ..., -0.0540, -0.0454,  0.0547],
-        [-0.0086,  0.0211,  0.0461,  ...,  0.0781,  0.0086,  0.0055]],
-       requires_grad=True)
-m.weight1 Parameter containing:
-tensor([[-4.8318e+09,  9.7711e+10,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00],
-        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  9.7711e+10,
-          0.0000e+00,  0.0000e+00],
-        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00],
-        ...,
-        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00],
-        [-4.3333e-34,  9.7711e+10,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00],
-        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
-          0.0000e+00,  0.0000e+00]], requires_grad=True)
-m.weight2 Parameter containing:
-tensor([[-0.0078,  0.0039, -0.0266,  ...,  0.0461, -0.0156, -0.0908],
-        [ 0.0094,  0.0320,  0.0500,  ...,  0.0337, -0.0109,  0.0898],
-        [-0.0757, -0.0376,  0.0195,  ...,  0.0698,  0.0962,  0.0796],
-        ...,
-        [-0.0579,  0.0698,  0.0742,  ..., -0.0070,  0.0820, -0.0282],
-        [-0.0913,  0.0376,  0.0564,  ..., -0.0454,  0.0806,  0.0117],
-        [ 0.0297, -0.0258,  0.0640,  ...,  0.0398,  0.0133,  0.0564]],
-       requires_grad=True)
-m.weight1 Parameter containing:
-tensor([[2.5244e-29, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
-         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
-         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]], requires_grad=True)
-m.weight2 Parameter containing:
-tensor([[ 0.0649, -0.0718,  0.0830,  0.0698, -0.0195, -0.0625, -0.0874,  0.0125,
-          0.0469, -0.0070,  0.0767,  0.0289, -0.0179, -0.0933,  0.0016,  0.0869]],
-       requires_grad=True)
-Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]Loading checkpoint shards:  25%|██▌       | 1/4 [00:01<00:03,  1.09s/it]Loading checkpoint shards:  50%|█████     | 2/4 [00:01<00:01,  1.17it/s]Loading checkpoint shards:  75%|███████▌  | 3/4 [00:02<00:00,  1.31it/s]Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]
-[INFO|modeling_utils.py:4574] 2025-01-25 08:45:13,379 >> All model checkpoint weights were used when initializing InternVLChatModel.
-
-[WARNING|modeling_utils.py:4576] 2025-01-25 08:45:13,379 >> Some weights of InternVLChatModel were not initialized from the model checkpoint at /DATA/env/wjr/newtrain/stage1/mos3 and are newly initialized: ['mlpscore.fc1.bias', 'mlpscore.fc1.weight', 'mlpscore.fc2.bias', 'mlpscore.fc2.weight', 'mlpscore.fc3.bias', 'mlpscore.fc3.weight', 'mlpscore.fc4.bias', 'mlpscore.fc4.weight', 'mlpscore.fc5.bias', 'mlpscore.fc5.weight', 'mlpscore.ln1.bias', 'mlpscore.ln1.weight']
-You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
-[INFO|configuration_utils.py:1052] 2025-01-25 08:45:13,416 >> loading configuration file /DATA/env/wjr/newtrain/stage1/mos3/generation_config.json
-[INFO|configuration_utils.py:1099] 2025-01-25 08:45:13,417 >> Generate config GenerationConfig {
-  "eos_token_id": [
-    92542,
-    92543
-  ]
-}
-
-01/25/2025 08:45:13 - INFO - __main__ - Finished
-01/25/2025 08:45:13 - INFO - __main__ - model.config.force_image_size: 448
-01/25/2025 08:45:13 - INFO - __main__ - data_args.force_image_size: 448
-01/25/2025 08:45:13 - INFO - __main__ - model.config.vision_config.image_size: 448
-01/25/2025 08:45:13 - INFO - __main__ - [Dataset] num_image_token: 256
-01/25/2025 08:45:13 - INFO - __main__ - [Dataset] dynamic_image_size: True
-01/25/2025 08:45:13 - INFO - __main__ - [Dataset] use_thumbnail: True
-01/25/2025 08:45:13 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6
-01/25/2025 08:45:13 - INFO - __main__ - Formatting inputs...Skip in lazy mode
-in
-in2
-01/25/2025 08:45:14 - INFO - __main__ - Add dataset: sharegpt4v_instruct_gpt4-vision_cap100k with length: 461
-01/25/2025 08:45:14 - INFO - __main__ - [Dataset] num_image_token: 256
-01/25/2025 08:45:14 - INFO - __main__ - [Dataset] dynamic_image_size: True
-01/25/2025 08:45:14 - INFO - __main__ - [Dataset] use_thumbnail: True
-01/25/2025 08:45:14 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6
-01/25/2025 08:45:14 - INFO - __main__ - Formatting inputs...Skip in lazy mode
-01/25/2025 08:45:15 - INFO - __main__ - Add dataset: sharegpt4v_instruct_gpt4-vision_cap100k with length: 115
-eval_dataset <torch.utils.data.dataset.ConcatDataset object at 0x7f7db808b3d0>
-trainable params: 3,145,728 || all params: 307,158,016 || trainable%: 1.0241
-trainable params: 18,874,368 || all params: 7,756,656,640 || trainable%: 0.2433
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.0.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.1.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.2.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.3.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.4.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.5.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.6.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.7.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.8.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.9.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.10.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.11.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.12.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.13.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.14.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.15.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.16.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.17.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.18.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.19.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.20.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.21.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.22.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.attn.qkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.attn.qkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.attn.proj.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.attn.proj.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.mlp.fc1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.mlp.fc1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.mlp.fc2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - vision_model.base_model.model.encoder.layers.23.mlp.fc2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.0.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.1.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.2.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.3.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.4.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.5.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.6.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.7.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.8.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.9.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.10.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.11.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.12.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.13.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.14.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.15.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.16.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.17.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.18.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.19.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.20.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.21.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.22.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.23.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.24.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.25.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.26.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.27.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.28.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.29.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.30.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.attention.wqkv.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.attention.wqkv.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.attention.wo.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.attention.wo.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.feed_forward.w1.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.feed_forward.w1.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.feed_forward.w3.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.feed_forward.w3.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.feed_forward.w2.lora_A.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - language_model.base_model.model.model.layers.31.feed_forward.w2.lora_B.default.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlp1.0.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlp1.0.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlp1.1.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlp1.1.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlp1.3.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlp1.3.bias
-01/25/2025 08:45:40 - INFO - __main__ - motion_mlp.0.weight
-01/25/2025 08:45:40 - INFO - __main__ - motion_mlp.0.bias
-01/25/2025 08:45:40 - INFO - __main__ - motion_mlp.1.weight
-01/25/2025 08:45:40 - INFO - __main__ - motion_mlp.1.bias
-01/25/2025 08:45:40 - INFO - __main__ - motion_mlp.3.weight
-01/25/2025 08:45:40 - INFO - __main__ - motion_mlp.3.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc1.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc1.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc2.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc2.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc3.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc3.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc4.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc4.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc5.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.fc5.bias
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.ln1.weight
-01/25/2025 08:45:40 - INFO - __main__ - mlpscore.ln1.bias
-training_args TrainingArguments(
-_n_gpu=1,
-accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
-adafactor=False,
-adam_beta1=0.9,
-adam_beta2=0.999,
-adam_epsilon=1e-08,
-auto_find_batch_size=False,
-batch_eval_metrics=False,
-bf16=True,
-bf16_full_eval=False,
-data_seed=None,
-dataloader_drop_last=False,
-dataloader_num_workers=4,
-dataloader_persistent_workers=False,
-dataloader_pin_memory=True,
-dataloader_prefetch_factor=None,
-ddp_backend=None,
-ddp_broadcast_buffers=None,
-ddp_bucket_cap_mb=None,
-ddp_find_unused_parameters=None,
-ddp_timeout=1800,
-debug=[],
-deepspeed=zero_stage1_config.json,
-disable_tqdm=False,
-dispatch_batches=None,
-do_eval=True,
-do_predict=False,
-do_train=True,
-eval_accumulation_steps=None,
-eval_delay=0,
-eval_do_concat_batches=True,
-eval_on_start=False,
-eval_steps=230,
-eval_strategy=steps,
-eval_use_gather_object=False,
-evaluation_strategy=steps,
-fp16=False,
-fp16_backend=auto,
-fp16_full_eval=False,
-fp16_opt_level=O1,
-fsdp=[],
-fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
-fsdp_min_num_params=0,
-fsdp_transformer_layer_cls_to_wrap=None,
-full_determinism=False,
-gradient_accumulation_steps=1,
-gradient_checkpointing=False,
-gradient_checkpointing_kwargs=None,
-greater_is_better=None,
-group_by_length=True,
-half_precision_backend=auto,
-hub_always_push=False,
-hub_model_id=None,
-hub_private_repo=False,
-hub_strategy=every_save,
-hub_token=<HUB_TOKEN>,
-ignore_data_skip=False,
-include_inputs_for_metrics=False,
-include_num_input_tokens_seen=False,
-include_tokens_per_second=False,
-jit_mode_eval=False,
-label_names=None,
-label_smoothing_factor=0.0,
-learning_rate=4e-05,
-length_column_name=length,
-load_best_model_at_end=False,
-local_rank=0,
-log_level=passive,
-log_level_replica=warning,
-log_on_each_node=True,
-logging_dir=/DATA/env/wjr/newtrain/stage2/mos3/runs/Jan25_08-44-47_ps,
-logging_first_step=False,
-logging_nan_inf_filter=True,
-logging_steps=1.0,
-logging_strategy=steps,
-lr_scheduler_kwargs={},
-lr_scheduler_type=cosine,
-max_grad_norm=1.0,
-max_steps=-1,
-metric_for_best_model=None,
-mp_parameters=,
-neftune_noise_alpha=None,
-no_cuda=False,
-num_train_epochs=50.0,
-optim=adamw_torch,
-optim_args=None,
-optim_target_modules=None,
-output_dir=/DATA/env/wjr/newtrain/stage2/mos3,
-overwrite_output_dir=True,
-past_index=-1,
-per_device_eval_batch_size=1,
-per_device_train_batch_size=4,
-prediction_loss_only=False,
-push_to_hub=False,
-push_to_hub_model_id=None,
-push_to_hub_organization=None,
-push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
-ray_scope=last,
-remove_unused_columns=True,
-report_to=['tensorboard'],
-restore_callback_states_from_checkpoint=False,
-resume_from_checkpoint=None,
-run_name=/DATA/env/wjr/newtrain/stage2/mos3,
-save_on_each_node=False,
-save_only_model=False,
-save_safetensors=True,
-save_steps=40000,
-save_strategy=steps,
-save_total_limit=1,
-seed=42,
-skip_memory_metrics=True,
-split_batches=None,
-tf32=None,
-torch_compile=False,
-torch_compile_backend=None,
-torch_compile_mode=None,
-torch_empty_cache_steps=None,
-torchdynamo=None,
-tpu_metrics_debug=False,
-tpu_num_cores=None,
-use_cpu=False,
-use_ipex=False,
-use_legacy_prediction_loop=False,
-use_liger_kernel=False,
-use_mps_device=False,
-warmup_ratio=0.03,
-warmup_steps=0,
-weight_decay=0.01,
-)
-[INFO|trainer.py:667] 2025-01-25 08:45:40,413 >> Using auto half precision backend
-[2025-01-25 08:45:41,569] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.4, git-hash=unknown, git-branch=unknown
-[2025-01-25 08:46:15,846] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
-Using /home/wangjiarui/.cache/torch_extensions/py39_cu121 as PyTorch extensions root...
-Detected CUDA files, patching ldflags
-Emitting ninja build file /home/wangjiarui/.cache/torch_extensions/py39_cu121/fused_adam/build.ninja...
-Building extension module fused_adam...
-Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
-ninja: no work to do.
-Loading extension module fused_adam...
-Time to load fused_adam op: 1.2471981048583984 seconds
-[2025-01-25 08:46:17,099] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
-[2025-01-25 08:46:17,099] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
-[2025-01-25 08:46:17,235] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
-[2025-01-25 08:46:17,235] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'>
-[2025-01-25 08:46:17,236] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer
-[2025-01-25 08:46:17,236] [INFO] [stage_1_and_2.py:148:__init__] Reduce bucket size 1000000000
-[2025-01-25 08:46:17,236] [INFO] [stage_1_and_2.py:149:__init__] Allgather bucket size 1000000000
-[2025-01-25 08:46:17,236] [INFO] [stage_1_and_2.py:150:__init__] CPU Offload: False
-[2025-01-25 08:46:17,236] [INFO] [stage_1_and_2.py:151:__init__] Round robin gradient partitioning: False
-[2025-01-25 08:46:27,877] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states
-[2025-01-25 08:46:27,884] [INFO] [utils.py:782:see_memory_usage] MA 16.03 GB         Max_MA 16.19 GB         CA 16.37 GB         Max_CA 16 GB 
-[2025-01-25 08:46:27,884] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 40.47 GB, percent = 8.0%
-[2025-01-25 08:46:28,939] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states
-[2025-01-25 08:46:28,942] [INFO] [utils.py:782:see_memory_usage] MA 16.03 GB         Max_MA 16.35 GB         CA 16.69 GB         Max_CA 17 GB 
-[2025-01-25 08:46:28,943] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 40.63 GB, percent = 8.1%
-[2025-01-25 08:46:28,944] [INFO] [stage_1_and_2.py:543:__init__] optimizer state initialized
-[2025-01-25 08:46:29,964] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer
-[2025-01-25 08:46:29,966] [INFO] [utils.py:782:see_memory_usage] MA 16.03 GB         Max_MA 16.03 GB         CA 16.69 GB         Max_CA 17 GB 
-[2025-01-25 08:46:29,966] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory:  used = 40.63 GB, percent = 8.1%
-[2025-01-25 08:46:29,990] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer
-[2025-01-25 08:46:29,991] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler
-[2025-01-25 08:46:29,992] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7f7f6a1c8400>
-[2025-01-25 08:46:29,992] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]]
-[2025-01-25 08:46:30,017] [INFO] [config.py:997:print] DeepSpeedEngine configuration:
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   activation_checkpointing_config  {
-    "partition_activations": false, 
-    "contiguous_memory_optimization": false, 
-    "cpu_checkpointing": false, 
-    "number_checkpoints": null, 
-    "synchronize_checkpoint_boundary": false, 
-    "profile": false
-}
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   amp_enabled .................. False
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   amp_params ................... False
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   autotuning_config ............ {
-    "enabled": false, 
-    "start_step": null, 
-    "end_step": null, 
-    "metric_path": null, 
-    "arg_mappings": null, 
-    "metric": "throughput", 
-    "model_info": null, 
-    "results_dir": "autotuning_results", 
-    "exps_dir": "autotuning_exps", 
-    "overwrite": true, 
-    "fast": true, 
-    "start_profile_step": 3, 
-    "end_profile_step": 5, 
-    "tuner_type": "gridsearch", 
-    "tuner_early_stopping": 5, 
-    "tuner_num_trials": 50, 
-    "model_info_path": null, 
-    "mp_size": 1, 
-    "max_train_batch_size": null, 
-    "min_train_batch_size": 1, 
-    "max_train_micro_batch_size_per_gpu": 1.024000e+03, 
-    "min_train_micro_batch_size_per_gpu": 1, 
-    "num_tuning_micro_batch_sizes": 3
-}
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   bfloat16_enabled ............. True
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   bfloat16_immediate_grad_update  False
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   checkpoint_parallel_write_pipeline  False
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   checkpoint_tag_validation_enabled  True
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   checkpoint_tag_validation_fail  False
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f7f6a503340>
-[2025-01-25 08:46:30,019] [INFO] [config.py:1001:print]   communication_data_type ...... None
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   curriculum_enabled_legacy .... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   curriculum_params_legacy ..... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   data_efficiency_enabled ...... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   dataloader_drop_last ......... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   disable_allgather ............ False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   dump_state ................... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   dynamic_loss_scale_args ...... None
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_enabled ........... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_gas_boundary_resolution  1
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_layer_name ........ bert.encoder.layer
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_layer_num ......... 0
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_max_iter .......... 100
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_stability ......... 1e-06
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_tol ............... 0.01
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   eigenvalue_verbose ........... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   elasticity_enabled ........... False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   flops_profiler_config ........ {
-    "enabled": false, 
-    "recompute_fwd_factor": 0.0, 
-    "profile_step": 1, 
-    "module_depth": -1, 
-    "top_modules": 1, 
-    "detailed": true, 
-    "output_file": null
-}
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   fp16_auto_cast ............... None
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   fp16_enabled ................. False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   fp16_master_weights_and_gradients  False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   global_rank .................. 0
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   grad_accum_dtype ............. None
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   gradient_accumulation_steps .. 1
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   gradient_clipping ............ 1.0
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   gradient_predivide_factor .... 1.0
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   graph_harvesting ............. False
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   initial_dynamic_scale ........ 1
-[2025-01-25 08:46:30,020] [INFO] [config.py:1001:print]   load_universal_checkpoint .... False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   loss_scale ................... 1.0
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   memory_breakdown ............. False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   mics_hierarchial_params_gather  False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   mics_shard_size .............. -1
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   nebula_config ................ {
-    "enabled": false, 
-    "persistent_storage_path": null, 
-    "persistent_time_interval": 100, 
-    "num_of_version_in_retention": 2, 
-    "enable_nebula_load": true, 
-    "load_path": null
-}
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   optimizer_legacy_fusion ...... False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   optimizer_name ............... adamw
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   optimizer_params ............. {'lr': 4e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.01}
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   pld_enabled .................. False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   pld_params ................... False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   prescale_gradients ........... False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   scheduler_name ............... None
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   scheduler_params ............. None
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   seq_parallel_communication_data_type  torch.float32
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   sparse_attention ............. None
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   sparse_gradients_enabled ..... False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   steps_per_print .............. inf
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   timers_config ................ enabled=True synchronized=True
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   train_batch_size ............. 4
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   train_micro_batch_size_per_gpu  4
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   use_data_before_expert_parallel_  False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   use_node_local_storage ....... False
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   wall_clock_breakdown ......... True
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   weight_quantization_config ... None
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   world_size ................... 1
-[2025-01-25 08:46:30,021] [INFO] [config.py:1001:print]   zero_allow_untested_optimizer  False
-[2025-01-25 08:46:30,022] [INFO] [config.py:1001:print]   zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=1000000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
-[2025-01-25 08:46:30,022] [INFO] [config.py:1001:print]   zero_enabled ................. True
-[2025-01-25 08:46:30,022] [INFO] [config.py:1001:print]   zero_force_ds_cpu_optimizer .. True
-[2025-01-25 08:46:30,022] [INFO] [config.py:1001:print]   zero_optimization_stage ...... 1
-[2025-01-25 08:46:30,022] [INFO] [config.py:987:print_user_config]   json = {
-    "zero_optimization": {
-        "stage": 1, 
-        "allgather_partitions": true, 
-        "allgather_bucket_size": 1.000000e+09, 
-        "overlap_comm": true, 
-        "reduce_scatter": true, 
-        "reduce_bucket_size": 1.000000e+09, 
-        "contiguous_gradients": true
-    }, 
-    "fp16": {
-        "enabled": false, 
-        "auto_cast": true, 
-        "loss_scale": 0, 
-        "initial_scale_power": 32, 
-        "loss_scale_window": 1000, 
-        "hysteresis": 2, 
-        "min_loss_scale": 1
-    }, 
-    "bf16": {
-        "enabled": true
-    }, 
-    "optimizer": {
-        "type": "AdamW", 
-        "params": {
-            "lr": 4e-05, 
-            "betas": [0.9, 0.999], 
-            "eps": 1e-08, 
-            "weight_decay": 0.01
-        }
-    }, 
-    "gradient_accumulation_steps": 1, 
-    "gradient_clipping": 1.0, 
-    "steps_per_print": inf, 
-    "train_batch_size": 4, 
-    "train_micro_batch_size_per_gpu": 4, 
-    "wall_clock_breakdown": true
-}
-[INFO|trainer.py:2243] 2025-01-25 08:46:30,022 >> ***** Running training *****
-[INFO|trainer.py:2244] 2025-01-25 08:46:30,022 >>   Num examples = 461
-[INFO|trainer.py:2245] 2025-01-25 08:46:30,023 >>   Num Epochs = 50
-[INFO|trainer.py:2246] 2025-01-25 08:46:30,023 >>   Instantaneous batch size per device = 4
-[INFO|trainer.py:2249] 2025-01-25 08:46:30,023 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
-[INFO|trainer.py:2250] 2025-01-25 08:46:30,023 >>   Gradient Accumulation steps = 1
-[INFO|trainer.py:2251] 2025-01-25 08:46:30,023 >>   Total optimization steps = 5,800
-[INFO|trainer.py:2252] 2025-01-25 08:46:30,038 >>   Number of trainable parameters = 86,295,393
-  0%|          | 0/5800 [00:00<?, ?it/s][2025-01-25 08:46:34,455] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 08:46:45,314] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 08:46:56,363] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 08:47:06,820] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.2617],
-        [0.0354],
-        [0.8320],
-        [0.1924]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4492, 0.3750, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3516, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:47:25,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.00 | optimizer_step: 14.33
-[2025-01-25 08:47:25,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 4176.23 | bwd_microstep: 4674.77 | bwd_inner_microstep: 4669.06 | bwd_allreduce_microstep: 5.61 | step_microstep: 160.28
-[2025-01-25 08:47:25,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4176.19 | bwd: 4674.82 | bwd_inner: 4669.06 | bwd_allreduce: 5.66 | step: 160.30
-  0%|          | 1/5800 [00:55<89:34:36, 55.61s/it]                                                   {'loss': 0.3516, 'grad_norm': 13.790648460388184, 'learning_rate': 2.2988505747126437e-07, 'epoch': 0.01}
-  0%|          | 1/5800 [00:55<89:34:36, 55.61s/it]score1 tensor([[0.3711],
-        [0.4277],
-        [0.3730],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3809, 0.5820, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:47:32,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.52 | optimizer_step: 4.37
-[2025-01-25 08:47:32,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2125.26 | bwd_microstep: 4582.07 | bwd_inner_microstep: 4575.44 | bwd_allreduce_microstep: 6.47 | step_microstep: 58.48
-[2025-01-25 08:47:32,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2125.23 | bwd: 4582.11 | bwd_inner: 4575.44 | bwd_allreduce: 6.56 | step: 58.49
-  0%|          | 2/5800 [01:02<43:30:44, 27.02s/it]                                                   {'loss': 0.1309, 'grad_norm': 15.176630020141602, 'learning_rate': 4.5977011494252875e-07, 'epoch': 0.02}
-  0%|          | 2/5800 [01:02<43:30:44, 27.02s/it]score1 tensor([[1.0078],
-        [0.2012],
-        [0.2031],
-        [0.3262]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6016, 0.6094, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3672, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:47:39,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.21 | optimizer_step: 4.36
-[2025-01-25 08:47:39,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.14 | bwd_microstep: 4577.17 | bwd_inner_microstep: 4572.42 | bwd_allreduce_microstep: 4.67 | step_microstep: 48.64
-[2025-01-25 08:47:39,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.11 | bwd: 4577.19 | bwd_inner: 4572.42 | bwd_allreduce: 4.71 | step: 48.65
-  0%|          | 3/5800 [01:09<28:41:12, 17.81s/it]                                                   {'loss': 0.3672, 'grad_norm': 15.578592300415039, 'learning_rate': 6.896551724137931e-07, 'epoch': 0.03}
-  0%|          | 3/5800 [01:09<28:41:12, 17.81s/it]score1 tensor([[0.0454],
-        [0.2451],
-        [0.1377],
-        [0.3066]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5820, 0.4355, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3359, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:47:46,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.48 | optimizer_step: 4.36
-[2025-01-25 08:47:46,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2129.80 | bwd_microstep: 4579.27 | bwd_inner_microstep: 4572.52 | bwd_allreduce_microstep: 6.62 | step_microstep: 48.72
-[2025-01-25 08:47:46,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2129.77 | bwd: 4579.31 | bwd_inner: 4572.52 | bwd_allreduce: 6.69 | step: 48.73
-  0%|          | 4/5800 [01:16<21:43:09, 13.49s/it]                                                   {'loss': 0.3359, 'grad_norm': 25.273414611816406, 'learning_rate': 9.195402298850575e-07, 'epoch': 0.03}
-  0%|          | 4/5800 [01:16<21:43:09, 13.49s/it]score1 tensor([[0.2012],
-        [0.1650],
-        [0.1523],
-        [0.0913]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5508, 0.5391, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3789, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:47:53,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.92 | optimizer_step: 4.36
-[2025-01-25 08:47:53,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.51 | bwd_microstep: 4585.93 | bwd_inner_microstep: 4581.13 | bwd_allreduce_microstep: 4.73 | step_microstep: 45.67
-[2025-01-25 08:47:53,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.48 | bwd: 4585.95 | bwd_inner: 4581.13 | bwd_allreduce: 4.76 | step: 45.68
-  0%|          | 5/5800 [01:23<17:51:36, 11.10s/it]                                                   {'loss': 0.3789, 'grad_norm': 24.029287338256836, 'learning_rate': 1.1494252873563219e-06, 'epoch': 0.04}
-  0%|          | 5/5800 [01:23<17:51:36, 11.10s/it]score1 tensor([[0.4375],
-        [0.1992],
-        [0.0000],
-        [0.3301]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6484, 0.5508, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2773, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:00,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 08:48:00,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.79 | bwd_microstep: 4558.03 | bwd_inner_microstep: 4553.30 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.36
-[2025-01-25 08:48:00,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.76 | bwd: 4558.05 | bwd_inner: 4553.30 | bwd_allreduce: 4.69 | step: 42.37
-  0%|          | 6/5800 [01:30<15:31:05,  9.64s/it]                                                   {'loss': 0.2773, 'grad_norm': 20.622129440307617, 'learning_rate': 1.3793103448275862e-06, 'epoch': 0.05}
-  0%|          | 6/5800 [01:30<15:31:05,  9.64s/it]score1 tensor([[0.6875],
-        [0.0884],
-        [0.6992],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5742, 0.5117, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2305, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:06,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 08:48:06,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.01 | bwd_microstep: 4603.55 | bwd_inner_microstep: 4598.54 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.30
-[2025-01-25 08:48:06,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.98 | bwd: 4603.58 | bwd_inner: 4598.54 | bwd_allreduce: 4.97 | step: 42.30
-  0%|          | 7/5800 [01:36<14:03:33,  8.74s/it]                                                   {'loss': 0.2305, 'grad_norm': 12.238910675048828, 'learning_rate': 1.6091954022988506e-06, 'epoch': 0.06}
-  0%|          | 7/5800 [01:36<14:03:33,  8.74s/it]score1 tensor([[0.1865],
-        [0.1973],
-        [0.1021],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5156, 0.4844, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3086, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:13,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 08:48:13,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.80 | bwd_microstep: 4607.35 | bwd_inner_microstep: 4602.65 | bwd_allreduce_microstep: 4.63 | step_microstep: 41.21
-[2025-01-25 08:48:13,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.77 | bwd: 4607.38 | bwd_inner: 4602.64 | bwd_allreduce: 4.67 | step: 41.22
-  0%|          | 8/5800 [01:43<13:05:58,  8.14s/it]                                                   {'loss': 0.3086, 'grad_norm': 13.800691604614258, 'learning_rate': 1.839080459770115e-06, 'epoch': 0.07}
-  0%|          | 8/5800 [01:43<13:05:58,  8.14s/it]score1 tensor([[0.4023],
-        [0.2695],
-        [0.1689],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5625, 0.5430, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1953, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:20,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 08:48:20,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.28 | bwd_microstep: 4611.67 | bwd_inner_microstep: 4606.24 | bwd_allreduce_microstep: 5.34 | step_microstep: 43.49
-[2025-01-25 08:48:20,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.25 | bwd: 4611.69 | bwd_inner: 4606.24 | bwd_allreduce: 5.39 | step: 43.49
-  0%|          | 9/5800 [01:50<12:27:58,  7.75s/it]                                                   {'loss': 0.1953, 'grad_norm': 28.002317428588867, 'learning_rate': 2.0689655172413796e-06, 'epoch': 0.08}
-  0%|          | 9/5800 [01:50<12:27:58,  7.75s/it]score1 tensor([[0.4473],
-        [0.0022],
-        [0.4629],
-        [0.2773]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4863, 0.5117, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1953, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:27,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 08:48:27,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.02 | bwd_microstep: 4592.48 | bwd_inner_microstep: 4587.80 | bwd_allreduce_microstep: 4.61 | step_microstep: 40.94
-[2025-01-25 08:48:27,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.99 | bwd: 4592.50 | bwd_inner: 4587.80 | bwd_allreduce: 4.64 | step: 40.95
-  0%|          | 10/5800 [01:57<12:01:41,  7.48s/it]                                                    {'loss': 0.1953, 'grad_norm': 24.591567993164062, 'learning_rate': 2.2988505747126437e-06, 'epoch': 0.09}
-  0%|          | 10/5800 [01:57<12:01:41,  7.48s/it]score1 tensor([[0.3379],
-        [0.5117],
-        [0.0562],
-        [0.3105]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4746, 0.6211, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2383, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:34,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:48:34,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.04 | bwd_microstep: 4596.49 | bwd_inner_microstep: 4591.98 | bwd_allreduce_microstep: 4.44 | step_microstep: 40.59
-[2025-01-25 08:48:34,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.01 | bwd: 4596.52 | bwd_inner: 4591.98 | bwd_allreduce: 4.47 | step: 40.60
-  0%|          | 11/5800 [02:04<11:43:20,  7.29s/it]                                                    {'loss': 0.2383, 'grad_norm': 13.944327354431152, 'learning_rate': 2.5287356321839083e-06, 'epoch': 0.09}
-  0%|          | 11/5800 [02:04<11:43:20,  7.29s/it]score1 tensor([[1.0391],
-        [0.5234],
-        [0.3984],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.6094, 0.6445, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2812, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:41,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 08:48:41,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.08 | bwd_microstep: 4608.29 | bwd_inner_microstep: 4603.44 | bwd_allreduce_microstep: 4.77 | step_microstep: 40.48
-[2025-01-25 08:48:41,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.05 | bwd: 4608.32 | bwd_inner: 4603.44 | bwd_allreduce: 4.81 | step: 40.49
-  0%|          | 12/5800 [02:11<11:31:07,  7.16s/it]                                                    {'loss': 0.2812, 'grad_norm': 24.62859344482422, 'learning_rate': 2.7586206896551725e-06, 'epoch': 0.1}
-  0%|          | 12/5800 [02:11<11:31:07,  7.16s/it]score1 tensor([[0.5391],
-        [0.9453],
-        [0.4707],
-        [0.1973]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.6094, 0.4688, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1650, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:48,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 08:48:48,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.03 | bwd_microstep: 4606.47 | bwd_inner_microstep: 4601.89 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.47
-[2025-01-25 08:48:48,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.97 | bwd: 4606.49 | bwd_inner: 4601.89 | bwd_allreduce: 4.54 | step: 41.47
-  0%|          | 13/5800 [02:18<11:22:48,  7.08s/it]                                                    {'loss': 0.165, 'grad_norm': 14.205713272094727, 'learning_rate': 2.988505747126437e-06, 'epoch': 0.11}
-  0%|          | 13/5800 [02:18<11:22:48,  7.08s/it]score1 tensor([[0.2793],
-        [0.2080],
-        [0.4648],
-        [1.1094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4219, 0.4551, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2988, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:48:55,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:48:55,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.26 | bwd_microstep: 4604.06 | bwd_inner_microstep: 4599.27 | bwd_allreduce_microstep: 4.70 | step_microstep: 41.27
-[2025-01-25 08:48:55,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.23 | bwd: 4604.08 | bwd_inner: 4599.27 | bwd_allreduce: 4.75 | step: 41.28
-  0%|          | 14/5800 [02:25<11:16:59,  7.02s/it]                                                    {'loss': 0.2988, 'grad_norm': 13.651576042175293, 'learning_rate': 3.2183908045977012e-06, 'epoch': 0.12}
-  0%|          | 14/5800 [02:25<11:16:59,  7.02s/it]score1 tensor([[0.1865],
-        [0.1475],
-        [0.6719],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5117, 0.4629, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2754, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:01,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 08:49:01,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.05 | bwd_microstep: 4602.98 | bwd_inner_microstep: 4598.55 | bwd_allreduce_microstep: 4.36 | step_microstep: 41.47
-[2025-01-25 08:49:01,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.03 | bwd: 4603.01 | bwd_inner: 4598.55 | bwd_allreduce: 4.40 | step: 41.47
-  0%|          | 15/5800 [02:31<11:12:42,  6.98s/it]                                                    {'loss': 0.2754, 'grad_norm': 16.572179794311523, 'learning_rate': 3.448275862068966e-06, 'epoch': 0.13}
-  0%|          | 15/5800 [02:31<11:12:42,  6.98s/it]score1 tensor([[0.5977],
-        [0.1592],
-        [0.4160],
-        [0.1670]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4883, 0.4961, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2285, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:08,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 08:49:08,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.91 | bwd_microstep: 4617.40 | bwd_inner_microstep: 4612.82 | bwd_allreduce_microstep: 4.48 | step_microstep: 42.66
-[2025-01-25 08:49:08,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.88 | bwd: 4617.42 | bwd_inner: 4612.82 | bwd_allreduce: 4.53 | step: 42.66
-  0%|          | 16/5800 [02:38<11:10:06,  6.95s/it]                                                    {'loss': 0.2285, 'grad_norm': 15.842597007751465, 'learning_rate': 3.67816091954023e-06, 'epoch': 0.14}
-  0%|          | 16/5800 [02:38<11:10:06,  6.95s/it]score1 tensor([[0.6367],
-        [0.6367],
-        [0.1377],
-        [0.3125]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4980, 0.4121, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2246, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:15,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 08:49:15,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.57 | bwd_microstep: 4618.34 | bwd_inner_microstep: 4613.71 | bwd_allreduce_microstep: 4.55 | step_microstep: 39.94
-[2025-01-25 08:49:15,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.54 | bwd: 4618.36 | bwd_inner: 4613.71 | bwd_allreduce: 4.58 | step: 39.95
-  0%|          | 17/5800 [02:45<11:08:26,  6.94s/it]                                                    {'loss': 0.2246, 'grad_norm': 11.245152473449707, 'learning_rate': 3.908045977011495e-06, 'epoch': 0.15}
-  0%|          | 17/5800 [02:45<11:08:26,  6.94s/it]score1 tensor([[0.2266],
-        [0.7695],
-        [0.5859],
-        [0.2676]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6602, 0.4180, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2461, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:22,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 08:49:22,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.94 | bwd_microstep: 4613.00 | bwd_inner_microstep: 4608.28 | bwd_allreduce_microstep: 4.65 | step_microstep: 42.28
-[2025-01-25 08:49:22,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.91 | bwd: 4613.02 | bwd_inner: 4608.28 | bwd_allreduce: 4.68 | step: 42.28
-  0%|          | 18/5800 [02:52<11:07:06,  6.92s/it]                                                    {'loss': 0.2461, 'grad_norm': 14.206080436706543, 'learning_rate': 4.137931034482759e-06, 'epoch': 0.16}
-  0%|          | 18/5800 [02:52<11:07:06,  6.92s/it]score1 tensor([[0.3281],
-        [0.7305],
-        [0.2910],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4648, 0.4453, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1846, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:29,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 08:49:29,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4614.09 | bwd_inner_microstep: 4609.36 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.50
-[2025-01-25 08:49:29,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4614.11 | bwd_inner: 4609.36 | bwd_allreduce: 4.69 | step: 41.51
-  0%|          | 19/5800 [02:59<11:06:27,  6.92s/it]                                                    {'loss': 0.1846, 'grad_norm': 9.326728820800781, 'learning_rate': 4.367816091954023e-06, 'epoch': 0.16}
-  0%|          | 19/5800 [02:59<11:06:27,  6.92s/it]score1 tensor([[0.5547],
-        [0.3691],
-        [0.4707],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4336, 0.6016, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0972, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:36,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 08:49:36,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.19 | bwd_microstep: 4612.31 | bwd_inner_microstep: 4607.54 | bwd_allreduce_microstep: 4.70 | step_microstep: 39.90
-[2025-01-25 08:49:36,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.17 | bwd: 4612.33 | bwd_inner: 4607.54 | bwd_allreduce: 4.73 | step: 39.90
-  0%|          | 20/5800 [03:06<11:05:32,  6.91s/it]                                                    {'loss': 0.0972, 'grad_norm': 11.149824142456055, 'learning_rate': 4.5977011494252875e-06, 'epoch': 0.17}
-  0%|          | 20/5800 [03:06<11:05:32,  6.91s/it]score1 tensor([[0.3008],
-        [0.4922],
-        [0.5312],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3652, 0.3711, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1875, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:43,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 08:49:43,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.12 | bwd_microstep: 4634.61 | bwd_inner_microstep: 4629.92 | bwd_allreduce_microstep: 4.62 | step_microstep: 40.69
-[2025-01-25 08:49:43,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.10 | bwd: 4634.63 | bwd_inner: 4629.92 | bwd_allreduce: 4.65 | step: 40.69
-  0%|          | 21/5800 [03:13<11:05:32,  6.91s/it]                                                    {'loss': 0.1875, 'grad_norm': 8.706533432006836, 'learning_rate': 4.8275862068965525e-06, 'epoch': 0.18}
-  0%|          | 21/5800 [03:13<11:05:32,  6.91s/it]score1 tensor([[0.5977],
-        [0.4297],
-        [0.7930],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5312, 0.3945, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1904, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:50,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 08:49:50,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.11 | bwd_microstep: 4632.80 | bwd_inner_microstep: 4628.01 | bwd_allreduce_microstep: 4.70 | step_microstep: 41.06
-[2025-01-25 08:49:50,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.07 | bwd: 4632.82 | bwd_inner: 4628.01 | bwd_allreduce: 4.74 | step: 41.07
-  0%|          | 22/5800 [03:20<11:05:38,  6.91s/it]                                                    {'loss': 0.1904, 'grad_norm': 18.130876541137695, 'learning_rate': 5.057471264367817e-06, 'epoch': 0.19}
-  0%|          | 22/5800 [03:20<11:05:38,  6.91s/it]score1 tensor([[0.4902],
-        [0.6641],
-        [0.9414],
-        [0.7656]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4922, 0.3672, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2656, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:49:57,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 08:49:57,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.48 | bwd_microstep: 4639.20 | bwd_inner_microstep: 4634.50 | bwd_allreduce_microstep: 4.63 | step_microstep: 40.82
-[2025-01-25 08:49:57,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.45 | bwd: 4639.22 | bwd_inner: 4634.50 | bwd_allreduce: 4.66 | step: 40.83
-  0%|          | 23/5800 [03:27<11:05:47,  6.91s/it]                                                    {'loss': 0.2656, 'grad_norm': 26.7901554107666, 'learning_rate': 5.287356321839081e-06, 'epoch': 0.2}
-  0%|          | 23/5800 [03:27<11:05:47,  6.91s/it]score1 tensor([[0.8125],
-        [0.5820],
-        [0.3320],
-        [0.7891]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.5430, 0.5430, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2598, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:04,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 08:50:04,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.54 | bwd_microstep: 4639.51 | bwd_inner_microstep: 4634.94 | bwd_allreduce_microstep: 4.49 | step_microstep: 39.67
-[2025-01-25 08:50:04,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.52 | bwd: 4639.53 | bwd_inner: 4634.94 | bwd_allreduce: 4.53 | step: 39.68
-  0%|          | 24/5800 [03:34<11:05:48,  6.92s/it]                                                    {'loss': 0.2598, 'grad_norm': 16.128541946411133, 'learning_rate': 5.517241379310345e-06, 'epoch': 0.21}
-  0%|          | 24/5800 [03:34<11:05:48,  6.92s/it]score1 tensor([[0.7188],
-        [0.5000],
-        [0.6055],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.6094, 0.5625, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1416, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:10,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 08:50:10,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.01 | bwd_microstep: 4637.12 | bwd_inner_microstep: 4632.35 | bwd_allreduce_microstep: 4.69 | step_microstep: 40.39
-[2025-01-25 08:50:10,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.97 | bwd: 4637.14 | bwd_inner: 4632.35 | bwd_allreduce: 4.73 | step: 40.40
-  0%|          | 25/5800 [03:40<11:05:43,  6.92s/it]                                                    {'loss': 0.1416, 'grad_norm': 15.644857406616211, 'learning_rate': 5.747126436781609e-06, 'epoch': 0.22}
-  0%|          | 25/5800 [03:40<11:05:43,  6.92s/it]score1 tensor([[1.1484],
-        [0.5273],
-        [1.0625],
-        [0.7539]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5039, 0.6445, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3086, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:17,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:50:17,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4641.51 | bwd_inner_microstep: 4636.79 | bwd_allreduce_microstep: 4.64 | step_microstep: 40.29
-[2025-01-25 08:50:17,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4641.53 | bwd_inner: 4636.79 | bwd_allreduce: 4.68 | step: 40.29
-  0%|          | 26/5800 [03:47<11:05:49,  6.92s/it]                                                    {'loss': 0.3086, 'grad_norm': 31.461055755615234, 'learning_rate': 5.977011494252874e-06, 'epoch': 0.22}
-  0%|          | 26/5800 [03:47<11:05:49,  6.92s/it]score1 tensor([[0.4805],
-        [0.6875],
-        [1.0078],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4590, 0.4668, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2490, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 08:50:24,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.79 | bwd_microstep: 4637.46 | bwd_inner_microstep: 4632.58 | bwd_allreduce_microstep: 4.79 | step_microstep: 40.80
-[2025-01-25 08:50:24,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.76 | bwd: 4637.48 | bwd_inner: 4632.58 | bwd_allreduce: 4.84 | step: 40.82
-  0%|          | 27/5800 [03:54<11:06:00,  6.92s/it]                                                    {'loss': 0.249, 'grad_norm': 11.493097305297852, 'learning_rate': 6.206896551724138e-06, 'epoch': 0.23}
-  0%|          | 27/5800 [03:54<11:06:00,  6.92s/it]score1 tensor([[0.8164],
-        [0.6289],
-        [0.5352],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.6328, 0.3789, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1689, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:31,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 08:50:31,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.87 | bwd_microstep: 4645.70 | bwd_inner_microstep: 4640.98 | bwd_allreduce_microstep: 4.65 | step_microstep: 41.01
-[2025-01-25 08:50:31,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.84 | bwd: 4645.73 | bwd_inner: 4640.98 | bwd_allreduce: 4.69 | step: 41.00
-  0%|          | 28/5800 [04:01<11:06:08,  6.92s/it]                                                    {'loss': 0.1689, 'grad_norm': 12.872565269470215, 'learning_rate': 6.4367816091954025e-06, 'epoch': 0.24}
-  0%|          | 28/5800 [04:01<11:06:08,  6.92s/it]score1 tensor([[0.6953],
-        [0.4902],
-        [0.5938],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4785, 0.5625, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0791, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:38,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 08:50:38,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4638.36 | bwd_inner_microstep: 4633.73 | bwd_allreduce_microstep: 4.55 | step_microstep: 40.52
-[2025-01-25 08:50:38,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.70 | bwd: 4638.39 | bwd_inner: 4633.73 | bwd_allreduce: 4.59 | step: 40.53
-  0%|          | 29/5800 [04:08<11:06:00,  6.92s/it]                                                    {'loss': 0.0791, 'grad_norm': 17.032943725585938, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.25}
-  0%|          | 29/5800 [04:08<11:06:00,  6.92s/it]score1 tensor([[0.2930],
-        [0.4766],
-        [0.5195],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4922, 0.5156, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:45,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 08:50:45,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.64 | bwd_microstep: 4640.32 | bwd_inner_microstep: 4635.49 | bwd_allreduce_microstep: 4.76 | step_microstep: 39.83
-[2025-01-25 08:50:45,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.60 | bwd: 4640.34 | bwd_inner: 4635.49 | bwd_allreduce: 4.79 | step: 39.83
-  1%|          | 30/5800 [04:15<11:05:54,  6.92s/it]                                                    {'loss': 0.1016, 'grad_norm': 9.359320640563965, 'learning_rate': 6.896551724137932e-06, 'epoch': 0.26}
-  1%|          | 30/5800 [04:15<11:05:54,  6.92s/it]score1 tensor([[0.6094],
-        [0.5078],
-        [0.5859],
-        [0.7188]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5508, 0.4473, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1650, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:52,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 08:50:52,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.14 | bwd_microstep: 4641.58 | bwd_inner_microstep: 4636.97 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.39
-[2025-01-25 08:50:52,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4641.60 | bwd_inner: 4636.97 | bwd_allreduce: 4.56 | step: 41.40
-  1%|          | 31/5800 [04:22<11:05:47,  6.92s/it]                                                    {'loss': 0.165, 'grad_norm': 15.908100128173828, 'learning_rate': 7.126436781609196e-06, 'epoch': 0.27}
-  1%|          | 31/5800 [04:22<11:05:47,  6.92s/it]score1 tensor([[0.7852],
-        [0.8477],
-        [0.5000],
-        [0.8281]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6406, 0.5508, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1777, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:50:59,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 08:50:59,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.10 | bwd_microstep: 4639.05 | bwd_inner_microstep: 4634.35 | bwd_allreduce_microstep: 4.62 | step_microstep: 40.17
-[2025-01-25 08:50:59,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.08 | bwd: 4639.07 | bwd_inner: 4634.35 | bwd_allreduce: 4.66 | step: 40.17
-  1%|          | 32/5800 [04:29<11:05:32,  6.92s/it]                                                    {'loss': 0.1777, 'grad_norm': 20.995744705200195, 'learning_rate': 7.35632183908046e-06, 'epoch': 0.28}
-  1%|          | 32/5800 [04:29<11:05:32,  6.92s/it]score1 tensor([[0.4316],
-        [0.6680],
-        [0.3066],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4980, 0.4609, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1455, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:06,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 08:51:06,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4646.59 | bwd_inner_microstep: 4641.83 | bwd_allreduce_microstep: 4.68 | step_microstep: 39.72
-[2025-01-25 08:51:06,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.19 | bwd: 4646.61 | bwd_inner: 4641.83 | bwd_allreduce: 4.72 | step: 39.73
-  1%|          | 33/5800 [04:36<11:05:30,  6.92s/it]                                                    {'loss': 0.1455, 'grad_norm': 7.782193183898926, 'learning_rate': 7.586206896551724e-06, 'epoch': 0.28}
-  1%|          | 33/5800 [04:36<11:05:30,  6.92s/it]score1 tensor([[0.4336],
-        [0.6172],
-        [0.3984],
-        [0.7734]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5508, 0.4258, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0786, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:13,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.27 | optimizer_step: 4.37
-[2025-01-25 08:51:13,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.78 | bwd_microstep: 4643.37 | bwd_inner_microstep: 4638.85 | bwd_allreduce_microstep: 4.42 | step_microstep: 45.40
-[2025-01-25 08:51:13,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.75 | bwd: 4643.40 | bwd_inner: 4638.86 | bwd_allreduce: 4.47 | step: 45.41
-  1%|          | 34/5800 [04:43<11:05:41,  6.93s/it]                                                    {'loss': 0.0786, 'grad_norm': 12.477624893188477, 'learning_rate': 7.81609195402299e-06, 'epoch': 0.29}
-  1%|          | 34/5800 [04:43<11:05:41,  6.93s/it]score1 tensor([[0.4688],
-        [0.4609],
-        [0.4590],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4180, 0.5430, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:20,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 08:51:20,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.90 | bwd_microstep: 4651.17 | bwd_inner_microstep: 4646.52 | bwd_allreduce_microstep: 4.58 | step_microstep: 40.85
-[2025-01-25 08:51:20,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.87 | bwd: 4651.19 | bwd_inner: 4646.52 | bwd_allreduce: 4.61 | step: 40.86
-  1%|          | 35/5800 [04:50<11:06:16,  6.93s/it]                                                    {'loss': 0.0596, 'grad_norm': 8.094722747802734, 'learning_rate': 8.045977011494253e-06, 'epoch': 0.3}
-  1%|          | 35/5800 [04:50<11:06:16,  6.93s/it]score1 tensor([[0.5547],
-        [0.5039],
-        [0.4160],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5664, 0.5117, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0811, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:27,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:51:27,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.74 | bwd_microstep: 4649.93 | bwd_inner_microstep: 4644.99 | bwd_allreduce_microstep: 4.86 | step_microstep: 40.54
-[2025-01-25 08:51:27,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.72 | bwd: 4649.95 | bwd_inner: 4645.00 | bwd_allreduce: 4.89 | step: 40.54
-  1%|          | 36/5800 [04:57<11:06:08,  6.93s/it]                                                    {'loss': 0.0811, 'grad_norm': 14.570757865905762, 'learning_rate': 8.275862068965518e-06, 'epoch': 0.31}
-  1%|          | 36/5800 [04:57<11:06:08,  6.93s/it]score1 tensor([[0.4629],
-        [0.4219],
-        [0.3691],
-        [0.2832]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4492, 0.4512, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:34,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 08:51:34,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.00 | bwd_microstep: 4639.38 | bwd_inner_microstep: 4634.62 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.23
-[2025-01-25 08:51:34,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4639.40 | bwd_inner: 4634.62 | bwd_allreduce: 4.72 | step: 41.24
-  1%|          | 37/5800 [05:04<11:05:49,  6.93s/it]                                                    {'loss': 0.082, 'grad_norm': 13.694799423217773, 'learning_rate': 8.505747126436782e-06, 'epoch': 0.32}
-  1%|          | 37/5800 [05:04<11:05:49,  6.93s/it]score1 tensor([[0.5742],
-        [0.8438],
-        [0.2910],
-        [1.1016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5391, 0.5977, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2754, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:41,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 08:51:41,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4589.69 | bwd_inner_microstep: 4584.98 | bwd_allreduce_microstep: 4.63 | step_microstep: 40.49
-[2025-01-25 08:51:41,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4589.71 | bwd_inner: 4584.98 | bwd_allreduce: 4.67 | step: 40.50
-  1%|          | 38/5800 [05:10<11:04:05,  6.92s/it]                                                    {'loss': 0.2754, 'grad_norm': 17.01567840576172, 'learning_rate': 8.735632183908047e-06, 'epoch': 0.33}
-  1%|          | 38/5800 [05:10<11:04:05,  6.92s/it]score1 tensor([[0.4102],
-        [0.6680],
-        [0.2295],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4941, 0.4180, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1377, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:47,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 08:51:47,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.92 | bwd_microstep: 4640.95 | bwd_inner_microstep: 4636.34 | bwd_allreduce_microstep: 4.53 | step_microstep: 45.52
-[2025-01-25 08:51:47,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.89 | bwd: 4640.97 | bwd_inner: 4636.34 | bwd_allreduce: 4.57 | step: 45.51
-  1%|          | 39/5800 [05:17<11:04:19,  6.92s/it]                                                    {'loss': 0.1377, 'grad_norm': 9.991281509399414, 'learning_rate': 8.965517241379312e-06, 'epoch': 0.34}
-  1%|          | 39/5800 [05:17<11:04:19,  6.92s/it]score1 tensor([[0.8711],
-        [0.4629],
-        [0.2441],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4023, 0.4980, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:51:54,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 08:51:54,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.69 | bwd_microstep: 4639.24 | bwd_inner_microstep: 4634.69 | bwd_allreduce_microstep: 4.48 | step_microstep: 40.22
-[2025-01-25 08:51:54,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.67 | bwd: 4639.26 | bwd_inner: 4634.69 | bwd_allreduce: 4.52 | step: 40.22
-  1%|          | 40/5800 [05:24<11:04:14,  6.92s/it]                                                    {'loss': 0.1426, 'grad_norm': 17.41720199584961, 'learning_rate': 9.195402298850575e-06, 'epoch': 0.34}
-  1%|          | 40/5800 [05:24<11:04:14,  6.92s/it]score1 tensor([[0.5977],
-        [0.4844],
-        [0.4570],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5547, 0.5273, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:01,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 08:52:01,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.17 | bwd_microstep: 4641.97 | bwd_inner_microstep: 4637.33 | bwd_allreduce_microstep: 4.56 | step_microstep: 40.40
-[2025-01-25 08:52:01,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.15 | bwd: 4641.99 | bwd_inner: 4637.33 | bwd_allreduce: 4.60 | step: 40.40
-  1%|          | 41/5800 [05:31<11:04:20,  6.92s/it]                                                    {'loss': 0.0693, 'grad_norm': 13.245406150817871, 'learning_rate': 9.42528735632184e-06, 'epoch': 0.35}
-  1%|          | 41/5800 [05:31<11:04:20,  6.92s/it]score1 tensor([[0.5898],
-        [0.4199],
-        [0.4062],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.3984, 0.4824, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:08,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 08:52:08,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.02 | bwd_microstep: 4649.46 | bwd_inner_microstep: 4644.46 | bwd_allreduce_microstep: 4.93 | step_microstep: 40.47
-[2025-01-25 08:52:08,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.99 | bwd: 4649.48 | bwd_inner: 4644.46 | bwd_allreduce: 4.97 | step: 40.48
-  1%|          | 42/5800 [05:38<11:04:37,  6.93s/it]                                                    {'loss': 0.0703, 'grad_norm': 16.2587890625, 'learning_rate': 9.655172413793105e-06, 'epoch': 0.36}
-  1%|          | 42/5800 [05:38<11:04:37,  6.93s/it]score1 tensor([[0.5742],
-        [0.5938],
-        [0.4219],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.6797, 0.5352, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0894, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:15,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 08:52:15,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.42 | bwd_microstep: 4644.29 | bwd_inner_microstep: 4639.35 | bwd_allreduce_microstep: 4.85 | step_microstep: 41.70
-[2025-01-25 08:52:15,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.39 | bwd: 4644.31 | bwd_inner: 4639.35 | bwd_allreduce: 4.89 | step: 41.70
-  1%|          | 43/5800 [05:45<11:04:47,  6.93s/it]                                                    {'loss': 0.0894, 'grad_norm': 16.027124404907227, 'learning_rate': 9.885057471264368e-06, 'epoch': 0.37}
-  1%|          | 43/5800 [05:45<11:04:47,  6.93s/it]score1 tensor([[0.3613],
-        [0.4844],
-        [0.4668],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5039, 0.4980, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:22,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 08:52:22,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.95 | bwd_microstep: 4641.60 | bwd_inner_microstep: 4636.98 | bwd_allreduce_microstep: 4.55 | step_microstep: 40.55
-[2025-01-25 08:52:22,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.89 | bwd: 4641.62 | bwd_inner: 4636.98 | bwd_allreduce: 4.58 | step: 40.56
-  1%|          | 44/5800 [05:52<11:04:35,  6.93s/it]                                                    {'loss': 0.0459, 'grad_norm': 13.273720741271973, 'learning_rate': 1.0114942528735633e-05, 'epoch': 0.38}
-  1%|          | 44/5800 [05:52<11:04:35,  6.93s/it]score1 tensor([[0.6367],
-        [0.4512],
-        [0.3848],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5781, 0.6016, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:29,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 08:52:29,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4646.42 | bwd_inner_microstep: 4641.70 | bwd_allreduce_microstep: 4.64 | step_microstep: 45.14
-[2025-01-25 08:52:29,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.63 | bwd: 4646.44 | bwd_inner: 4641.70 | bwd_allreduce: 4.68 | step: 45.15
-  1%|          | 45/5800 [05:59<11:04:35,  6.93s/it]                                                    {'loss': 0.1484, 'grad_norm': 13.89497184753418, 'learning_rate': 1.0344827586206898e-05, 'epoch': 0.39}
-  1%|          | 45/5800 [05:59<11:04:35,  6.93s/it]score1 tensor([[0.4590],
-        [0.4805],
-        [0.4160],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4766, 0.4844, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:36,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 08:52:36,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.82 | bwd_microstep: 4644.24 | bwd_inner_microstep: 4639.60 | bwd_allreduce_microstep: 4.56 | step_microstep: 40.85
-[2025-01-25 08:52:36,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.79 | bwd: 4644.26 | bwd_inner: 4639.60 | bwd_allreduce: 4.60 | step: 40.86
-  1%|          | 46/5800 [06:06<11:04:27,  6.93s/it]                                                    {'loss': 0.0537, 'grad_norm': 7.5656890869140625, 'learning_rate': 1.0574712643678162e-05, 'epoch': 0.4}
-  1%|          | 46/5800 [06:06<11:04:27,  6.93s/it]score1 tensor([[0.3789],
-        [0.5820],
-        [0.5156],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5547, 0.4023, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:43,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.36
-[2025-01-25 08:52:43,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.15 | bwd_microstep: 4639.72 | bwd_inner_microstep: 4635.11 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.29
-[2025-01-25 08:52:43,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.12 | bwd: 4639.75 | bwd_inner: 4635.11 | bwd_allreduce: 4.57 | step: 41.29
-  1%|          | 47/5800 [06:13<11:04:11,  6.93s/it]                                                    {'loss': 0.1113, 'grad_norm': 14.343555450439453, 'learning_rate': 1.0804597701149427e-05, 'epoch': 0.41}
-  1%|          | 47/5800 [06:13<11:04:11,  6.93s/it]score1 tensor([[0.5820],
-        [0.5469],
-        [0.6836],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4160, 0.5195, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:50,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 08:52:50,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4641.78 | bwd_inner_microstep: 4636.86 | bwd_allreduce_microstep: 4.84 | step_microstep: 40.21
-[2025-01-25 08:52:50,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4641.80 | bwd_inner: 4636.86 | bwd_allreduce: 4.88 | step: 40.22
-  1%|          | 48/5800 [06:20<11:04:41,  6.93s/it]                                                    {'loss': 0.1445, 'grad_norm': 16.47435188293457, 'learning_rate': 1.103448275862069e-05, 'epoch': 0.41}
-  1%|          | 48/5800 [06:20<11:04:41,  6.93s/it]score1 tensor([[0.7109],
-        [0.8359],
-        [0.6367],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.5078, 0.4609, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2012, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:52:57,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 08:52:57,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4639.46 | bwd_inner_microstep: 4634.65 | bwd_allreduce_microstep: 4.74 | step_microstep: 40.33
-[2025-01-25 08:52:57,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4639.48 | bwd_inner: 4634.65 | bwd_allreduce: 4.77 | step: 40.34
-  1%|          | 49/5800 [06:27<11:04:29,  6.93s/it]                                                    {'loss': 0.2012, 'grad_norm': 27.35670280456543, 'learning_rate': 1.1264367816091955e-05, 'epoch': 0.42}
-  1%|          | 49/5800 [06:27<11:04:29,  6.93s/it]score1 tensor([[0.5156],
-        [0.6641],
-        [0.3887],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.6172, 0.4043, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:04,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 08:53:04,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.71 | bwd_microstep: 4641.60 | bwd_inner_microstep: 4637.00 | bwd_allreduce_microstep: 4.53 | step_microstep: 39.44
-[2025-01-25 08:53:04,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.69 | bwd: 4641.62 | bwd_inner: 4637.00 | bwd_allreduce: 4.56 | step: 39.44
-  1%|          | 50/5800 [06:34<11:04:11,  6.93s/it]                                                    {'loss': 0.0562, 'grad_norm': 18.8441162109375, 'learning_rate': 1.1494252873563218e-05, 'epoch': 0.43}
-  1%|          | 50/5800 [06:34<11:04:11,  6.93s/it]score1 tensor([[0.6250],
-        [0.7969],
-        [0.4629],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.3926, 0.1787, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2617, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:11,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 08:53:11,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.53 | bwd_microstep: 4648.25 | bwd_inner_microstep: 4643.48 | bwd_allreduce_microstep: 4.67 | step_microstep: 40.96
-[2025-01-25 08:53:11,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.51 | bwd: 4648.28 | bwd_inner: 4643.48 | bwd_allreduce: 4.74 | step: 40.97
-  1%|          | 51/5800 [06:41<11:04:02,  6.93s/it]                                                    {'loss': 0.2617, 'grad_norm': 26.630544662475586, 'learning_rate': 1.1724137931034483e-05, 'epoch': 0.44}
-  1%|          | 51/5800 [06:41<11:04:02,  6.93s/it]score1 tensor([[0.5430],
-        [0.6328],
-        [0.6680],
-        [1.0391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5625, 0.5664, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:18,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 08:53:18,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.51 | bwd_microstep: 4634.82 | bwd_inner_microstep: 4629.98 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.13
-[2025-01-25 08:53:18,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.48 | bwd: 4634.86 | bwd_inner: 4629.98 | bwd_allreduce: 4.76 | step: 42.14
-  1%|          | 52/5800 [06:47<11:03:44,  6.93s/it]                                                    {'loss': 0.168, 'grad_norm': 28.11731719970703, 'learning_rate': 1.1954022988505748e-05, 'epoch': 0.45}
-  1%|          | 52/5800 [06:47<11:03:44,  6.93s/it]score1 tensor([[0.4473],
-        [0.7578],
-        [0.5078],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5469, 0.4023, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:24,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 08:53:24,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4645.76 | bwd_inner_microstep: 4641.02 | bwd_allreduce_microstep: 4.66 | step_microstep: 40.62
-[2025-01-25 08:53:24,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4645.78 | bwd_inner: 4641.02 | bwd_allreduce: 4.70 | step: 40.63
-  1%|          | 53/5800 [06:54<11:03:41,  6.93s/it]                                                    {'loss': 0.1152, 'grad_norm': 15.232158660888672, 'learning_rate': 1.2183908045977013e-05, 'epoch': 0.46}
-  1%|          | 53/5800 [06:54<11:03:41,  6.93s/it]score1 tensor([[0.6367],
-        [0.4805],
-        [0.5156],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5469, 0.4766, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:31,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 08:53:31,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.33 | bwd_microstep: 4635.75 | bwd_inner_microstep: 4630.99 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.85
-[2025-01-25 08:53:31,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.31 | bwd: 4635.78 | bwd_inner: 4630.99 | bwd_allreduce: 4.71 | step: 41.85
-  1%|          | 54/5800 [07:01<11:03:22,  6.93s/it]                                                    {'loss': 0.0664, 'grad_norm': 10.480086326599121, 'learning_rate': 1.2413793103448277e-05, 'epoch': 0.47}
-  1%|          | 54/5800 [07:01<11:03:22,  6.93s/it]score1 tensor([[0.2969],
-        [0.6719],
-        [0.5547],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.6875, 0.3809, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0708, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:38,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 08:53:38,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.91 | bwd_microstep: 4650.54 | bwd_inner_microstep: 4645.75 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.12
-[2025-01-25 08:53:38,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.88 | bwd: 4650.56 | bwd_inner: 4645.75 | bwd_allreduce: 4.75 | step: 41.13
-  1%|          | 55/5800 [07:08<11:03:31,  6.93s/it]                                                    {'loss': 0.0708, 'grad_norm': 8.248854637145996, 'learning_rate': 1.2643678160919542e-05, 'epoch': 0.47}
-  1%|          | 55/5800 [07:08<11:03:31,  6.93s/it]score1 tensor([[0.5625],
-        [0.3320],
-        [0.4902],
-        [0.1885]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4004, 0.5625, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:45,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 08:53:45,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.98 | bwd_microstep: 4636.44 | bwd_inner_microstep: 4631.79 | bwd_allreduce_microstep: 4.57 | step_microstep: 39.36
-[2025-01-25 08:53:45,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.96 | bwd: 4636.46 | bwd_inner: 4631.79 | bwd_allreduce: 4.60 | step: 39.38
-  1%|          | 56/5800 [07:15<11:03:07,  6.93s/it]                                                    {'loss': 0.1406, 'grad_norm': 11.878772735595703, 'learning_rate': 1.2873563218390805e-05, 'epoch': 0.48}
-  1%|          | 56/5800 [07:15<11:03:07,  6.93s/it]score1 tensor([[0.7070],
-        [0.4902],
-        [0.4258],
-        [0.9180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5078, 0.4004, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:52,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 08:53:52,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4635.53 | bwd_inner_microstep: 4631.01 | bwd_allreduce_microstep: 4.45 | step_microstep: 39.72
-[2025-01-25 08:53:52,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.07 | bwd: 4635.55 | bwd_inner: 4631.01 | bwd_allreduce: 4.48 | step: 39.73
-  1%|          | 57/5800 [07:22<11:02:43,  6.92s/it]                                                    {'loss': 0.084, 'grad_norm': 16.92253303527832, 'learning_rate': 1.310344827586207e-05, 'epoch': 0.49}
-  1%|          | 57/5800 [07:22<11:02:43,  6.92s/it]score1 tensor([[0.4531],
-        [0.4238],
-        [0.3008],
-        [0.2324]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.5508, 0.5234, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2129, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:53:59,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 08:53:59,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.73 | bwd_microstep: 4645.77 | bwd_inner_microstep: 4641.27 | bwd_allreduce_microstep: 4.40 | step_microstep: 40.20
-[2025-01-25 08:53:59,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4645.79 | bwd_inner: 4641.27 | bwd_allreduce: 4.44 | step: 40.21
-  1%|          | 58/5800 [07:29<11:02:47,  6.93s/it]                                                    {'loss': 0.2129, 'grad_norm': 13.793370246887207, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.5}
-  1%|          | 58/5800 [07:29<11:02:47,  6.93s/it]score1 tensor([[0.2109],
-        [0.4766],
-        [0.4199],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.6797, 0.5664, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1592, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:06,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 08:54:06,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.26 | bwd_microstep: 4640.72 | bwd_inner_microstep: 4635.76 | bwd_allreduce_microstep: 4.88 | step_microstep: 40.87
-[2025-01-25 08:54:06,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.23 | bwd: 4640.74 | bwd_inner: 4635.76 | bwd_allreduce: 4.92 | step: 40.88
-  1%|          | 59/5800 [07:36<11:02:49,  6.93s/it]                                                    {'loss': 0.1592, 'grad_norm': 24.097936630249023, 'learning_rate': 1.3563218390804598e-05, 'epoch': 0.51}
-  1%|          | 59/5800 [07:36<11:02:49,  6.93s/it]score1 tensor([[0.3906],
-        [0.7344],
-        [0.2930],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.6641, 0.4863, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0923, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:13,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 08:54:13,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.59 | bwd_microstep: 4641.11 | bwd_inner_microstep: 4636.15 | bwd_allreduce_microstep: 4.89 | step_microstep: 41.47
-[2025-01-25 08:54:13,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.56 | bwd: 4641.13 | bwd_inner: 4636.15 | bwd_allreduce: 4.92 | step: 41.47
-  1%|          | 60/5800 [07:43<11:02:51,  6.93s/it]                                                    {'loss': 0.0923, 'grad_norm': 13.67313289642334, 'learning_rate': 1.3793103448275863e-05, 'epoch': 0.52}
-  1%|          | 60/5800 [07:43<11:02:51,  6.93s/it]score1 tensor([[0.5625],
-        [0.4609],
-        [0.5156],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5352, 0.5352, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:20,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 08:54:20,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.40 | bwd_microstep: 4646.57 | bwd_inner_microstep: 4641.53 | bwd_allreduce_microstep: 4.95 | step_microstep: 47.63
-[2025-01-25 08:54:20,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.38 | bwd: 4646.59 | bwd_inner: 4641.53 | bwd_allreduce: 5.00 | step: 47.63
-  1%|          | 61/5800 [07:50<11:03:20,  6.94s/it]                                                    {'loss': 0.0752, 'grad_norm': 13.204339981079102, 'learning_rate': 1.4022988505747128e-05, 'epoch': 0.53}
-  1%|          | 61/5800 [07:50<11:03:20,  6.94s/it]score1 tensor([[0.3965],
-        [0.6406],
-        [0.3008],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.6445, 0.5156, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0864, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:27,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 08:54:27,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.47 | bwd_microstep: 4645.60 | bwd_inner_microstep: 4640.80 | bwd_allreduce_microstep: 4.73 | step_microstep: 40.81
-[2025-01-25 08:54:27,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.44 | bwd: 4645.62 | bwd_inner: 4640.79 | bwd_allreduce: 4.76 | step: 40.82
-  1%|          | 62/5800 [07:57<11:03:01,  6.93s/it]                                                    {'loss': 0.0864, 'grad_norm': 24.703266143798828, 'learning_rate': 1.4252873563218392e-05, 'epoch': 0.53}
-  1%|          | 62/5800 [07:57<11:03:01,  6.93s/it]score1 tensor([[0.5430],
-        [0.5117],
-        [0.5430],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4512, 0.4785, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:34,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:54:34,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.56 | bwd_microstep: 4641.79 | bwd_inner_microstep: 4636.89 | bwd_allreduce_microstep: 4.83 | step_microstep: 41.32
-[2025-01-25 08:54:34,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.53 | bwd: 4641.82 | bwd_inner: 4636.89 | bwd_allreduce: 4.86 | step: 41.33
-  1%|          | 63/5800 [08:04<11:02:40,  6.93s/it]                                                    {'loss': 0.0591, 'grad_norm': 13.655978202819824, 'learning_rate': 1.4482758620689657e-05, 'epoch': 0.54}
-  1%|          | 63/5800 [08:04<11:02:40,  6.93s/it]score1 tensor([[0.5078],
-        [0.3242],
-        [0.5859],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4375, 0.5508, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:41,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:54:41,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.08 | bwd_microstep: 4639.16 | bwd_inner_microstep: 4634.42 | bwd_allreduce_microstep: 4.67 | step_microstep: 40.67
-[2025-01-25 08:54:41,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4639.18 | bwd_inner: 4634.42 | bwd_allreduce: 4.70 | step: 40.68
-  1%|          | 64/5800 [08:11<11:02:12,  6.93s/it]                                                    {'loss': 0.0488, 'grad_norm': 13.402694702148438, 'learning_rate': 1.471264367816092e-05, 'epoch': 0.55}
-  1%|          | 64/5800 [08:11<11:02:12,  6.93s/it]score1 tensor([[0.4824],
-        [0.3652],
-        [0.5156],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.4473, 0.3906, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0894, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:48,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 08:54:48,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.01 | bwd_microstep: 4642.69 | bwd_inner_microstep: 4637.93 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.49
-[2025-01-25 08:54:48,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4642.71 | bwd_inner: 4637.93 | bwd_allreduce: 4.72 | step: 41.50
-  1%|          | 65/5800 [08:18<11:02:02,  6.93s/it]                                                    {'loss': 0.0894, 'grad_norm': 13.803966522216797, 'learning_rate': 1.4942528735632185e-05, 'epoch': 0.56}
-  1%|          | 65/5800 [08:18<11:02:02,  6.93s/it]score1 tensor([[0.5078],
-        [0.4375],
-        [0.8906],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5547, 0.6484, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1279, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:54:55,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 08:54:55,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.86 | bwd_microstep: 4640.72 | bwd_inner_microstep: 4635.96 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.33
-[2025-01-25 08:54:55,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.83 | bwd: 4640.74 | bwd_inner: 4635.96 | bwd_allreduce: 4.72 | step: 41.33
-  1%|          | 66/5800 [08:24<11:01:54,  6.93s/it]                                                    {'loss': 0.1279, 'grad_norm': 13.628310203552246, 'learning_rate': 1.5172413793103448e-05, 'epoch': 0.57}
-  1%|          | 66/5800 [08:24<11:01:54,  6.93s/it]score1 tensor([[0.5234],
-        [0.7656],
-        [0.5781],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4883, 0.6055, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0923, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:01,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.69 | optimizer_step: 4.36
-[2025-01-25 08:55:01,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.39 | bwd_microstep: 4641.91 | bwd_inner_microstep: 4636.40 | bwd_allreduce_microstep: 5.41 | step_microstep: 47.09
-[2025-01-25 08:55:01,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.36 | bwd: 4641.94 | bwd_inner: 4636.40 | bwd_allreduce: 5.46 | step: 47.10
-  1%|          | 67/5800 [08:31<11:02:04,  6.93s/it]                                                    {'loss': 0.0923, 'grad_norm': 13.13815689086914, 'learning_rate': 1.540229885057471e-05, 'epoch': 0.58}
-  1%|          | 67/5800 [08:31<11:02:04,  6.93s/it]score1 tensor([[0.4766],
-        [0.6133],
-        [0.6133],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4844, 0.4805, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:08,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 08:55:08,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.74 | bwd_microstep: 4601.10 | bwd_inner_microstep: 4596.24 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.46
-[2025-01-25 08:55:08,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.71 | bwd: 4601.13 | bwd_inner: 4596.24 | bwd_allreduce: 4.82 | step: 43.47
-  1%|          | 68/5800 [08:38<11:01:08,  6.92s/it]                                                    {'loss': 0.0693, 'grad_norm': 7.673307418823242, 'learning_rate': 1.563218390804598e-05, 'epoch': 0.59}
-  1%|          | 68/5800 [08:38<11:01:08,  6.92s/it]score1 tensor([[0.7383],
-        [0.6484],
-        [0.4629],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6094, 0.3945, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1060, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:15,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.78 | optimizer_step: 4.36
-[2025-01-25 08:55:15,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.87 | bwd_microstep: 4650.50 | bwd_inner_microstep: 4645.29 | bwd_allreduce_microstep: 5.13 | step_microstep: 46.23
-[2025-01-25 08:55:15,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.84 | bwd: 4650.54 | bwd_inner: 4645.29 | bwd_allreduce: 5.16 | step: 46.23
-  1%|          | 69/5800 [08:45<11:01:49,  6.93s/it]                                                    {'loss': 0.106, 'grad_norm': 15.921873092651367, 'learning_rate': 1.586206896551724e-05, 'epoch': 0.59}
-  1%|          | 69/5800 [08:45<11:01:49,  6.93s/it]score1 tensor([[0.5430],
-        [0.5117],
-        [0.6172],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5391, 0.5586, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:22,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 08:55:22,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.22 | bwd_microstep: 4645.02 | bwd_inner_microstep: 4640.15 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.02
-[2025-01-25 08:55:22,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.18 | bwd: 4645.04 | bwd_inner: 4640.15 | bwd_allreduce: 4.84 | step: 42.03
-  1%|          | 70/5800 [08:52<11:02:13,  6.93s/it]                                                    {'loss': 0.0654, 'grad_norm': 12.804322242736816, 'learning_rate': 1.6091954022988507e-05, 'epoch': 0.6}
-  1%|          | 70/5800 [08:52<11:02:13,  6.93s/it]score1 tensor([[0.7969],
-        [0.4453],
-        [0.7031],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3262, 0.6875, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0962, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:29,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.39 | optimizer_step: 4.36
-[2025-01-25 08:55:29,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4645.81 | bwd_inner_microstep: 4640.73 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.45
-[2025-01-25 08:55:29,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.30 | bwd: 4645.83 | bwd_inner: 4640.73 | bwd_allreduce: 5.04 | step: 44.46
-  1%|          | 71/5800 [08:59<11:02:09,  6.93s/it]                                                    {'loss': 0.0962, 'grad_norm': 14.661547660827637, 'learning_rate': 1.632183908045977e-05, 'epoch': 0.61}
-  1%|          | 71/5800 [08:59<11:02:09,  6.93s/it]score1 tensor([[0.3945],
-        [0.7539],
-        [0.5469],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6523, 0.4238, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0962, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:36,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.10 | optimizer_step: 4.37
-[2025-01-25 08:55:36,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.54 | bwd_microstep: 4642.12 | bwd_inner_microstep: 4636.16 | bwd_allreduce_microstep: 5.86 | step_microstep: 55.13
-[2025-01-25 08:55:36,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.51 | bwd: 4642.14 | bwd_inner: 4636.16 | bwd_allreduce: 5.91 | step: 55.16
-  1%|          | 72/5800 [09:06<11:02:56,  6.94s/it]                                                    {'loss': 0.0962, 'grad_norm': 14.687331199645996, 'learning_rate': 1.6551724137931037e-05, 'epoch': 0.62}
-  1%|          | 72/5800 [09:06<11:02:56,  6.94s/it]score1 tensor([[0.5547],
-        [0.2451],
-        [0.5430],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3438, 0.3613, 0.6172, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1104, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:43,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.04 | optimizer_step: 4.37
-[2025-01-25 08:55:43,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.28 | bwd_microstep: 4648.63 | bwd_inner_microstep: 4640.08 | bwd_allreduce_microstep: 8.43 | step_microstep: 50.16
-[2025-01-25 08:55:43,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.24 | bwd: 4648.67 | bwd_inner: 4640.08 | bwd_allreduce: 8.50 | step: 50.16
-  1%|▏         | 73/5800 [09:13<11:03:45,  6.95s/it]                                                    {'loss': 0.1104, 'grad_norm': 5.922950744628906, 'learning_rate': 1.6781609195402298e-05, 'epoch': 0.63}
-  1%|▏         | 73/5800 [09:13<11:03:45,  6.95s/it]score1 tensor([[0.3535],
-        [0.5078],
-        [0.4844],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4160, 0.5508, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:50,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.39 | optimizer_step: 4.36
-[2025-01-25 08:55:50,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.68 | bwd_microstep: 4644.90 | bwd_inner_microstep: 4638.64 | bwd_allreduce_microstep: 6.15 | step_microstep: 47.40
-[2025-01-25 08:55:50,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.64 | bwd: 4644.93 | bwd_inner: 4638.65 | bwd_allreduce: 6.21 | step: 47.41
-  1%|▏         | 74/5800 [09:20<11:03:58,  6.96s/it]                                                    {'loss': 0.1035, 'grad_norm': 14.777176856994629, 'learning_rate': 1.7011494252873563e-05, 'epoch': 0.64}
-  1%|▏         | 74/5800 [09:20<11:03:58,  6.96s/it]score1 tensor([[0.4180],
-        [0.3789],
-        [0.2002],
-        [0.3613]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4473, 0.4473, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1279, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:55:57,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.36 | optimizer_step: 4.36
-[2025-01-25 08:55:57,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.26 | bwd_microstep: 4650.67 | bwd_inner_microstep: 4643.74 | bwd_allreduce_microstep: 6.78 | step_microstep: 51.17
-[2025-01-25 08:55:57,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.23 | bwd: 4650.71 | bwd_inner: 4643.74 | bwd_allreduce: 6.86 | step: 51.18
-  1%|▏         | 75/5800 [09:27<11:04:28,  6.96s/it]                                                    {'loss': 0.1279, 'grad_norm': 11.57616138458252, 'learning_rate': 1.7241379310344828e-05, 'epoch': 0.65}
-  1%|▏         | 75/5800 [09:27<11:04:28,  6.96s/it]score1 tensor([[0.5234],
-        [0.4023],
-        [0.2402],
-        [0.2188]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4961, 0.4648, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:04,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.35 | optimizer_step: 4.36
-[2025-01-25 08:56:04,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.09 | bwd_microstep: 4646.68 | bwd_inner_microstep: 4640.70 | bwd_allreduce_microstep: 5.89 | step_microstep: 51.75
-[2025-01-25 08:56:04,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.06 | bwd: 4646.71 | bwd_inner: 4640.70 | bwd_allreduce: 5.94 | step: 51.76
-  1%|▏         | 76/5800 [09:34<11:04:37,  6.97s/it]                                                    {'loss': 0.1523, 'grad_norm': 23.51370620727539, 'learning_rate': 1.7471264367816093e-05, 'epoch': 0.66}
-  1%|▏         | 76/5800 [09:34<11:04:37,  6.97s/it]score1 tensor([[0.3457],
-        [0.2773],
-        [0.4590],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4434, 0.5742, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1436, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:11,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.87 | optimizer_step: 4.37
-[2025-01-25 08:56:11,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.95 | bwd_microstep: 4643.93 | bwd_inner_microstep: 4636.70 | bwd_allreduce_microstep: 7.12 | step_microstep: 61.80
-[2025-01-25 08:56:11,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.92 | bwd: 4643.96 | bwd_inner: 4636.70 | bwd_allreduce: 7.19 | step: 61.82
-  1%|▏         | 77/5800 [09:41<11:05:36,  6.98s/it]                                                    {'loss': 0.1436, 'grad_norm': 23.080257415771484, 'learning_rate': 1.770114942528736e-05, 'epoch': 0.66}
-  1%|▏         | 77/5800 [09:41<11:05:36,  6.98s/it]score1 tensor([[0.2461],
-        [0.4648],
-        [0.4414],
-        [0.3516]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5664, 0.4941, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0942, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:18,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.34 | optimizer_step: 4.37
-[2025-01-25 08:56:18,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.77 | bwd_microstep: 4646.68 | bwd_inner_microstep: 4639.92 | bwd_allreduce_microstep: 6.62 | step_microstep: 51.69
-[2025-01-25 08:56:18,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.43 | bwd: 4646.71 | bwd_inner: 4639.92 | bwd_allreduce: 6.69 | step: 51.70
-  1%|▏         | 78/5800 [09:48<11:05:50,  6.98s/it]                                                    {'loss': 0.0942, 'grad_norm': 24.18284797668457, 'learning_rate': 1.7931034482758623e-05, 'epoch': 0.67}
-  1%|▏         | 78/5800 [09:48<11:05:50,  6.98s/it]score1 tensor([[0.5039],
-        [0.3652],
-        [0.5859],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4922, 0.5352, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0728, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:25,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.55 | optimizer_step: 4.36
-[2025-01-25 08:56:25,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.86 | bwd_microstep: 4636.45 | bwd_inner_microstep: 4630.14 | bwd_allreduce_microstep: 6.23 | step_microstep: 52.48
-[2025-01-25 08:56:25,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.83 | bwd: 4636.47 | bwd_inner: 4630.14 | bwd_allreduce: 6.27 | step: 52.48
-  1%|▏         | 79/5800 [09:55<11:05:15,  6.98s/it]                                                    {'loss': 0.0728, 'grad_norm': 4.174198150634766, 'learning_rate': 1.8160919540229885e-05, 'epoch': 0.68}
-  1%|▏         | 79/5800 [09:55<11:05:15,  6.98s/it]score1 tensor([[0.4316],
-        [0.7500],
-        [0.3652],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4688, 0.3691, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:32,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.34 | optimizer_step: 4.36
-[2025-01-25 08:56:32,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.30 | bwd_microstep: 4651.84 | bwd_inner_microstep: 4640.75 | bwd_allreduce_microstep: 10.94 | step_microstep: 52.64
-[2025-01-25 08:56:32,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.26 | bwd: 4651.87 | bwd_inner: 4640.75 | bwd_allreduce: 11.04 | step: 52.64
-  1%|▏         | 80/5800 [10:02<11:05:13,  6.98s/it]                                                    {'loss': 0.123, 'grad_norm': 15.513971328735352, 'learning_rate': 1.839080459770115e-05, 'epoch': 0.69}
-  1%|▏         | 80/5800 [10:02<11:05:13,  6.98s/it]score1 tensor([[0.4668],
-        [0.6289],
-        [0.6758],
-        [0.7148]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4277, 0.5273, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:39,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 08:56:39,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.27 | bwd_microstep: 4637.42 | bwd_inner_microstep: 4632.49 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.83
-[2025-01-25 08:56:39,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4637.46 | bwd_inner: 4632.49 | bwd_allreduce: 4.87 | step: 44.84
-  1%|▏         | 81/5800 [10:09<11:04:08,  6.97s/it]                                                    {'loss': 0.1465, 'grad_norm': 24.538204193115234, 'learning_rate': 1.8620689655172415e-05, 'epoch': 0.7}
-  1%|▏         | 81/5800 [10:09<11:04:08,  6.97s/it]score1 tensor([[0.6211],
-        [0.5781],
-        [0.5508],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5898, 0.5391, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:46,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 08:56:46,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.27 | bwd_microstep: 4639.37 | bwd_inner_microstep: 4634.35 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.35
-[2025-01-25 08:56:46,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.24 | bwd: 4639.39 | bwd_inner: 4634.35 | bwd_allreduce: 4.98 | step: 43.35
-  1%|▏         | 82/5800 [10:16<11:02:49,  6.96s/it]                                                    {'loss': 0.0264, 'grad_norm': 6.307685852050781, 'learning_rate': 1.885057471264368e-05, 'epoch': 0.71}
-  1%|▏         | 82/5800 [10:16<11:02:49,  6.96s/it]score1 tensor([[0.6133],
-        [0.7891],
-        [0.5391],
-        [0.7148]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5312, 0.4141, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1982, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:56:53,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 08:56:53,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.20 | bwd_microstep: 4636.23 | bwd_inner_microstep: 4631.34 | bwd_allreduce_microstep: 4.82 | step_microstep: 41.91
-[2025-01-25 08:56:53,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4636.25 | bwd_inner: 4631.34 | bwd_allreduce: 4.85 | step: 41.92
-  1%|▏         | 83/5800 [10:23<11:01:44,  6.94s/it]                                                    {'loss': 0.1982, 'grad_norm': 23.707155227661133, 'learning_rate': 1.908045977011494e-05, 'epoch': 0.72}
-  1%|▏         | 83/5800 [10:23<11:01:44,  6.94s/it]score1 tensor([[0.6641],
-        [0.5547],
-        [0.6055],
-        [0.8984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5195, 0.5781, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:00,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 08:57:00,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.01 | bwd_microstep: 4633.60 | bwd_inner_microstep: 4628.80 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.05
-[2025-01-25 08:57:00,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.98 | bwd: 4633.62 | bwd_inner: 4628.80 | bwd_allreduce: 4.76 | step: 42.03
-  1%|▏         | 84/5800 [10:30<11:00:50,  6.94s/it]                                                    {'loss': 0.1348, 'grad_norm': 24.25497055053711, 'learning_rate': 1.931034482758621e-05, 'epoch': 0.72}
-  1%|▏         | 84/5800 [10:30<11:00:50,  6.94s/it]score1 tensor([[0.5312],
-        [0.4863],
-        [0.4648],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.3887, 0.5352, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:07,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 08:57:07,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4640.29 | bwd_inner_microstep: 4635.79 | bwd_allreduce_microstep: 4.42 | step_microstep: 42.12
-[2025-01-25 08:57:07,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.10 | bwd: 4640.31 | bwd_inner: 4635.79 | bwd_allreduce: 4.46 | step: 42.13
-  1%|▏         | 85/5800 [10:37<11:00:19,  6.93s/it]                                                    {'loss': 0.0757, 'grad_norm': 13.278458595275879, 'learning_rate': 1.9540229885057475e-05, 'epoch': 0.73}
-  1%|▏         | 85/5800 [10:37<11:00:19,  6.93s/it]score1 tensor([[0.5742],
-        [0.5508],
-        [0.5352],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4551, 0.4141, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1069, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:14,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 08:57:14,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4641.08 | bwd_inner_microstep: 4636.43 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.00
-[2025-01-25 08:57:14,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4641.10 | bwd_inner: 4636.43 | bwd_allreduce: 4.61 | step: 41.01
-  1%|▏         | 86/5800 [10:44<10:59:58,  6.93s/it]                                                    {'loss': 0.1069, 'grad_norm': 23.030202865600586, 'learning_rate': 1.9770114942528737e-05, 'epoch': 0.74}
-  1%|▏         | 86/5800 [10:44<10:59:58,  6.93s/it]score1 tensor([[0.4961],
-        [0.5312],
-        [0.4941],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4844, 0.5625, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:20,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 08:57:20,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.26 | bwd_microstep: 4641.04 | bwd_inner_microstep: 4636.14 | bwd_allreduce_microstep: 4.83 | step_microstep: 40.46
-[2025-01-25 08:57:20,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.22 | bwd: 4641.06 | bwd_inner: 4636.14 | bwd_allreduce: 4.86 | step: 40.47
-  2%|▏         | 87/5800 [10:50<10:59:43,  6.93s/it]                                                    {'loss': 0.0605, 'grad_norm': 11.882909774780273, 'learning_rate': 2e-05, 'epoch': 0.75}
-  2%|▏         | 87/5800 [10:50<10:59:43,  6.93s/it]score1 tensor([[0.4043],
-        [0.5664],
-        [0.3574],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5039, 0.5234, 0.3086], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0908, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:27,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 08:57:27,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.07 | bwd_microstep: 4634.42 | bwd_inner_microstep: 4629.71 | bwd_allreduce_microstep: 4.64 | step_microstep: 45.49
-[2025-01-25 08:57:27,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.04 | bwd: 4634.45 | bwd_inner: 4629.71 | bwd_allreduce: 4.68 | step: 45.49
-  2%|▏         | 88/5800 [10:57<10:59:37,  6.93s/it]                                                    {'loss': 0.0908, 'grad_norm': 4.933067321777344, 'learning_rate': 2.0229885057471267e-05, 'epoch': 0.76}
-  2%|▏         | 88/5800 [10:57<10:59:37,  6.93s/it]score1 tensor([[0.3770],
-        [0.4727],
-        [0.4688],
-        [0.3066]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4375, 0.4863, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:34,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 08:57:34,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4645.55 | bwd_inner_microstep: 4639.62 | bwd_allreduce_microstep: 5.85 | step_microstep: 42.33
-[2025-01-25 08:57:34,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.07 | bwd: 4645.57 | bwd_inner: 4639.62 | bwd_allreduce: 5.89 | step: 42.34
-  2%|▏         | 89/5800 [11:04<10:59:38,  6.93s/it]                                                    {'loss': 0.0688, 'grad_norm': 11.400203704833984, 'learning_rate': 2.0459770114942528e-05, 'epoch': 0.77}
-  2%|▏         | 89/5800 [11:04<10:59:38,  6.93s/it]score1 tensor([[0.3574],
-        [0.3047],
-        [0.3926],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4570, 0.5625, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1660, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:41,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.41 | optimizer_step: 4.36
-[2025-01-25 08:57:41,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.42 | bwd_microstep: 4639.44 | bwd_inner_microstep: 4635.49 | bwd_allreduce_microstep: 3.89 | step_microstep: 38.78
-[2025-01-25 08:57:41,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.40 | bwd: 4639.46 | bwd_inner: 4635.49 | bwd_allreduce: 3.92 | step: 38.79
-  2%|▏         | 90/5800 [11:11<10:59:09,  6.93s/it]                                                    {'loss': 0.166, 'grad_norm': 21.40899658203125, 'learning_rate': 2.0689655172413797e-05, 'epoch': 0.78}
-  2%|▏         | 90/5800 [11:11<10:59:09,  6.93s/it]score1 tensor([[0.4082],
-        [0.3008],
-        [0.5117],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4551, 0.4785, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:48,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 08:57:48,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.56 | bwd_microstep: 4644.41 | bwd_inner_microstep: 4639.63 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.29
-[2025-01-25 08:57:48,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.53 | bwd: 4644.44 | bwd_inner: 4639.63 | bwd_allreduce: 4.74 | step: 41.30
-  2%|▏         | 91/5800 [11:18<10:59:13,  6.93s/it]                                                    {'loss': 0.127, 'grad_norm': 12.48677921295166, 'learning_rate': 2.0919540229885058e-05, 'epoch': 0.78}
-  2%|▏         | 91/5800 [11:18<10:59:13,  6.93s/it]score1 tensor([[0.3359],
-        [0.3320],
-        [0.4531],
-        [0.2285]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5195, 0.5586, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:57:55,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.36
-[2025-01-25 08:57:55,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.56 | bwd_microstep: 4633.72 | bwd_inner_microstep: 4628.70 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.50
-[2025-01-25 08:57:55,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.53 | bwd: 4633.75 | bwd_inner: 4628.70 | bwd_allreduce: 4.97 | step: 42.51
-  2%|▏         | 92/5800 [11:25<10:58:46,  6.92s/it]                                                    {'loss': 0.1445, 'grad_norm': 21.652164459228516, 'learning_rate': 2.1149425287356323e-05, 'epoch': 0.79}
-  2%|▏         | 92/5800 [11:25<10:58:46,  6.92s/it]score1 tensor([[0.3379],
-        [0.2910],
-        [0.5898],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5078, 0.4453, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1611, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:02,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 08:58:02,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.52 | bwd_microstep: 4643.22 | bwd_inner_microstep: 4638.64 | bwd_allreduce_microstep: 4.51 | step_microstep: 40.03
-[2025-01-25 08:58:02,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.50 | bwd: 4643.25 | bwd_inner: 4638.64 | bwd_allreduce: 4.55 | step: 40.03
-  2%|▏         | 93/5800 [11:32<10:58:41,  6.93s/it]                                                    {'loss': 0.1611, 'grad_norm': 5.345050811767578, 'learning_rate': 2.1379310344827585e-05, 'epoch': 0.8}
-  2%|▏         | 93/5800 [11:32<10:58:41,  6.93s/it]score1 tensor([[0.4688],
-        [0.6562],
-        [0.4512],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6172, 0.4980, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:09,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 08:58:09,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.88 | bwd_microstep: 4636.08 | bwd_inner_microstep: 4631.38 | bwd_allreduce_microstep: 4.62 | step_microstep: 40.61
-[2025-01-25 08:58:09,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.85 | bwd: 4636.10 | bwd_inner: 4631.38 | bwd_allreduce: 4.65 | step: 40.62
-  2%|▏         | 94/5800 [11:39<10:58:17,  6.92s/it]                                                    {'loss': 0.0459, 'grad_norm': 5.033718585968018, 'learning_rate': 2.1609195402298853e-05, 'epoch': 0.81}
-  2%|▏         | 94/5800 [11:39<10:58:17,  6.92s/it]score1 tensor([[0.4785],
-        [0.4688],
-        [0.6406],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4434, 0.5273, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:16,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 08:58:16,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.03 | bwd_microstep: 4636.88 | bwd_inner_microstep: 4631.81 | bwd_allreduce_microstep: 4.97 | step_microstep: 41.95
-[2025-01-25 08:58:16,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.01 | bwd: 4636.90 | bwd_inner: 4631.82 | bwd_allreduce: 5.02 | step: 41.96
-  2%|▏         | 95/5800 [11:46<10:58:14,  6.92s/it]                                                    {'loss': 0.0483, 'grad_norm': 11.89331340789795, 'learning_rate': 2.183908045977012e-05, 'epoch': 0.82}
-  2%|▏         | 95/5800 [11:46<10:58:14,  6.92s/it]score1 tensor([[0.4219],
-        [0.3828],
-        [0.6523],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.3457, 0.4648, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0947, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:23,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 08:58:23,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4636.40 | bwd_inner_microstep: 4631.39 | bwd_allreduce_microstep: 4.93 | step_microstep: 40.50
-[2025-01-25 08:58:23,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4636.43 | bwd_inner: 4631.38 | bwd_allreduce: 4.96 | step: 40.52
-  2%|▏         | 96/5800 [11:53<10:58:19,  6.92s/it]                                                    {'loss': 0.0947, 'grad_norm': 13.194902420043945, 'learning_rate': 2.206896551724138e-05, 'epoch': 0.83}
-  2%|▏         | 96/5800 [11:53<10:58:19,  6.92s/it]score1 tensor([[0.4590],
-        [0.5078],
-        [0.6328],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4512, 0.4082, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1128, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:30,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:58:30,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.90 | bwd_microstep: 4638.24 | bwd_inner_microstep: 4633.53 | bwd_allreduce_microstep: 4.64 | step_microstep: 40.91
-[2025-01-25 08:58:30,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.87 | bwd: 4638.27 | bwd_inner: 4633.53 | bwd_allreduce: 4.68 | step: 40.92
-  2%|▏         | 97/5800 [12:00<10:57:59,  6.92s/it]                                                    {'loss': 0.1128, 'grad_norm': 23.060649871826172, 'learning_rate': 2.229885057471265e-05, 'epoch': 0.84}
-  2%|▏         | 97/5800 [12:00<10:57:59,  6.92s/it]score1 tensor([[0.5156],
-        [0.5898],
-        [0.9922],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.6953, 0.6367, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1533, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:37,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 08:58:37,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.40 | bwd_microstep: 4640.65 | bwd_inner_microstep: 4635.94 | bwd_allreduce_microstep: 4.63 | step_microstep: 40.96
-[2025-01-25 08:58:37,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.36 | bwd: 4640.67 | bwd_inner: 4635.94 | bwd_allreduce: 4.66 | step: 40.97
-  2%|▏         | 98/5800 [12:07<10:58:04,  6.92s/it]                                                    {'loss': 0.1533, 'grad_norm': 12.464479446411133, 'learning_rate': 2.252873563218391e-05, 'epoch': 0.84}
-  2%|▏         | 98/5800 [12:07<10:58:04,  6.92s/it]score1 tensor([[0.6641],
-        [0.3398],
-        [0.2559],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4043, 0.7031, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1543, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:44,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 08:58:44,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.69 | bwd_microstep: 4637.12 | bwd_inner_microstep: 4632.26 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.54
-[2025-01-25 08:58:44,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.67 | bwd: 4637.14 | bwd_inner: 4632.26 | bwd_allreduce: 4.82 | step: 43.55
-  2%|▏         | 99/5800 [12:14<10:57:53,  6.92s/it]                                                    {'loss': 0.1543, 'grad_norm': 12.383063316345215, 'learning_rate': 2.2758620689655175e-05, 'epoch': 0.85}
-  2%|▏         | 99/5800 [12:14<10:57:53,  6.92s/it]score1 tensor([[0.6211],
-        [0.5391],
-        [0.4648],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5664, 0.4414, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:50,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 08:58:50,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.50 | bwd_microstep: 4637.63 | bwd_inner_microstep: 4633.07 | bwd_allreduce_microstep: 4.49 | step_microstep: 40.65
-[2025-01-25 08:58:50,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.47 | bwd: 4637.66 | bwd_inner: 4633.07 | bwd_allreduce: 4.53 | step: 40.66
-  2%|▏         | 100/5800 [12:20<10:57:51,  6.92s/it]                                                     {'loss': 0.0303, 'grad_norm': 14.865701675415039, 'learning_rate': 2.2988505747126437e-05, 'epoch': 0.86}
-  2%|▏         | 100/5800 [12:20<10:57:51,  6.92s/it]score1 tensor([[0.4785],
-        [0.2441],
-        [0.4238],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3340, 0.5664, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:58:57,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 08:58:57,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.57 | bwd_microstep: 4645.31 | bwd_inner_microstep: 4640.45 | bwd_allreduce_microstep: 4.79 | step_microstep: 40.91
-[2025-01-25 08:58:57,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.55 | bwd: 4645.33 | bwd_inner: 4640.45 | bwd_allreduce: 4.82 | step: 40.92
-  2%|▏         | 101/5800 [12:27<10:57:53,  6.93s/it]                                                     {'loss': 0.0771, 'grad_norm': 22.319509506225586, 'learning_rate': 2.3218390804597705e-05, 'epoch': 0.87}
-  2%|▏         | 101/5800 [12:27<10:57:53,  6.93s/it]score1 tensor([[0.3965],
-        [0.5469],
-        [0.5117],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5664, 0.6367, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0884, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:04,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 08:59:04,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.87 | bwd_microstep: 4646.87 | bwd_inner_microstep: 4642.29 | bwd_allreduce_microstep: 4.51 | step_microstep: 40.00
-[2025-01-25 08:59:04,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.84 | bwd: 4646.90 | bwd_inner: 4642.29 | bwd_allreduce: 4.54 | step: 40.01
-  2%|▏         | 102/5800 [12:34<10:58:00,  6.93s/it]                                                     {'loss': 0.0884, 'grad_norm': 12.487683296203613, 'learning_rate': 2.3448275862068967e-05, 'epoch': 0.88}
-  2%|▏         | 102/5800 [12:34<10:58:00,  6.93s/it]score1 tensor([[0.5000],
-        [0.3789],
-        [0.5742],
-        [0.8281]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4238, 0.5312, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0991, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:11,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.84 | optimizer_step: 4.36
-[2025-01-25 08:59:11,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4648.34 | bwd_inner_microstep: 4643.50 | bwd_allreduce_microstep: 4.78 | step_microstep: 46.12
-[2025-01-25 08:59:11,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.60 | bwd: 4648.37 | bwd_inner: 4643.50 | bwd_allreduce: 4.81 | step: 46.13
-  2%|▏         | 103/5800 [12:41<10:58:22,  6.93s/it]                                                     {'loss': 0.0991, 'grad_norm': 12.622773170471191, 'learning_rate': 2.367816091954023e-05, 'epoch': 0.89}
-  2%|▏         | 103/5800 [12:41<10:58:22,  6.93s/it]score1 tensor([[0.6250],
-        [0.4570],
-        [0.7422],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5156, 0.6172, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1079, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:18,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.33 | optimizer_step: 4.37
-[2025-01-25 08:59:18,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.75 | bwd_microstep: 4645.40 | bwd_inner_microstep: 4640.46 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.96
-[2025-01-25 08:59:18,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.73 | bwd: 4645.42 | bwd_inner: 4640.46 | bwd_allreduce: 4.90 | step: 42.96
-  2%|▏         | 104/5800 [12:48<10:58:19,  6.93s/it]                                                     {'loss': 0.1079, 'grad_norm': 4.374044418334961, 'learning_rate': 2.3908045977011497e-05, 'epoch': 0.9}
-  2%|▏         | 104/5800 [12:48<10:58:19,  6.93s/it]score1 tensor([[0.1602],
-        [0.5156],
-        [0.5273],
-        [0.1826]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.4277, 0.5469, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:25,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 08:59:25,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.50 | bwd_microstep: 4638.91 | bwd_inner_microstep: 4634.02 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.43
-[2025-01-25 08:59:25,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.47 | bwd: 4638.93 | bwd_inner: 4634.02 | bwd_allreduce: 4.83 | step: 41.43
-  2%|▏         | 105/5800 [12:55<10:58:18,  6.94s/it]                                                     {'loss': 0.1191, 'grad_norm': 11.597214698791504, 'learning_rate': 2.413793103448276e-05, 'epoch': 0.91}
-  2%|▏         | 105/5800 [12:55<10:58:18,  6.94s/it]score1 tensor([[0.6172],
-        [0.6836],
-        [0.5273],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.6289, 0.4004, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1011, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:32,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 08:59:32,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.42 | bwd_microstep: 4639.43 | bwd_inner_microstep: 4634.66 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.48
-[2025-01-25 08:59:32,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.39 | bwd: 4639.45 | bwd_inner: 4634.66 | bwd_allreduce: 4.72 | step: 41.48
-  2%|▏         | 106/5800 [13:02<10:58:02,  6.93s/it]                                                     {'loss': 0.1011, 'grad_norm': 22.702281951904297, 'learning_rate': 2.4367816091954027e-05, 'epoch': 0.91}
-  2%|▏         | 106/5800 [13:02<10:58:02,  6.93s/it]score1 tensor([[0.4219],
-        [0.5352],
-        [0.6641],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4844, 0.5391, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:39,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 08:59:39,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.27 | bwd_microstep: 4640.52 | bwd_inner_microstep: 4635.86 | bwd_allreduce_microstep: 4.59 | step_microstep: 41.74
-[2025-01-25 08:59:39,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4640.54 | bwd_inner: 4635.86 | bwd_allreduce: 4.62 | step: 41.75
-  2%|▏         | 107/5800 [13:09<10:57:35,  6.93s/it]                                                     {'loss': 0.0737, 'grad_norm': 11.36104965209961, 'learning_rate': 2.4597701149425288e-05, 'epoch': 0.92}
-  2%|▏         | 107/5800 [13:09<10:57:35,  6.93s/it]score1 tensor([[0.5586],
-        [0.5273],
-        [0.5195],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4941, 0.4180, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:46,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 08:59:46,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4641.15 | bwd_inner_microstep: 4636.37 | bwd_allreduce_microstep: 4.71 | step_microstep: 40.12
-[2025-01-25 08:59:46,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.28 | bwd: 4641.17 | bwd_inner: 4636.37 | bwd_allreduce: 4.75 | step: 40.12
-  2%|▏         | 108/5800 [13:16<10:57:13,  6.93s/it]                                                     {'loss': 0.0562, 'grad_norm': 22.105634689331055, 'learning_rate': 2.4827586206896553e-05, 'epoch': 0.93}
-  2%|▏         | 108/5800 [13:16<10:57:13,  6.93s/it]score1 tensor([[0.6445],
-        [0.5391],
-        [0.4219],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5195, 0.4902, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 08:59:53,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 08:59:53,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4644.57 | bwd_inner_microstep: 4639.49 | bwd_allreduce_microstep: 5.00 | step_microstep: 41.99
-[2025-01-25 08:59:53,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4644.59 | bwd_inner: 4639.49 | bwd_allreduce: 5.03 | step: 42.00
-  2%|▏         | 109/5800 [13:23<10:57:15,  6.93s/it]                                                     {'loss': 0.0376, 'grad_norm': 3.704374313354492, 'learning_rate': 2.5057471264367815e-05, 'epoch': 0.94}
-  2%|▏         | 109/5800 [13:23<10:57:15,  6.93s/it]score1 tensor([[0.3555],
-        [0.3965],
-        [0.4590],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5625, 0.4941, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:00:00,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 09:00:00,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.81 | bwd_microstep: 4637.57 | bwd_inner_microstep: 4632.82 | bwd_allreduce_microstep: 4.67 | step_microstep: 39.00
-[2025-01-25 09:00:00,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.78 | bwd: 4637.59 | bwd_inner: 4632.82 | bwd_allreduce: 4.71 | step: 39.01
-  2%|▏         | 110/5800 [13:30<10:56:44,  6.93s/it]                                                     {'loss': 0.1113, 'grad_norm': 21.243316650390625, 'learning_rate': 2.5287356321839083e-05, 'epoch': 0.95}
-  2%|▏         | 110/5800 [13:30<10:56:44,  6.93s/it]score1 tensor([[0.4785],
-        [0.4258],
-        [0.4180],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5039, 0.5391, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:00:07,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 09:00:07,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4646.45 | bwd_inner_microstep: 4641.84 | bwd_allreduce_microstep: 4.53 | step_microstep: 41.82
-[2025-01-25 09:00:07,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4646.47 | bwd_inner: 4641.84 | bwd_allreduce: 4.56 | step: 41.83
-  2%|▏         | 111/5800 [13:37<10:56:43,  6.93s/it]                                                     {'loss': 0.0796, 'grad_norm': 21.431537628173828, 'learning_rate': 2.551724137931035e-05, 'epoch': 0.96}
-  2%|▏         | 111/5800 [13:37<10:56:43,  6.93s/it]score1 tensor([[0.4336],
-        [0.5000],
-        [0.5469],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.6094, 0.6211, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0713, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:00:14,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 09:00:14,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.79 | bwd_microstep: 4639.89 | bwd_inner_microstep: 4635.16 | bwd_allreduce_microstep: 4.65 | step_microstep: 44.51
-[2025-01-25 09:00:14,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.76 | bwd: 4639.91 | bwd_inner: 4635.16 | bwd_allreduce: 4.68 | step: 44.52
-  2%|▏         | 112/5800 [13:44<10:56:54,  6.93s/it]                                                     {'loss': 0.0713, 'grad_norm': 12.542961120605469, 'learning_rate': 2.574712643678161e-05, 'epoch': 0.97}
-  2%|▏         | 112/5800 [13:44<10:56:54,  6.93s/it]score1 tensor([[0.4590],
-        [0.4199],
-        [0.3066],
-        [0.2031]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4395, 0.4648, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0967, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:00:21,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 09:00:21,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.47 | bwd_microstep: 4647.90 | bwd_inner_microstep: 4643.00 | bwd_allreduce_microstep: 4.80 | step_microstep: 40.47
-[2025-01-25 09:00:21,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.45 | bwd: 4647.93 | bwd_inner: 4643.00 | bwd_allreduce: 4.86 | step: 40.48
-  2%|▏         | 113/5800 [13:51<10:56:54,  6.93s/it]                                                     {'loss': 0.0967, 'grad_norm': 20.88558006286621, 'learning_rate': 2.597701149425288e-05, 'epoch': 0.97}
-  2%|▏         | 113/5800 [13:51<10:56:54,  6.93s/it]score1 tensor([[0.5195],
-        [0.4902],
-        [0.6211],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5469, 0.5000, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:00:28,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 09:00:28,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.55 | bwd_microstep: 4639.53 | bwd_inner_microstep: 4634.66 | bwd_allreduce_microstep: 4.79 | step_microstep: 51.51
-[2025-01-25 09:00:28,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.52 | bwd: 4639.55 | bwd_inner: 4634.66 | bwd_allreduce: 4.83 | step: 51.52
-  2%|▏         | 114/5800 [13:58<10:57:06,  6.93s/it]                                                     {'loss': 0.0532, 'grad_norm': 3.99465274810791, 'learning_rate': 2.620689655172414e-05, 'epoch': 0.98}
-  2%|▏         | 114/5800 [13:58<10:57:06,  6.93s/it]score1 tensor([[0.6797],
-        [0.5391],
-        [0.5586],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4980, 0.5781, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:00:34,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 09:00:34,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4650.34 | bwd_inner_microstep: 4645.17 | bwd_allreduce_microstep: 5.07 | step_microstep: 41.74
-[2025-01-25 09:00:34,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.98 | bwd: 4650.36 | bwd_inner: 4645.17 | bwd_allreduce: 5.11 | step: 41.75
-  2%|▏         | 115/5800 [14:04<10:57:03,  6.93s/it]                                                     {'loss': 0.0801, 'grad_norm': 11.123169898986816, 'learning_rate': 2.6436781609195405e-05, 'epoch': 0.99}
-  2%|▏         | 115/5800 [14:04<10:57:03,  6.93s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:00:40,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 09:00:40,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 595.03 | bwd_microstep: 1252.79 | bwd_inner_microstep: 1248.26 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.09
-[2025-01-25 09:00:40,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 595.00 | bwd: 1252.81 | bwd_inner: 1248.26 | bwd_allreduce: 4.49 | step: 42.10
-  2%|▏         | 116/5800 [14:10<10:09:59,  6.44s/it]                                                     {'loss': 0.127, 'grad_norm': 21.769697189331055, 'learning_rate': 2.6666666666666667e-05, 'epoch': 1.0}
-  2%|▏         | 116/5800 [14:10<10:09:59,  6.44s/it][2025-01-25 09:00:44,943] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:00:56,010] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:01:06,586] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:01:17,870] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.6680],
-        [0.6797],
-        [0.6680],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4355, 0.5977, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1050, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:01:34,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 09:01:34,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.75 | bwd_microstep: 4601.23 | bwd_inner_microstep: 4596.05 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.49
-[2025-01-25 09:01:34,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.72 | bwd: 4601.25 | bwd_inner: 4596.05 | bwd_allreduce: 5.14 | step: 43.49
-  2%|▏         | 117/5800 [15:04<33:01:30, 20.92s/it]                                                     {'loss': 0.105, 'grad_norm': 21.642946243286133, 'learning_rate': 2.6896551724137935e-05, 'epoch': 1.01}
-  2%|▏         | 117/5800 [15:04<33:01:30, 20.92s/it]score1 tensor([[0.5039],
-        [0.5195],
-        [0.4883],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4512, 0.3750, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1187, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:01:41,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 09:01:41,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2126.90 | bwd_microstep: 4582.83 | bwd_inner_microstep: 4577.93 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.17
-[2025-01-25 09:01:41,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2126.86 | bwd: 4582.85 | bwd_inner: 4577.92 | bwd_allreduce: 4.87 | step: 44.17
-  2%|▏         | 118/5800 [15:11<26:20:54, 16.69s/it]                                                     {'loss': 0.1187, 'grad_norm': 20.653247833251953, 'learning_rate': 2.7126436781609197e-05, 'epoch': 1.02}
-  2%|▏         | 118/5800 [15:11<26:20:54, 16.69s/it]score1 tensor([[0.4121],
-        [0.5117],
-        [0.4707],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4473, 0.4023, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:01:48,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 09:01:48,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.63 | bwd_microstep: 4593.46 | bwd_inner_microstep: 4588.42 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.76
-[2025-01-25 09:01:48,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.60 | bwd: 4593.48 | bwd_inner: 4588.42 | bwd_allreduce: 5.00 | step: 42.77
-  2%|▏         | 119/5800 [15:18<21:40:58, 13.74s/it]                                                     {'loss': 0.0396, 'grad_norm': 11.244624137878418, 'learning_rate': 2.735632183908046e-05, 'epoch': 1.03}
-  2%|▏         | 119/5800 [15:18<21:40:58, 13.74s/it]score1 tensor([[0.5391],
-        [0.4590],
-        [0.3457],
-        [0.3164]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4844, 0.4453, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:01:55,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 09:01:55,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.70 | bwd_microstep: 4598.86 | bwd_inner_microstep: 4593.80 | bwd_allreduce_microstep: 4.98 | step_microstep: 41.40
-[2025-01-25 09:01:55,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.67 | bwd: 4598.88 | bwd_inner: 4593.80 | bwd_allreduce: 5.02 | step: 41.40
-  2%|▏         | 120/5800 [15:25<18:25:19, 11.68s/it]                                                     {'loss': 0.1094, 'grad_norm': 20.683595657348633, 'learning_rate': 2.7586206896551727e-05, 'epoch': 1.03}
-  2%|▏         | 120/5800 [15:25<18:25:19, 11.68s/it]score1 tensor([[0.3613],
-        [0.3730],
-        [0.3340],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5117, 0.5039, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:02,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 09:02:02,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.12 | bwd_microstep: 4610.46 | bwd_inner_microstep: 4605.21 | bwd_allreduce_microstep: 5.18 | step_microstep: 46.69
-[2025-01-25 09:02:02,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.09 | bwd: 4610.49 | bwd_inner: 4605.21 | bwd_allreduce: 5.21 | step: 46.69
-  2%|▏         | 121/5800 [15:32<16:09:04, 10.24s/it]                                                     {'loss': 0.1387, 'grad_norm': 20.235919952392578, 'learning_rate': 2.781609195402299e-05, 'epoch': 1.04}
-  2%|▏         | 121/5800 [15:32<16:09:04, 10.24s/it]score1 tensor([[0.2832],
-        [0.2695],
-        [0.3223],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4980, 0.4590, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2002, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:09,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 09:02:09,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.46 | bwd_microstep: 4605.75 | bwd_inner_microstep: 4600.72 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.65
-[2025-01-25 09:02:09,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.43 | bwd: 4605.77 | bwd_inner: 4600.72 | bwd_allreduce: 4.99 | step: 46.65
-  2%|▏         | 122/5800 [15:39<14:33:29,  9.23s/it]                                                     {'loss': 0.2002, 'grad_norm': 20.31935691833496, 'learning_rate': 2.8045977011494257e-05, 'epoch': 1.05}
-  2%|▏         | 122/5800 [15:39<14:33:29,  9.23s/it]score1 tensor([[0.3633],
-        [0.3320],
-        [0.4297],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4824, 0.6523, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:16,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 09:02:16,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.40 | bwd_microstep: 4610.33 | bwd_inner_microstep: 4605.61 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.30
-[2025-01-25 09:02:16,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.37 | bwd: 4610.35 | bwd_inner: 4605.61 | bwd_allreduce: 4.68 | step: 42.31
-  2%|▏         | 123/5800 [15:46<13:26:58,  8.53s/it]                                                     {'loss': 0.1289, 'grad_norm': 20.798580169677734, 'learning_rate': 2.8275862068965518e-05, 'epoch': 1.06}
-  2%|▏         | 123/5800 [15:46<13:26:58,  8.53s/it]score1 tensor([[0.3691],
-        [0.4766],
-        [0.3730],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6094, 0.4941, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1069, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:23,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.60 | optimizer_step: 4.36
-[2025-01-25 09:02:23,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.70 | bwd_microstep: 4607.07 | bwd_inner_microstep: 4602.23 | bwd_allreduce_microstep: 4.76 | step_microstep: 48.99
-[2025-01-25 09:02:23,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.67 | bwd: 4607.09 | bwd_inner: 4602.23 | bwd_allreduce: 4.80 | step: 49.00
-  2%|▏         | 124/5800 [15:53<12:40:22,  8.04s/it]                                                     {'loss': 0.1069, 'grad_norm': 20.73577880859375, 'learning_rate': 2.8505747126436783e-05, 'epoch': 1.07}
-  2%|▏         | 124/5800 [15:53<12:40:22,  8.04s/it]score1 tensor([[0.5195],
-        [0.4766],
-        [0.4863],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4980, 0.4727, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:29,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 09:02:29,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.50 | bwd_microstep: 4609.05 | bwd_inner_microstep: 4604.34 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.47
-[2025-01-25 09:02:29,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.47 | bwd: 4609.07 | bwd_inner: 4604.33 | bwd_allreduce: 4.67 | step: 42.48
-  2%|▏         | 125/5800 [15:59<12:07:32,  7.69s/it]                                                     {'loss': 0.0215, 'grad_norm': 2.987359046936035, 'learning_rate': 2.8735632183908045e-05, 'epoch': 1.08}
-  2%|▏         | 125/5800 [15:59<12:07:32,  7.69s/it]score1 tensor([[0.6562],
-        [0.4883],
-        [0.5430],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5703, 0.5039, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0972, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:36,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:02:36,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.73 | bwd_microstep: 4605.57 | bwd_inner_microstep: 4597.57 | bwd_allreduce_microstep: 7.92 | step_microstep: 41.11
-[2025-01-25 09:02:36,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.71 | bwd: 4605.59 | bwd_inner: 4597.57 | bwd_allreduce: 7.96 | step: 41.12
-  2%|▏         | 126/5800 [16:06<11:44:11,  7.45s/it]                                                     {'loss': 0.0972, 'grad_norm': 11.116546630859375, 'learning_rate': 2.8965517241379313e-05, 'epoch': 1.09}
-  2%|▏         | 126/5800 [16:06<11:44:11,  7.45s/it]score1 tensor([[0.7656],
-        [0.5820],
-        [0.7188],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4141, 0.5781, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1836, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:43,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 09:02:43,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.97 | bwd_microstep: 4610.39 | bwd_inner_microstep: 4605.31 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.35
-[2025-01-25 09:02:43,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.95 | bwd: 4610.42 | bwd_inner: 4605.31 | bwd_allreduce: 5.04 | step: 43.37
-  2%|▏         | 127/5800 [16:13<11:28:07,  7.28s/it]                                                     {'loss': 0.1836, 'grad_norm': 21.283830642700195, 'learning_rate': 2.919540229885058e-05, 'epoch': 1.09}
-  2%|▏         | 127/5800 [16:13<11:28:07,  7.28s/it]score1 tensor([[0.5820],
-        [0.6055],
-        [0.6562],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.4395, 0.4727, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1719, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:50,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.36
-[2025-01-25 09:02:50,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.35 | bwd_microstep: 4605.07 | bwd_inner_microstep: 4600.36 | bwd_allreduce_microstep: 4.65 | step_microstep: 49.85
-[2025-01-25 09:02:50,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.32 | bwd: 4605.10 | bwd_inner: 4600.36 | bwd_allreduce: 4.68 | step: 49.85
-  2%|▏         | 128/5800 [16:20<11:17:01,  7.16s/it]                                                     {'loss': 0.1719, 'grad_norm': 20.746315002441406, 'learning_rate': 2.942528735632184e-05, 'epoch': 1.1}
-  2%|▏         | 128/5800 [16:20<11:17:01,  7.16s/it]score1 tensor([[0.5508],
-        [0.5664],
-        [0.5117],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5273, 0.4941, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:02:57,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 09:02:57,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.05 | bwd_microstep: 4613.76 | bwd_inner_microstep: 4608.83 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.22
-[2025-01-25 09:02:57,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.00 | bwd: 4613.78 | bwd_inner: 4608.83 | bwd_allreduce: 4.88 | step: 42.23
-  2%|▏         | 129/5800 [16:27<11:09:11,  7.08s/it]                                                     {'loss': 0.0903, 'grad_norm': 19.756765365600586, 'learning_rate': 2.965517241379311e-05, 'epoch': 1.11}
-  2%|▏         | 129/5800 [16:27<11:09:11,  7.08s/it]score1 tensor([[0.4219],
-        [0.3418],
-        [0.4043],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.3086, 0.3516, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:04,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 09:03:04,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.53 | bwd_microstep: 4609.59 | bwd_inner_microstep: 4604.76 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.24
-[2025-01-25 09:03:04,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.50 | bwd: 4609.61 | bwd_inner: 4604.76 | bwd_allreduce: 4.78 | step: 42.24
-  2%|▏         | 130/5800 [16:34<11:03:36,  7.02s/it]                                                     {'loss': 0.0615, 'grad_norm': 10.192688941955566, 'learning_rate': 2.988505747126437e-05, 'epoch': 1.12}
-  2%|▏         | 130/5800 [16:34<11:03:36,  7.02s/it]score1 tensor([[0.4297],
-        [0.5312],
-        [0.4297],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5117, 0.5625, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:11,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 09:03:11,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2200.91 | bwd_microstep: 4645.37 | bwd_inner_microstep: 4640.33 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.13
-[2025-01-25 09:03:11,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2200.88 | bwd: 4645.40 | bwd_inner: 4640.33 | bwd_allreduce: 5.00 | step: 42.14
-  2%|▏         | 131/5800 [16:41<11:02:05,  7.01s/it]                                                     {'loss': 0.0537, 'grad_norm': 10.368681907653809, 'learning_rate': 3.0114942528735635e-05, 'epoch': 1.13}
-  2%|▏         | 131/5800 [16:41<11:02:05,  7.01s/it]score1 tensor([[0.4609],
-        [0.4043],
-        [0.3066],
-        [0.3223]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4863, 0.5625, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:18,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 09:03:18,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.58 | bwd_microstep: 4617.85 | bwd_inner_microstep: 4613.15 | bwd_allreduce_microstep: 4.61 | step_microstep: 47.00
-[2025-01-25 09:03:18,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.55 | bwd: 4617.87 | bwd_inner: 4613.15 | bwd_allreduce: 4.65 | step: 47.01
-  2%|▏         | 132/5800 [16:48<10:59:10,  6.98s/it]                                                     {'loss': 0.1523, 'grad_norm': 19.842735290527344, 'learning_rate': 3.0344827586206897e-05, 'epoch': 1.14}
-  2%|▏         | 132/5800 [16:48<10:59:10,  6.98s/it]score1 tensor([[0.4922],
-        [0.3633],
-        [0.3555],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6602, 0.4180, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:25,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 09:03:25,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.67 | bwd_microstep: 4617.83 | bwd_inner_microstep: 4611.55 | bwd_allreduce_microstep: 6.20 | step_microstep: 43.84
-[2025-01-25 09:03:25,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.64 | bwd: 4617.85 | bwd_inner: 4611.55 | bwd_allreduce: 6.24 | step: 43.86
-  2%|▏         | 133/5800 [16:55<10:57:06,  6.96s/it]                                                     {'loss': 0.1445, 'grad_norm': 20.099058151245117, 'learning_rate': 3.057471264367816e-05, 'epoch': 1.15}
-  2%|▏         | 133/5800 [16:55<10:57:06,  6.96s/it]score1 tensor([[0.3887],
-        [0.3887],
-        [0.3848],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5664, 0.5234, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1494, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:32,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.31 | optimizer_step: 4.37
-[2025-01-25 09:03:32,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.11 | bwd_microstep: 4620.21 | bwd_inner_microstep: 4615.51 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.54
-[2025-01-25 09:03:32,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.08 | bwd: 4620.23 | bwd_inner: 4615.51 | bwd_allreduce: 4.66 | step: 43.55
-  2%|▏         | 134/5800 [17:02<10:55:31,  6.94s/it]                                                     {'loss': 0.1494, 'grad_norm': 20.004350662231445, 'learning_rate': 3.080459770114942e-05, 'epoch': 1.16}
-  2%|▏         | 134/5800 [17:02<10:55:31,  6.94s/it]score1 tensor([[0.3516],
-        [0.3379],
-        [0.4609],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4160, 0.4629, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:38,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 09:03:38,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.32 | bwd_microstep: 4615.10 | bwd_inner_microstep: 4610.33 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.16
-[2025-01-25 09:03:38,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.29 | bwd: 4615.12 | bwd_inner: 4610.33 | bwd_allreduce: 4.72 | step: 42.18
-  2%|▏         | 135/5800 [17:08<10:54:06,  6.93s/it]                                                     {'loss': 0.0684, 'grad_norm': 19.561540603637695, 'learning_rate': 3.103448275862069e-05, 'epoch': 1.16}
-  2%|▏         | 135/5800 [17:08<10:54:06,  6.93s/it]score1 tensor([[0.5234],
-        [0.4453],
-        [0.4590],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4551, 0.6133, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1001, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:45,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 09:03:45,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.94 | bwd_microstep: 4621.74 | bwd_inner_microstep: 4617.17 | bwd_allreduce_microstep: 4.48 | step_microstep: 40.80
-[2025-01-25 09:03:45,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.90 | bwd: 4621.76 | bwd_inner: 4617.17 | bwd_allreduce: 4.52 | step: 40.81
-  2%|▏         | 136/5800 [17:15<10:53:10,  6.92s/it]                                                     {'loss': 0.1001, 'grad_norm': 2.5837457180023193, 'learning_rate': 3.126436781609196e-05, 'epoch': 1.17}
-  2%|▏         | 136/5800 [17:15<10:53:10,  6.92s/it]score1 tensor([[0.6484],
-        [0.5664],
-        [0.7070],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4473, 0.6133, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1631, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:52,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.37
-[2025-01-25 09:03:52,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.99 | bwd_microstep: 4616.78 | bwd_inner_microstep: 4611.96 | bwd_allreduce_microstep: 4.74 | step_microstep: 45.01
-[2025-01-25 09:03:52,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.97 | bwd: 4616.80 | bwd_inner: 4611.96 | bwd_allreduce: 4.78 | step: 45.01
-  2%|▏         | 137/5800 [17:22<10:52:48,  6.92s/it]                                                     {'loss': 0.1631, 'grad_norm': 20.492647171020508, 'learning_rate': 3.149425287356322e-05, 'epoch': 1.18}
-  2%|▏         | 137/5800 [17:22<10:52:48,  6.92s/it]score1 tensor([[0.7422],
-        [0.6953],
-        [0.5234],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4668, 0.4609, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1660, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:03:59,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 09:03:59,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.82 | bwd_microstep: 4614.83 | bwd_inner_microstep: 4609.78 | bwd_allreduce_microstep: 4.97 | step_microstep: 42.46
-[2025-01-25 09:03:59,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.79 | bwd: 4614.85 | bwd_inner: 4609.78 | bwd_allreduce: 5.00 | step: 42.47
-  2%|▏         | 138/5800 [17:29<10:52:14,  6.91s/it]                                                     {'loss': 0.166, 'grad_norm': 20.3112850189209, 'learning_rate': 3.172413793103448e-05, 'epoch': 1.19}
-  2%|▏         | 138/5800 [17:29<10:52:14,  6.91s/it]score1 tensor([[0.7344],
-        [0.7305],
-        [0.6484],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5391, 0.4922, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1934, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:06,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 09:04:06,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.45 | bwd_microstep: 4619.53 | bwd_inner_microstep: 4614.59 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.15
-[2025-01-25 09:04:06,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.42 | bwd: 4619.55 | bwd_inner: 4614.59 | bwd_allreduce: 4.90 | step: 45.16
-  2%|▏         | 139/5800 [17:36<10:51:48,  6.91s/it]                                                     {'loss': 0.1934, 'grad_norm': 20.340290069580078, 'learning_rate': 3.195402298850575e-05, 'epoch': 1.2}
-  2%|▏         | 139/5800 [17:36<10:51:48,  6.91s/it]score1 tensor([[0.5469],
-        [0.6562],
-        [0.6289],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5469, 0.5117, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0923, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:13,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 09:04:13,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.35 | bwd_microstep: 4620.51 | bwd_inner_microstep: 4615.67 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.66
-[2025-01-25 09:04:13,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.33 | bwd: 4620.54 | bwd_inner: 4615.67 | bwd_allreduce: 4.81 | step: 41.67
-  2%|▏         | 140/5800 [17:43<10:51:34,  6.91s/it]                                                     {'loss': 0.0923, 'grad_norm': 20.164052963256836, 'learning_rate': 3.218390804597701e-05, 'epoch': 1.21}
-  2%|▏         | 140/5800 [17:43<10:51:34,  6.91s/it]score1 tensor([[0.5000],
-        [0.6367],
-        [0.3418],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.6172, 0.3105, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:20,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:04:20,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.05 | bwd_microstep: 4615.08 | bwd_inner_microstep: 4610.37 | bwd_allreduce_microstep: 4.63 | step_microstep: 40.21
-[2025-01-25 09:04:20,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.02 | bwd: 4615.11 | bwd_inner: 4610.37 | bwd_allreduce: 4.67 | step: 40.21
-  2%|▏         | 141/5800 [17:50<10:51:11,  6.90s/it]                                                     {'loss': 0.0317, 'grad_norm': 9.574358940124512, 'learning_rate': 3.2413793103448275e-05, 'epoch': 1.22}
-  2%|▏         | 141/5800 [17:50<10:51:11,  6.90s/it]score1 tensor([[0.4844],
-        [0.3613],
-        [0.3535],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4512, 0.5039, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0767, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:27,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 09:04:27,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.50 | bwd_microstep: 4623.51 | bwd_inner_microstep: 4618.85 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.99
-[2025-01-25 09:04:27,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.48 | bwd: 4623.54 | bwd_inner: 4618.85 | bwd_allreduce: 4.62 | step: 42.00
-  2%|▏         | 142/5800 [17:57<10:51:08,  6.90s/it]                                                     {'loss': 0.0767, 'grad_norm': 19.028888702392578, 'learning_rate': 3.264367816091954e-05, 'epoch': 1.22}
-  2%|▏         | 142/5800 [17:57<10:51:08,  6.90s/it]score1 tensor([[0.5547],
-        [0.5039],
-        [0.2812],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6211, 0.4414, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:34,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 09:04:34,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.92 | bwd_microstep: 4619.72 | bwd_inner_microstep: 4615.14 | bwd_allreduce_microstep: 4.51 | step_microstep: 39.96
-[2025-01-25 09:04:34,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.89 | bwd: 4619.74 | bwd_inner: 4615.14 | bwd_allreduce: 4.54 | step: 39.97
-  2%|▏         | 143/5800 [18:04<10:50:48,  6.90s/it]                                                     {'loss': 0.1348, 'grad_norm': 19.78900146484375, 'learning_rate': 3.287356321839081e-05, 'epoch': 1.23}
-  2%|▏         | 143/5800 [18:04<10:50:48,  6.90s/it]score1 tensor([[0.3516],
-        [0.4395],
-        [0.5195],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4844, 0.5586, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:41,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.36
-[2025-01-25 09:04:41,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.13 | bwd_microstep: 4624.30 | bwd_inner_microstep: 4619.30 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.05
-[2025-01-25 09:04:41,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.10 | bwd: 4624.32 | bwd_inner: 4619.30 | bwd_allreduce: 4.96 | step: 43.06
-  2%|▏         | 144/5800 [18:11<10:50:50,  6.90s/it]                                                     {'loss': 0.0435, 'grad_norm': 19.21440887451172, 'learning_rate': 3.310344827586207e-05, 'epoch': 1.24}
-  2%|▏         | 144/5800 [18:11<10:50:50,  6.90s/it]score1 tensor([[0.2578],
-        [0.7070],
-        [0.8750],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6445, 0.6641, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:47,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 09:04:47,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.10 | bwd_microstep: 4617.08 | bwd_inner_microstep: 4611.86 | bwd_allreduce_microstep: 5.13 | step_microstep: 43.83
-[2025-01-25 09:04:47,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.07 | bwd: 4617.10 | bwd_inner: 4611.86 | bwd_allreduce: 5.18 | step: 43.83
-  2%|▎         | 145/5800 [18:17<10:50:33,  6.90s/it]                                                     {'loss': 0.1621, 'grad_norm': 5.419125080108643, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.25}
-  2%|▎         | 145/5800 [18:17<10:50:33,  6.90s/it]score1 tensor([[0.5312],
-        [0.4277],
-        [0.4941],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4883, 0.5781, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:04:54,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 09:04:54,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4621.73 | bwd_inner_microstep: 4616.88 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.22
-[2025-01-25 09:04:54,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.07 | bwd: 4621.75 | bwd_inner: 4616.88 | bwd_allreduce: 4.81 | step: 42.23
-  3%|▎         | 146/5800 [18:24<10:50:31,  6.90s/it]                                                     {'loss': 0.0557, 'grad_norm': 9.720562934875488, 'learning_rate': 3.3563218390804597e-05, 'epoch': 1.26}
-  3%|▎         | 146/5800 [18:24<10:50:31,  6.90s/it]score1 tensor([[0.5000],
-        [0.7656],
-        [0.5234],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6094, 0.6172, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0850, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:01,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.36
-[2025-01-25 09:05:01,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.15 | bwd_microstep: 4622.21 | bwd_inner_microstep: 4617.42 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.50
-[2025-01-25 09:05:01,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.12 | bwd: 4622.23 | bwd_inner: 4617.42 | bwd_allreduce: 4.74 | step: 43.50
-  3%|▎         | 147/5800 [18:31<10:50:35,  6.91s/it]                                                     {'loss': 0.085, 'grad_norm': 2.4199934005737305, 'learning_rate': 3.3793103448275865e-05, 'epoch': 1.27}
-  3%|▎         | 147/5800 [18:31<10:50:35,  6.91s/it]score1 tensor([[0.6562],
-        [0.6016],
-        [0.6875],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5508, 0.6953, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:08,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 09:05:08,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.85 | bwd_microstep: 4622.01 | bwd_inner_microstep: 4617.24 | bwd_allreduce_microstep: 4.70 | step_microstep: 40.97
-[2025-01-25 09:05:08,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.83 | bwd: 4622.04 | bwd_inner: 4617.24 | bwd_allreduce: 4.73 | step: 40.97
-  3%|▎         | 148/5800 [18:38<10:50:23,  6.90s/it]                                                     {'loss': 0.042, 'grad_norm': 2.704993486404419, 'learning_rate': 3.4022988505747127e-05, 'epoch': 1.28}
-  3%|▎         | 148/5800 [18:38<10:50:23,  6.90s/it]score1 tensor([[0.6641],
-        [0.5039],
-        [0.6914],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4043, 0.6367, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0869, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:15,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.38 | optimizer_step: 4.36
-[2025-01-25 09:05:15,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.10 | bwd_microstep: 4621.52 | bwd_inner_microstep: 4616.50 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.71
-[2025-01-25 09:05:15,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.08 | bwd: 4621.55 | bwd_inner: 4616.50 | bwd_allreduce: 4.97 | step: 43.72
-  3%|▎         | 149/5800 [18:45<10:50:16,  6.90s/it]                                                     {'loss': 0.0869, 'grad_norm': 19.524675369262695, 'learning_rate': 3.4252873563218395e-05, 'epoch': 1.28}
-  3%|▎         | 149/5800 [18:45<10:50:16,  6.90s/it]score1 tensor([[0.5703],
-        [0.6562],
-        [0.6875],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6328, 0.5508, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:22,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.07 | optimizer_step: 4.36
-[2025-01-25 09:05:22,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.75 | bwd_microstep: 4615.90 | bwd_inner_microstep: 4610.25 | bwd_allreduce_microstep: 5.57 | step_microstep: 42.67
-[2025-01-25 09:05:22,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.72 | bwd: 4615.93 | bwd_inner: 4610.25 | bwd_allreduce: 5.61 | step: 42.68
-  3%|▎         | 150/5800 [18:52<10:50:10,  6.90s/it]                                                     {'loss': 0.0615, 'grad_norm': 19.1571044921875, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.29}
-  3%|▎         | 150/5800 [18:52<10:50:10,  6.90s/it]score1 tensor([[0.7617],
-        [0.5000],
-        [0.6367],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3652, 0.6484, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1030, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:29,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.37
-[2025-01-25 09:05:29,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.38 | bwd_microstep: 4621.93 | bwd_inner_microstep: 4617.03 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.60
-[2025-01-25 09:05:29,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.35 | bwd: 4621.95 | bwd_inner: 4617.03 | bwd_allreduce: 4.86 | step: 43.61
-  3%|▎         | 151/5800 [18:59<10:50:28,  6.91s/it]                                                     {'loss': 0.103, 'grad_norm': 9.99596881866455, 'learning_rate': 3.4712643678160925e-05, 'epoch': 1.3}
-  3%|▎         | 151/5800 [18:59<10:50:28,  6.91s/it]score1 tensor([[0.4453],
-        [0.4922],
-        [0.5898],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5391, 0.6797, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:36,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 09:05:36,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.46 | bwd_microstep: 4626.61 | bwd_inner_microstep: 4621.61 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.45
-[2025-01-25 09:05:36,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.43 | bwd: 4626.63 | bwd_inner: 4621.61 | bwd_allreduce: 4.96 | step: 43.46
-  3%|▎         | 152/5800 [19:06<10:50:20,  6.91s/it]                                                     {'loss': 0.0513, 'grad_norm': 9.703055381774902, 'learning_rate': 3.4942528735632187e-05, 'epoch': 1.31}
-  3%|▎         | 152/5800 [19:06<10:50:20,  6.91s/it]score1 tensor([[0.4277],
-        [0.6133],
-        [0.4863],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.6133, 0.4336, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:43,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 09:05:43,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.80 | bwd_microstep: 4578.54 | bwd_inner_microstep: 4573.62 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.13
-[2025-01-25 09:05:43,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.76 | bwd: 4578.56 | bwd_inner: 4573.62 | bwd_allreduce: 4.88 | step: 43.14
-  3%|▎         | 153/5800 [19:13<10:48:56,  6.90s/it]                                                     {'loss': 0.0562, 'grad_norm': 5.0958356857299805, 'learning_rate': 3.517241379310345e-05, 'epoch': 1.32}
-  3%|▎         | 153/5800 [19:13<10:48:56,  6.90s/it]score1 tensor([[0.3906],
-        [0.4512],
-        [0.3691],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5352, 0.4141, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:50,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 09:05:50,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4635.19 | bwd_inner_microstep: 4630.27 | bwd_allreduce_microstep: 4.85 | step_microstep: 41.73
-[2025-01-25 09:05:50,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4635.22 | bwd_inner: 4630.27 | bwd_allreduce: 4.89 | step: 41.74
-  3%|▎         | 154/5800 [19:20<10:49:32,  6.90s/it]                                                     {'loss': 0.0688, 'grad_norm': 9.171738624572754, 'learning_rate': 3.540229885057472e-05, 'epoch': 1.33}
-  3%|▎         | 154/5800 [19:20<10:49:32,  6.90s/it]score1 tensor([[0.5117],
-        [0.4160],
-        [0.4316],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4688, 0.4746, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:05:57,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 09:05:57,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.28 | bwd_microstep: 4635.29 | bwd_inner_microstep: 4630.33 | bwd_allreduce_microstep: 4.89 | step_microstep: 41.09
-[2025-01-25 09:05:57,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.25 | bwd: 4635.31 | bwd_inner: 4630.33 | bwd_allreduce: 4.93 | step: 41.09
-  3%|▎         | 155/5800 [19:26<10:49:50,  6.91s/it]                                                     {'loss': 0.0449, 'grad_norm': 9.371855735778809, 'learning_rate': 3.563218390804598e-05, 'epoch': 1.34}
-  3%|▎         | 155/5800 [19:27<10:49:50,  6.91s/it]score1 tensor([[0.4492],
-        [0.4473],
-        [0.5273],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5391, 0.4668, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:03,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 09:06:03,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.98 | bwd_microstep: 4638.18 | bwd_inner_microstep: 4633.62 | bwd_allreduce_microstep: 4.48 | step_microstep: 40.30
-[2025-01-25 09:06:03,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.96 | bwd: 4638.20 | bwd_inner: 4633.62 | bwd_allreduce: 4.51 | step: 40.31
-  3%|▎         | 156/5800 [19:33<10:49:59,  6.91s/it]                                                     {'loss': 0.0801, 'grad_norm': 9.372556686401367, 'learning_rate': 3.586206896551725e-05, 'epoch': 1.34}
-  3%|▎         | 156/5800 [19:33<10:49:59,  6.91s/it]score1 tensor([[0.4023],
-        [0.6016],
-        [0.3691],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.6289, 0.5469, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1040, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:10,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.98 | optimizer_step: 4.36
-[2025-01-25 09:06:10,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.87 | bwd_microstep: 4639.79 | bwd_inner_microstep: 4635.16 | bwd_allreduce_microstep: 4.55 | step_microstep: 46.16
-[2025-01-25 09:06:10,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.83 | bwd: 4639.81 | bwd_inner: 4635.16 | bwd_allreduce: 4.58 | step: 46.17
-  3%|▎         | 157/5800 [19:40<10:50:19,  6.91s/it]                                                     {'loss': 0.104, 'grad_norm': 18.392696380615234, 'learning_rate': 3.609195402298851e-05, 'epoch': 1.35}
-  3%|▎         | 157/5800 [19:40<10:50:19,  6.91s/it]score1 tensor([[0.6953],
-        [0.5625],
-        [0.5469],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5781, 0.3750, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1069, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:17,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.37
-[2025-01-25 09:06:17,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4643.54 | bwd_inner_microstep: 4637.84 | bwd_allreduce_microstep: 5.62 | step_microstep: 43.33
-[2025-01-25 09:06:17,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4643.56 | bwd_inner: 4637.84 | bwd_allreduce: 5.66 | step: 43.34
-  3%|▎         | 158/5800 [19:47<10:50:33,  6.92s/it]                                                     {'loss': 0.1069, 'grad_norm': 9.587553977966309, 'learning_rate': 3.632183908045977e-05, 'epoch': 1.36}
-  3%|▎         | 158/5800 [19:47<10:50:33,  6.92s/it]score1 tensor([[0.7422],
-        [0.9219],
-        [0.7695],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6055, 0.4609, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2656, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:24,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 09:06:24,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.15 | bwd_microstep: 4635.04 | bwd_inner_microstep: 4629.95 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.14
-[2025-01-25 09:06:24,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.12 | bwd: 4635.06 | bwd_inner: 4629.95 | bwd_allreduce: 5.05 | step: 43.14
-  3%|▎         | 159/5800 [19:54<10:50:46,  6.92s/it]                                                     {'loss': 0.2656, 'grad_norm': 19.626699447631836, 'learning_rate': 3.655172413793104e-05, 'epoch': 1.37}
-  3%|▎         | 159/5800 [19:54<10:50:46,  6.92s/it]score1 tensor([[0.7539],
-        [0.3281],
-        [0.7031],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4434, 0.5352, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:31,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:06:31,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.82 | bwd_microstep: 4647.69 | bwd_inner_microstep: 4642.71 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.46
-[2025-01-25 09:06:31,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.76 | bwd: 4647.71 | bwd_inner: 4642.71 | bwd_allreduce: 4.94 | step: 43.47
-  3%|▎         | 160/5800 [20:01<10:51:45,  6.93s/it]                                                     {'loss': 0.1641, 'grad_norm': 10.431231498718262, 'learning_rate': 3.67816091954023e-05, 'epoch': 1.38}
-  3%|▎         | 160/5800 [20:01<10:51:45,  6.93s/it]score1 tensor([[0.2637],
-        [0.8125],
-        [0.6758],
-        [0.8516]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.1787, 0.5664, 0.4395, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2070, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:38,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 09:06:38,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.99 | bwd_microstep: 4636.15 | bwd_inner_microstep: 4631.05 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.43
-[2025-01-25 09:06:38,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.97 | bwd: 4636.18 | bwd_inner: 4631.05 | bwd_allreduce: 5.06 | step: 43.44
-  3%|▎         | 161/5800 [20:08<10:51:25,  6.93s/it]                                                     {'loss': 0.207, 'grad_norm': 19.637754440307617, 'learning_rate': 3.701149425287357e-05, 'epoch': 1.39}
-  3%|▎         | 161/5800 [20:08<10:51:25,  6.93s/it]score1 tensor([[0.8203],
-        [0.4863],
-        [0.6328],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4023, 0.6367, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1089, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:45,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 09:06:45,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.08 | bwd_microstep: 4634.28 | bwd_inner_microstep: 4629.56 | bwd_allreduce_microstep: 4.63 | step_microstep: 41.14
-[2025-01-25 09:06:45,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4634.31 | bwd_inner: 4629.56 | bwd_allreduce: 4.67 | step: 41.15
-  3%|▎         | 162/5800 [20:15<10:50:55,  6.93s/it]                                                     {'loss': 0.1089, 'grad_norm': 10.028595924377441, 'learning_rate': 3.724137931034483e-05, 'epoch': 1.4}
-  3%|▎         | 162/5800 [20:15<10:50:55,  6.93s/it]score1 tensor([[0.2871],
-        [0.4141],
-        [0.4805],
-        [0.2158]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5547, 0.4082, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:52,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 09:06:52,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.34 | bwd_microstep: 4644.91 | bwd_inner_microstep: 4638.32 | bwd_allreduce_microstep: 6.51 | step_microstep: 41.74
-[2025-01-25 09:06:52,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.31 | bwd: 4644.94 | bwd_inner: 4638.32 | bwd_allreduce: 6.55 | step: 41.75
-  3%|▎         | 163/5800 [20:22<10:50:44,  6.93s/it]                                                     {'loss': 0.1406, 'grad_norm': 9.150333404541016, 'learning_rate': 3.74712643678161e-05, 'epoch': 1.41}
-  3%|▎         | 163/5800 [20:22<10:50:44,  6.93s/it]score1 tensor([[0.2930],
-        [0.5586],
-        [0.4551],
-        [0.3145]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.6094, 0.4941, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0942, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:06:59,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.40 | optimizer_step: 4.36
-[2025-01-25 09:06:59,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4648.20 | bwd_inner_microstep: 4644.24 | bwd_allreduce_microstep: 3.90 | step_microstep: 32.14
-[2025-01-25 09:06:59,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.23 | bwd: 4648.22 | bwd_inner: 4644.24 | bwd_allreduce: 3.93 | step: 32.15
-  3%|▎         | 164/5800 [20:29<10:50:09,  6.92s/it]                                                     {'loss': 0.0942, 'grad_norm': 18.29046058654785, 'learning_rate': 3.770114942528736e-05, 'epoch': 1.41}
-  3%|▎         | 164/5800 [20:29<10:50:09,  6.92s/it]score1 tensor([[0.4336],
-        [0.4336],
-        [0.4062],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4199, 0.4277, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:06,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 09:07:06,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.19 | bwd_microstep: 4641.40 | bwd_inner_microstep: 4636.75 | bwd_allreduce_microstep: 4.58 | step_microstep: 40.93
-[2025-01-25 09:07:06,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.17 | bwd: 4641.43 | bwd_inner: 4636.75 | bwd_allreduce: 4.62 | step: 40.94
-  3%|▎         | 165/5800 [20:36<10:49:55,  6.92s/it]                                                     {'loss': 0.0581, 'grad_norm': 8.984949111938477, 'learning_rate': 3.793103448275862e-05, 'epoch': 1.42}
-  3%|▎         | 165/5800 [20:36<10:49:55,  6.92s/it]score1 tensor([[0.4316],
-        [0.4316],
-        [0.5898],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5508, 0.5664, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:13,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 09:07:13,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.41 | bwd_microstep: 4642.11 | bwd_inner_microstep: 4637.16 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.57
-[2025-01-25 09:07:13,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4642.13 | bwd_inner: 4637.16 | bwd_allreduce: 4.90 | step: 42.58
-  3%|▎         | 166/5800 [20:43<10:49:55,  6.92s/it]                                                     {'loss': 0.0762, 'grad_norm': 9.165533065795898, 'learning_rate': 3.816091954022988e-05, 'epoch': 1.43}
-  3%|▎         | 166/5800 [20:43<10:49:55,  6.92s/it]score1 tensor([[0.6641],
-        [0.6133],
-        [0.6445],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6016, 0.5156, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:20,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.43 | optimizer_step: 4.36
-[2025-01-25 09:07:20,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4643.22 | bwd_inner_microstep: 4638.27 | bwd_allreduce_microstep: 4.86 | step_microstep: 46.25
-[2025-01-25 09:07:20,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.03 | bwd: 4643.24 | bwd_inner: 4638.27 | bwd_allreduce: 4.90 | step: 46.25
-  3%|▎         | 167/5800 [20:50<10:50:20,  6.93s/it]                                                     {'loss': 0.0684, 'grad_norm': 18.93910026550293, 'learning_rate': 3.839080459770115e-05, 'epoch': 1.44}
-  3%|▎         | 167/5800 [20:50<10:50:20,  6.93s/it]score1 tensor([[0.5820],
-        [0.6250],
-        [0.4746],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5547, 0.3906, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:27,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 09:07:27,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.54 | bwd_microstep: 4641.98 | bwd_inner_microstep: 4636.98 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.05
-[2025-01-25 09:07:27,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4642.00 | bwd_inner: 4636.98 | bwd_allreduce: 4.95 | step: 41.05
-  3%|▎         | 168/5800 [20:57<10:50:10,  6.93s/it]                                                     {'loss': 0.0977, 'grad_norm': 18.586416244506836, 'learning_rate': 3.862068965517242e-05, 'epoch': 1.45}
-  3%|▎         | 168/5800 [20:57<10:50:10,  6.93s/it]score1 tensor([[0.6094],
-        [0.5977],
-        [0.5078],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.6211, 0.6289, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0806, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:33,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 09:07:33,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.34 | bwd_microstep: 4634.78 | bwd_inner_microstep: 4630.01 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.20
-[2025-01-25 09:07:33,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.32 | bwd: 4634.81 | bwd_inner: 4630.01 | bwd_allreduce: 4.73 | step: 43.21
-  3%|▎         | 169/5800 [21:03<10:49:56,  6.93s/it]                                                     {'loss': 0.0806, 'grad_norm': 9.511337280273438, 'learning_rate': 3.885057471264368e-05, 'epoch': 1.46}
-  3%|▎         | 169/5800 [21:03<10:49:56,  6.93s/it]score1 tensor([[0.7266],
-        [0.6641],
-        [0.6953],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4863, 0.5547, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:40,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 09:07:40,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4639.13 | bwd_inner_microstep: 4634.17 | bwd_allreduce_microstep: 4.88 | step_microstep: 41.59
-[2025-01-25 09:07:40,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4639.15 | bwd_inner: 4634.17 | bwd_allreduce: 4.91 | step: 41.61
-  3%|▎         | 170/5800 [21:10<10:49:47,  6.92s/it]                                                     {'loss': 0.1641, 'grad_norm': 18.850662231445312, 'learning_rate': 3.908045977011495e-05, 'epoch': 1.47}
-  3%|▎         | 170/5800 [21:10<10:49:47,  6.92s/it]score1 tensor([[0.6406],
-        [0.4805],
-        [0.5508],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.3477, 0.5352, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:47,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 09:07:47,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4648.91 | bwd_inner_microstep: 4644.00 | bwd_allreduce_microstep: 4.83 | step_microstep: 41.36
-[2025-01-25 09:07:47,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.56 | bwd: 4648.93 | bwd_inner: 4644.00 | bwd_allreduce: 4.87 | step: 41.36
-  3%|▎         | 171/5800 [21:17<10:50:03,  6.93s/it]                                                     {'loss': 0.1328, 'grad_norm': 18.535018920898438, 'learning_rate': 3.931034482758621e-05, 'epoch': 1.47}
-  3%|▎         | 171/5800 [21:17<10:50:03,  6.93s/it]score1 tensor([[0.5859],
-        [0.5547],
-        [0.5195],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4551, 0.3438, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:07:54,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.17 | optimizer_step: 4.36
-[2025-01-25 09:07:54,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.99 | bwd_microstep: 4636.05 | bwd_inner_microstep: 4631.06 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.79
-[2025-01-25 09:07:54,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.96 | bwd: 4636.07 | bwd_inner: 4631.06 | bwd_allreduce: 4.94 | step: 45.80
-  3%|▎         | 172/5800 [21:24<10:49:45,  6.93s/it]                                                     {'loss': 0.1133, 'grad_norm': 18.31667137145996, 'learning_rate': 3.954022988505747e-05, 'epoch': 1.48}
-  3%|▎         | 172/5800 [21:24<10:49:45,  6.93s/it]score1 tensor([[0.4922],
-        [0.4688],
-        [0.4883],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5742, 0.5664, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:01,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 09:08:01,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.15 | bwd_microstep: 4643.47 | bwd_inner_microstep: 4638.50 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.29
-[2025-01-25 09:08:01,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4643.49 | bwd_inner: 4638.50 | bwd_allreduce: 4.92 | step: 43.29
-  3%|▎         | 173/5800 [21:31<10:50:03,  6.93s/it]                                                     {'loss': 0.0557, 'grad_norm': 9.076985359191895, 'learning_rate': 3.9770114942528735e-05, 'epoch': 1.49}
-  3%|▎         | 173/5800 [21:31<10:50:03,  6.93s/it]score1 tensor([[0.3906],
-        [0.4082],
-        [0.3594],
-        [0.3340]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5195, 0.4707, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:08,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 09:08:08,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.91 | bwd_microstep: 4639.19 | bwd_inner_microstep: 4634.53 | bwd_allreduce_microstep: 4.58 | step_microstep: 40.72
-[2025-01-25 09:08:08,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.89 | bwd: 4639.21 | bwd_inner: 4634.53 | bwd_allreduce: 4.62 | step: 40.72
-  3%|▎         | 174/5800 [21:38<10:49:39,  6.93s/it]                                                     {'loss': 0.1328, 'grad_norm': 18.029296875, 'learning_rate': 4e-05, 'epoch': 1.5}
-  3%|▎         | 174/5800 [21:38<10:49:39,  6.93s/it]score1 tensor([[0.3262],
-        [0.3223],
-        [0.3672],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.3262, 0.4941, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0981, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:15,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 09:08:15,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.31 | bwd_microstep: 4645.41 | bwd_inner_microstep: 4639.50 | bwd_allreduce_microstep: 5.83 | step_microstep: 41.16
-[2025-01-25 09:08:15,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.28 | bwd: 4645.43 | bwd_inner: 4639.50 | bwd_allreduce: 5.87 | step: 41.17
-  3%|▎         | 175/5800 [21:45<10:49:31,  6.93s/it]                                                     {'loss': 0.0981, 'grad_norm': 17.793296813964844, 'learning_rate': 3.999999688182649e-05, 'epoch': 1.51}
-  3%|▎         | 175/5800 [21:45<10:49:31,  6.93s/it]score1 tensor([[0.4707],
-        [0.4238],
-        [0.5547],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.3457, 0.6055, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:22,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 09:08:22,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.75 | bwd_microstep: 4637.94 | bwd_inner_microstep: 4633.16 | bwd_allreduce_microstep: 4.70 | step_microstep: 46.92
-[2025-01-25 09:08:22,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.72 | bwd: 4637.97 | bwd_inner: 4633.16 | bwd_allreduce: 4.74 | step: 46.93
-  3%|▎         | 176/5800 [21:52<10:49:27,  6.93s/it]                                                     {'loss': 0.0581, 'grad_norm': 9.76066780090332, 'learning_rate': 3.999998752730692e-05, 'epoch': 1.52}
-  3%|▎         | 176/5800 [21:52<10:49:27,  6.93s/it]score1 tensor([[0.6797],
-        [0.5508],
-        [0.3496],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5273, 0.3223, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:29,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 09:08:29,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.74 | bwd_microstep: 4638.86 | bwd_inner_microstep: 4634.22 | bwd_allreduce_microstep: 4.57 | step_microstep: 40.49
-[2025-01-25 09:08:29,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.71 | bwd: 4638.88 | bwd_inner: 4634.22 | bwd_allreduce: 4.60 | step: 40.50
-  3%|▎         | 177/5800 [21:59<10:49:11,  6.93s/it]                                                     {'loss': 0.0522, 'grad_norm': 18.416967391967773, 'learning_rate': 3.9999971936444204e-05, 'epoch': 1.53}
-  3%|▎         | 177/5800 [21:59<10:49:11,  6.93s/it]score1 tensor([[0.5703],
-        [0.5234],
-        [0.4648],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.6484, 0.4648, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:36,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 09:08:36,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.95 | bwd_microstep: 4585.80 | bwd_inner_microstep: 4580.97 | bwd_allreduce_microstep: 4.74 | step_microstep: 45.49
-[2025-01-25 09:08:36,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4585.82 | bwd_inner: 4580.97 | bwd_allreduce: 4.79 | step: 45.49
-  3%|▎         | 178/5800 [22:06<10:47:20,  6.91s/it]                                                     {'loss': 0.0591, 'grad_norm': 4.967622756958008, 'learning_rate': 3.999995010924321e-05, 'epoch': 1.53}
-  3%|▎         | 178/5800 [22:06<10:47:20,  6.91s/it]score1 tensor([[0.6172],
-        [0.5586],
-        [0.5312],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5312, 0.5586, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:43,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 09:08:43,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.56 | bwd_microstep: 4634.35 | bwd_inner_microstep: 4629.67 | bwd_allreduce_microstep: 4.61 | step_microstep: 41.36
-[2025-01-25 09:08:43,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.53 | bwd: 4634.37 | bwd_inner: 4629.67 | bwd_allreduce: 4.64 | step: 41.36
-  3%|▎         | 179/5800 [22:13<10:47:36,  6.91s/it]                                                     {'loss': 0.0234, 'grad_norm': 5.0146684646606445, 'learning_rate': 3.999992204571074e-05, 'epoch': 1.54}
-  3%|▎         | 179/5800 [22:13<10:47:36,  6.91s/it]score1 tensor([[0.5195],
-        [0.6055],
-        [0.5625],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5508, 0.5547, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:50,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 09:08:50,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4641.73 | bwd_inner_microstep: 4637.03 | bwd_allreduce_microstep: 4.62 | step_microstep: 39.79
-[2025-01-25 09:08:50,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4641.75 | bwd_inner: 4637.03 | bwd_allreduce: 4.66 | step: 39.80
-  3%|▎         | 180/5800 [22:20<10:47:45,  6.92s/it]                                                     {'loss': 0.0771, 'grad_norm': 18.11168670654297, 'learning_rate': 3.9999887745855554e-05, 'epoch': 1.55}
-  3%|▎         | 180/5800 [22:20<10:47:45,  6.92s/it]score1 tensor([[0.4395],
-        [0.4922],
-        [0.4707],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5664, 0.5469, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:08:57,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.01 | optimizer_step: 4.37
-[2025-01-25 09:08:57,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.51 | bwd_microstep: 4640.95 | bwd_inner_microstep: 4636.20 | bwd_allreduce_microstep: 4.68 | step_microstep: 45.69
-[2025-01-25 09:08:57,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.48 | bwd: 4640.97 | bwd_inner: 4636.20 | bwd_allreduce: 4.72 | step: 45.70
-  3%|▎         | 181/5800 [22:27<10:48:07,  6.92s/it]                                                     {'loss': 0.0771, 'grad_norm': 17.896240234375, 'learning_rate': 3.999984720968834e-05, 'epoch': 1.56}
-  3%|▎         | 181/5800 [22:27<10:48:07,  6.92s/it]score1 tensor([[0.4180],
-        [0.4453],
-        [0.4844],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4707, 0.5898, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0854, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:03,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 09:09:03,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4643.33 | bwd_inner_microstep: 4638.61 | bwd_allreduce_microstep: 4.64 | step_microstep: 40.76
-[2025-01-25 09:09:03,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.09 | bwd: 4643.35 | bwd_inner: 4638.61 | bwd_allreduce: 4.67 | step: 40.76
-  3%|▎         | 182/5800 [22:33<10:48:04,  6.92s/it]                                                     {'loss': 0.0854, 'grad_norm': 9.008902549743652, 'learning_rate': 3.999980043722173e-05, 'epoch': 1.57}
-  3%|▎         | 182/5800 [22:33<10:48:04,  6.92s/it]score1 tensor([[0.4512],
-        [0.4844],
-        [0.5156],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4570, 0.5312, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:10,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 09:09:10,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.97 | bwd_microstep: 4592.57 | bwd_inner_microstep: 4587.81 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.21
-[2025-01-25 09:09:10,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.94 | bwd: 4592.58 | bwd_inner: 4587.81 | bwd_allreduce: 4.72 | step: 43.22
-  3%|▎         | 183/5800 [22:40<10:46:51,  6.91s/it]                                                     {'loss': 0.0493, 'grad_norm': 4.738223552703857, 'learning_rate': 3.999974742847032e-05, 'epoch': 1.58}
-  3%|▎         | 183/5800 [22:40<10:46:51,  6.91s/it]score1 tensor([[0.5742],
-        [0.5586],
-        [0.6641],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4727, 0.5508, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0981, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:17,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 09:09:17,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.93 | bwd_microstep: 4639.46 | bwd_inner_microstep: 4634.95 | bwd_allreduce_microstep: 4.43 | step_microstep: 42.77
-[2025-01-25 09:09:17,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.90 | bwd: 4639.48 | bwd_inner: 4634.95 | bwd_allreduce: 4.47 | step: 42.77
-  3%|▎         | 184/5800 [22:47<10:47:04,  6.91s/it]                                                     {'loss': 0.0981, 'grad_norm': 18.22058868408203, 'learning_rate': 3.999968818345064e-05, 'epoch': 1.59}
-  3%|▎         | 184/5800 [22:47<10:47:04,  6.91s/it]score1 tensor([[0.5547],
-        [0.6211],
-        [0.5625],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.5273, 0.4297, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:24,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.07 | optimizer_step: 4.37
-[2025-01-25 09:09:24,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.76 | bwd_microstep: 4634.52 | bwd_inner_microstep: 4629.75 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.98
-[2025-01-25 09:09:24,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.73 | bwd: 4634.55 | bwd_inner: 4629.75 | bwd_allreduce: 4.72 | step: 41.99
-  3%|▎         | 185/5800 [22:54<10:47:30,  6.92s/it]                                                     {'loss': 0.1406, 'grad_norm': 18.108112335205078, 'learning_rate': 3.999962270218116e-05, 'epoch': 1.59}
-  3%|▎         | 185/5800 [22:54<10:47:30,  6.92s/it]score1 tensor([[0.6289],
-        [0.6914],
-        [0.6133],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6445, 0.4922, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:31,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 09:09:31,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.39 | bwd_microstep: 4636.12 | bwd_inner_microstep: 4631.16 | bwd_allreduce_microstep: 4.88 | step_microstep: 46.88
-[2025-01-25 09:09:31,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.36 | bwd: 4636.15 | bwd_inner: 4631.16 | bwd_allreduce: 4.92 | step: 46.88
-  3%|▎         | 186/5800 [23:01<10:47:43,  6.92s/it]                                                     {'loss': 0.0605, 'grad_norm': 18.447551727294922, 'learning_rate': 3.999955098468229e-05, 'epoch': 1.6}
-  3%|▎         | 186/5800 [23:01<10:47:43,  6.92s/it]score1 tensor([[0.5625],
-        [0.3477],
-        [0.5039],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4258, 0.4785, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:38,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 09:09:38,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.00 | bwd_microstep: 4640.52 | bwd_inner_microstep: 4635.87 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.17
-[2025-01-25 09:09:38,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.97 | bwd: 4640.54 | bwd_inner: 4635.87 | bwd_allreduce: 4.61 | step: 41.18
-  3%|▎         | 187/5800 [23:08<10:47:36,  6.92s/it]                                                     {'loss': 0.0522, 'grad_norm': 9.368250846862793, 'learning_rate': 3.999947303097641e-05, 'epoch': 1.61}
-  3%|▎         | 187/5800 [23:08<10:47:36,  6.92s/it]score1 tensor([[0.4434],
-        [0.3574],
-        [0.4219],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.3730, 0.5391, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:45,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 09:09:45,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.99 | bwd_microstep: 4632.15 | bwd_inner_microstep: 4627.52 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.51
-[2025-01-25 09:09:45,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.96 | bwd: 4632.17 | bwd_inner: 4627.52 | bwd_allreduce: 4.59 | step: 41.52
-  3%|▎         | 188/5800 [23:15<10:47:28,  6.92s/it]                                                     {'loss': 0.0698, 'grad_norm': 17.515649795532227, 'learning_rate': 3.9999388841087815e-05, 'epoch': 1.62}
-  3%|▎         | 188/5800 [23:15<10:47:28,  6.92s/it]score1 tensor([[0.2949],
-        [0.2598],
-        [0.3477],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.4238, 0.4941, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:52,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 09:09:52,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.64 | bwd_microstep: 4634.32 | bwd_inner_microstep: 4629.40 | bwd_allreduce_microstep: 4.85 | step_microstep: 40.31
-[2025-01-25 09:09:52,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.61 | bwd: 4634.34 | bwd_inner: 4629.40 | bwd_allreduce: 4.88 | step: 40.32
-  3%|▎         | 189/5800 [23:22<10:47:10,  6.92s/it]                                                     {'loss': 0.1465, 'grad_norm': 17.605676651000977, 'learning_rate': 3.999929841504276e-05, 'epoch': 1.63}
-  3%|▎         | 189/5800 [23:22<10:47:10,  6.92s/it]score1 tensor([[0.3398],
-        [0.3809],
-        [0.3711],
-        [0.2852]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5156, 0.4512, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:09:59,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 09:09:59,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.02 | bwd_microstep: 4640.96 | bwd_inner_microstep: 4636.32 | bwd_allreduce_microstep: 4.57 | step_microstep: 40.44
-[2025-01-25 09:09:59,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4640.97 | bwd_inner: 4636.32 | bwd_allreduce: 4.60 | step: 40.45
-  3%|▎         | 190/5800 [23:29<10:46:58,  6.92s/it]                                                     {'loss': 0.1523, 'grad_norm': 17.261281967163086, 'learning_rate': 3.999920175286944e-05, 'epoch': 1.64}
-  3%|▎         | 190/5800 [23:29<10:46:58,  6.92s/it]score1 tensor([[0.3438],
-        [0.3984],
-        [0.4492],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4863, 0.5469, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1089, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:06,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 09:10:06,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.19 | bwd_microstep: 4630.25 | bwd_inner_microstep: 4625.49 | bwd_allreduce_microstep: 4.69 | step_microstep: 40.42
-[2025-01-25 09:10:06,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.16 | bwd: 4630.27 | bwd_inner: 4625.49 | bwd_allreduce: 4.72 | step: 40.43
-  3%|▎         | 191/5800 [23:36<10:46:33,  6.92s/it]                                                     {'loss': 0.1089, 'grad_norm': 17.27553939819336, 'learning_rate': 3.9999098854598e-05, 'epoch': 1.65}
-  3%|▎         | 191/5800 [23:36<10:46:33,  6.92s/it]score1 tensor([[0.4902],
-        [0.5430],
-        [0.5859],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5742, 0.5078, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:13,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.36
-[2025-01-25 09:10:13,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.84 | bwd_microstep: 4640.94 | bwd_inner_microstep: 4636.09 | bwd_allreduce_microstep: 4.77 | step_microstep: 40.93
-[2025-01-25 09:10:13,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.81 | bwd: 4640.96 | bwd_inner: 4636.09 | bwd_allreduce: 4.81 | step: 40.94
-  3%|▎         | 192/5800 [23:43<10:46:43,  6.92s/it]                                                     {'loss': 0.042, 'grad_norm': 1.627022624015808, 'learning_rate': 3.999898972026052e-05, 'epoch': 1.66}
-  3%|▎         | 192/5800 [23:43<10:46:43,  6.92s/it]score1 tensor([[0.4941],
-        [0.5000],
-        [0.6211],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4473, 0.5625, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:20,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.85 | optimizer_step: 4.37
-[2025-01-25 09:10:20,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.70 | bwd_microstep: 4650.80 | bwd_inner_microstep: 4645.91 | bwd_allreduce_microstep: 4.82 | step_microstep: 45.67
-[2025-01-25 09:10:20,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.67 | bwd: 4650.82 | bwd_inner: 4645.91 | bwd_allreduce: 4.85 | step: 45.68
-  3%|▎         | 193/5800 [23:50<10:47:20,  6.93s/it]                                                     {'loss': 0.0459, 'grad_norm': 18.057823181152344, 'learning_rate': 3.999887434989103e-05, 'epoch': 1.66}
-  3%|▎         | 193/5800 [23:50<10:47:20,  6.93s/it]score1 tensor([[0.6523],
-        [0.6914],
-        [0.5938],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6562, 0.5195, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1001, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:27,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:10:27,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.99 | bwd_microstep: 4638.00 | bwd_inner_microstep: 4633.11 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.38
-[2025-01-25 09:10:27,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.96 | bwd: 4638.02 | bwd_inner: 4633.11 | bwd_allreduce: 4.84 | step: 41.39
-  3%|▎         | 194/5800 [23:57<10:47:17,  6.93s/it]                                                     {'loss': 0.1001, 'grad_norm': 18.39923667907715, 'learning_rate': 3.999875274352551e-05, 'epoch': 1.67}
-  3%|▎         | 194/5800 [23:57<10:47:17,  6.93s/it]score1 tensor([[0.6133],
-        [0.4844],
-        [0.4844],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4688, 0.4121, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0913, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:33,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:10:33,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.86 | bwd_microstep: 4633.43 | bwd_inner_microstep: 4628.75 | bwd_allreduce_microstep: 4.59 | step_microstep: 40.54
-[2025-01-25 09:10:33,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.83 | bwd: 4633.45 | bwd_inner: 4628.75 | bwd_allreduce: 4.63 | step: 40.55
-  3%|▎         | 195/5800 [24:03<10:46:51,  6.92s/it]                                                     {'loss': 0.0913, 'grad_norm': 17.97284507751465, 'learning_rate': 3.9998624901201876e-05, 'epoch': 1.68}
-  3%|▎         | 195/5800 [24:03<10:46:51,  6.92s/it]score1 tensor([[0.5000],
-        [0.5469],
-        [0.5781],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6875, 0.5625, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0830, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:40,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 09:10:40,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.58 | bwd_microstep: 4643.62 | bwd_inner_microstep: 4638.84 | bwd_allreduce_microstep: 4.67 | step_microstep: 41.65
-[2025-01-25 09:10:40,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.56 | bwd: 4643.64 | bwd_inner: 4638.84 | bwd_allreduce: 4.74 | step: 41.66
-  3%|▎         | 196/5800 [24:10<10:46:51,  6.93s/it]                                                     {'loss': 0.083, 'grad_norm': 9.181872367858887, 'learning_rate': 3.999849082295999e-05, 'epoch': 1.69}
-  3%|▎         | 196/5800 [24:10<10:46:51,  6.93s/it]score1 tensor([[0.4941],
-        [0.5039],
-        [0.6016],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.5508, 0.5469, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:47,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 09:10:47,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.40 | bwd_microstep: 4647.25 | bwd_inner_microstep: 4642.17 | bwd_allreduce_microstep: 4.98 | step_microstep: 41.78
-[2025-01-25 09:10:47,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4647.27 | bwd_inner: 4642.18 | bwd_allreduce: 5.02 | step: 41.79
-  3%|▎         | 197/5800 [24:17<10:46:56,  6.93s/it]                                                     {'loss': 0.0566, 'grad_norm': 1.7188639640808105, 'learning_rate': 3.999835050884166e-05, 'epoch': 1.7}
-  3%|▎         | 197/5800 [24:17<10:46:56,  6.93s/it]score1 tensor([[0.5781],
-        [0.5195],
-        [0.4883],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5625, 0.6562, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:10:54,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:10:54,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4645.63 | bwd_inner_microstep: 4640.80 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.19
-[2025-01-25 09:10:54,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4645.65 | bwd_inner: 4640.79 | bwd_allreduce: 4.78 | step: 42.19
-  3%|▎         | 198/5800 [24:24<10:46:47,  6.93s/it]                                                     {'loss': 0.0918, 'grad_norm': 18.48040199279785, 'learning_rate': 3.999820395889065e-05, 'epoch': 1.71}
-  3%|▎         | 198/5800 [24:24<10:46:47,  6.93s/it]score1 tensor([[0.4609],
-        [0.4805],
-        [0.4375],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5625, 0.3691, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:01,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.38 | optimizer_step: 4.36
-[2025-01-25 09:11:01,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4641.27 | bwd_inner_microstep: 4636.44 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.47
-[2025-01-25 09:11:01,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.97 | bwd: 4641.29 | bwd_inner: 4636.44 | bwd_allreduce: 4.80 | step: 43.48
-  3%|▎         | 199/5800 [24:31<10:46:39,  6.93s/it]                                                     {'loss': 0.0474, 'grad_norm': 9.317967414855957, 'learning_rate': 3.9998051173152635e-05, 'epoch': 1.72}
-  3%|▎         | 199/5800 [24:31<10:46:39,  6.93s/it]score1 tensor([[0.6055],
-        [0.5859],
-        [0.4824],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4961, 0.2812, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:08,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 09:11:08,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.22 | bwd_microstep: 4637.00 | bwd_inner_microstep: 4632.56 | bwd_allreduce_microstep: 4.37 | step_microstep: 43.51
-[2025-01-25 09:11:08,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.19 | bwd: 4637.02 | bwd_inner: 4632.55 | bwd_allreduce: 4.40 | step: 43.53
-  3%|▎         | 200/5800 [24:38<10:46:29,  6.93s/it]                                                     {'loss': 0.1523, 'grad_norm': 18.26628875732422, 'learning_rate': 3.999789215167527e-05, 'epoch': 1.72}
-  3%|▎         | 200/5800 [24:38<10:46:29,  6.93s/it]score1 tensor([[0.5820],
-        [0.6211],
-        [0.6016],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4551, 0.4844, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0933, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:15,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:11:15,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4583.89 | bwd_inner_microstep: 4579.14 | bwd_allreduce_microstep: 4.67 | step_microstep: 39.98
-[2025-01-25 09:11:15,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.10 | bwd: 4583.91 | bwd_inner: 4579.14 | bwd_allreduce: 4.70 | step: 39.99
-  3%|▎         | 201/5800 [24:45<10:44:44,  6.91s/it]                                                     {'loss': 0.0933, 'grad_norm': 13.90648078918457, 'learning_rate': 3.9997726894508144e-05, 'epoch': 1.73}
-  3%|▎         | 201/5800 [24:45<10:44:44,  6.91s/it]score1 tensor([[0.6641],
-        [0.5781],
-        [0.6406],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5078, 0.5820, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:22,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 09:11:22,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.76 | bwd_microstep: 4636.37 | bwd_inner_microstep: 4631.92 | bwd_allreduce_microstep: 4.40 | step_microstep: 37.03
-[2025-01-25 09:11:22,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.73 | bwd: 4636.41 | bwd_inner: 4631.92 | bwd_allreduce: 4.42 | step: 37.04
-  3%|▎         | 202/5800 [24:52<10:44:48,  6.91s/it]                                                     {'loss': 0.061, 'grad_norm': 18.547761917114258, 'learning_rate': 3.9997555401702775e-05, 'epoch': 1.74}
-  3%|▎         | 202/5800 [24:52<10:44:48,  6.91s/it]score1 tensor([[0.4883],
-        [0.4434],
-        [0.5508],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4473, 0.5039, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:29,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 09:11:29,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.93 | bwd_microstep: 4645.15 | bwd_inner_microstep: 4640.48 | bwd_allreduce_microstep: 4.59 | step_microstep: 41.62
-[2025-01-25 09:11:29,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4645.17 | bwd_inner: 4640.48 | bwd_allreduce: 4.63 | step: 41.62
-  4%|▎         | 203/5800 [24:59<10:45:12,  6.92s/it]                                                     {'loss': 0.0342, 'grad_norm': 1.9807273149490356, 'learning_rate': 3.9997377673312644e-05, 'epoch': 1.75}
-  4%|▎         | 203/5800 [24:59<10:45:12,  6.92s/it]score1 tensor([[0.5508],
-        [0.5352],
-        [0.4102],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4746, 0.3750, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:36,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 09:11:36,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4635.96 | bwd_inner_microstep: 4630.74 | bwd_allreduce_microstep: 5.15 | step_microstep: 42.38
-[2025-01-25 09:11:36,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.09 | bwd: 4635.99 | bwd_inner: 4630.73 | bwd_allreduce: 5.19 | step: 42.38
-  4%|▎         | 204/5800 [25:06<10:45:08,  6.92s/it]                                                     {'loss': 0.0513, 'grad_norm': 1.8013193607330322, 'learning_rate': 3.9997193709393175e-05, 'epoch': 1.76}
-  4%|▎         | 204/5800 [25:06<10:45:08,  6.92s/it]score1 tensor([[0.3652],
-        [0.4648],
-        [0.4668],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.5156, 0.3789, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:43,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:11:43,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.19 | bwd_microstep: 4644.71 | bwd_inner_microstep: 4639.96 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.58
-[2025-01-25 09:11:43,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.16 | bwd: 4644.73 | bwd_inner: 4639.96 | bwd_allreduce: 4.71 | step: 42.59
-  4%|▎         | 205/5800 [25:13<10:45:25,  6.92s/it]                                                     {'loss': 0.0542, 'grad_norm': 9.325339317321777, 'learning_rate': 3.999700351000172e-05, 'epoch': 1.77}
-  4%|▎         | 205/5800 [25:13<10:45:25,  6.92s/it]score1 tensor([[0.3574],
-        [0.2949],
-        [0.3984],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4766, 0.4277, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0972, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:50,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 09:11:50,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.40 | bwd_microstep: 4635.57 | bwd_inner_microstep: 4630.80 | bwd_allreduce_microstep: 4.70 | step_microstep: 41.26
-[2025-01-25 09:11:50,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.37 | bwd: 4635.59 | bwd_inner: 4630.80 | bwd_allreduce: 4.73 | step: 41.28
-  4%|▎         | 206/5800 [25:20<10:46:01,  6.93s/it]                                                     {'loss': 0.0972, 'grad_norm': 17.87169647216797, 'learning_rate': 3.99968070751976e-05, 'epoch': 1.78}
-  4%|▎         | 206/5800 [25:20<10:46:01,  6.93s/it]score1 tensor([[0.5312],
-        [0.4023],
-        [0.5898],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4961, 0.4785, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:11:57,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 09:11:57,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.26 | bwd_microstep: 4635.58 | bwd_inner_microstep: 4630.81 | bwd_allreduce_microstep: 4.69 | step_microstep: 45.00
-[2025-01-25 09:11:57,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.23 | bwd: 4635.60 | bwd_inner: 4630.81 | bwd_allreduce: 4.72 | step: 45.00
-  4%|▎         | 207/5800 [25:27<10:45:40,  6.93s/it]                                                     {'loss': 0.0796, 'grad_norm': 9.529786109924316, 'learning_rate': 3.9996604405042055e-05, 'epoch': 1.78}
-  4%|▎         | 207/5800 [25:27<10:45:40,  6.93s/it]score1 tensor([[0.4824],
-        [0.4824],
-        [0.3418],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4492, 0.3789, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:03,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 09:12:03,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.44 | bwd_microstep: 4636.01 | bwd_inner_microstep: 4631.25 | bwd_allreduce_microstep: 4.68 | step_microstep: 40.72
-[2025-01-25 09:12:03,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.42 | bwd: 4636.03 | bwd_inner: 4631.25 | bwd_allreduce: 4.72 | step: 40.72
-  4%|▎         | 208/5800 [25:33<10:45:11,  6.92s/it]                                                     {'loss': 0.0464, 'grad_norm': 9.175821304321289, 'learning_rate': 3.999639549959828e-05, 'epoch': 1.79}
-  4%|▎         | 208/5800 [25:33<10:45:11,  6.92s/it]score1 tensor([[0.6211],
-        [0.5586],
-        [0.5859],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.5742, 0.6055, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:10,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 09:12:10,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.55 | bwd_microstep: 4635.21 | bwd_inner_microstep: 4630.69 | bwd_allreduce_microstep: 4.46 | step_microstep: 39.98
-[2025-01-25 09:12:10,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.53 | bwd: 4635.24 | bwd_inner: 4630.69 | bwd_allreduce: 4.49 | step: 39.98
-  4%|▎         | 209/5800 [25:40<10:45:14,  6.92s/it]                                                     {'loss': 0.04, 'grad_norm': 9.230587005615234, 'learning_rate': 3.999618035893143e-05, 'epoch': 1.8}
-  4%|▎         | 209/5800 [25:40<10:45:14,  6.92s/it]score1 tensor([[0.5586],
-        [0.3867],
-        [0.7500],
-        [0.7188]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4180, 0.6094, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:17,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 09:12:17,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.04 | bwd_microstep: 4636.20 | bwd_inner_microstep: 4631.72 | bwd_allreduce_microstep: 4.41 | step_microstep: 40.00
-[2025-01-25 09:12:17,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4636.22 | bwd_inner: 4631.72 | bwd_allreduce: 4.44 | step: 40.01
-  4%|▎         | 210/5800 [25:47<10:44:46,  6.92s/it]                                                     {'loss': 0.1367, 'grad_norm': 10.167933464050293, 'learning_rate': 3.999595898310857e-05, 'epoch': 1.81}
-  4%|▎         | 210/5800 [25:47<10:44:46,  6.92s/it]score1 tensor([[0.5664],
-        [0.7773],
-        [0.5547],
-        [0.7031]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6406, 0.4082, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:24,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.43 | optimizer_step: 4.37
-[2025-01-25 09:12:24,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.45 | bwd_microstep: 4583.50 | bwd_inner_microstep: 4580.05 | bwd_allreduce_microstep: 3.40 | step_microstep: 32.00
-[2025-01-25 09:12:24,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.42 | bwd: 4583.52 | bwd_inner: 4580.05 | bwd_allreduce: 3.42 | step: 32.00
-  4%|▎         | 211/5800 [25:54<10:42:07,  6.89s/it]                                                     {'loss': 0.1367, 'grad_norm': 14.509153366088867, 'learning_rate': 3.999573137219875e-05, 'epoch': 1.82}
-  4%|▎         | 211/5800 [25:54<10:42:07,  6.89s/it]score1 tensor([[0.6172],
-        [0.4824],
-        [0.4570],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5156, 0.3887, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:31,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.27 | optimizer_step: 4.36
-[2025-01-25 09:12:31,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.67 | bwd_microstep: 4641.75 | bwd_inner_microstep: 4636.85 | bwd_allreduce_microstep: 4.81 | step_microstep: 45.00
-[2025-01-25 09:12:31,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.65 | bwd: 4641.77 | bwd_inner: 4636.85 | bwd_allreduce: 4.85 | step: 45.00
-  4%|▎         | 212/5800 [26:01<10:42:49,  6.90s/it]                                                     {'loss': 0.0679, 'grad_norm': 9.59367847442627, 'learning_rate': 3.9995497526272926e-05, 'epoch': 1.83}
-  4%|▎         | 212/5800 [26:01<10:42:49,  6.90s/it]score1 tensor([[0.6797],
-        [0.3477],
-        [0.4453],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.3340, 0.5430, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0889, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:38,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 09:12:38,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.05 | bwd_microstep: 4643.54 | bwd_inner_microstep: 4636.33 | bwd_allreduce_microstep: 7.14 | step_microstep: 45.51
-[2025-01-25 09:12:38,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.02 | bwd: 4643.56 | bwd_inner: 4636.33 | bwd_allreduce: 7.18 | step: 45.52
-  4%|▎         | 213/5800 [26:08<10:43:53,  6.91s/it]                                                     {'loss': 0.0889, 'grad_norm': 9.833358764648438, 'learning_rate': 3.9995257445404024e-05, 'epoch': 1.84}
-  4%|▎         | 213/5800 [26:08<10:43:53,  6.91s/it]score1 tensor([[0.6289],
-        [0.4512],
-        [0.3262],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4648, 0.4180, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:45,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 09:12:45,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.45 | bwd_microstep: 4598.71 | bwd_inner_microstep: 4593.92 | bwd_allreduce_microstep: 4.70 | step_microstep: 54.58
-[2025-01-25 09:12:45,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4598.75 | bwd_inner: 4593.92 | bwd_allreduce: 4.74 | step: 54.61
-  4%|▎         | 214/5800 [26:15<10:43:32,  6.91s/it]                                                     {'loss': 0.0391, 'grad_norm': 4.797937393188477, 'learning_rate': 3.99950111296669e-05, 'epoch': 1.84}
-  4%|▎         | 214/5800 [26:15<10:43:32,  6.91s/it]score1 tensor([[0.3184],
-        [0.2354],
-        [0.5625],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4453, 0.4980, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1338, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:52,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.95 | optimizer_step: 4.36
-[2025-01-25 09:12:52,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4637.44 | bwd_inner_microstep: 4630.97 | bwd_allreduce_microstep: 6.33 | step_microstep: 54.46
-[2025-01-25 09:12:52,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.27 | bwd: 4637.49 | bwd_inner: 4630.97 | bwd_allreduce: 6.39 | step: 54.48
-  4%|▎         | 215/5800 [26:22<10:44:14,  6.92s/it]                                                     {'loss': 0.1338, 'grad_norm': 9.070660591125488, 'learning_rate': 3.9994758579138365e-05, 'epoch': 1.85}
-  4%|▎         | 215/5800 [26:22<10:44:14,  6.92s/it]score1 tensor([[0.5430],
-        [0.4805],
-        [0.3359],
-        [0.2676]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4453, 0.4980, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1006, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:12:59,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.36
-[2025-01-25 09:12:59,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.40 | bwd_microstep: 4640.40 | bwd_inner_microstep: 4636.48 | bwd_allreduce_microstep: 3.85 | step_microstep: 31.73
-[2025-01-25 09:12:59,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.34 | bwd: 4640.42 | bwd_inner: 4636.48 | bwd_allreduce: 3.88 | step: 31.73
-  4%|▎         | 216/5800 [26:29<10:43:38,  6.92s/it]                                                     {'loss': 0.1006, 'grad_norm': 2.7203400135040283, 'learning_rate': 3.999449979389716e-05, 'epoch': 1.86}
-  4%|▎         | 216/5800 [26:29<10:43:38,  6.92s/it]score1 tensor([[0.2520],
-        [0.5039],
-        [0.4805],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4707, 0.5508, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0811, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:06,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 09:13:06,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.27 | bwd_microstep: 4648.51 | bwd_inner_microstep: 4643.78 | bwd_allreduce_microstep: 4.64 | step_microstep: 40.77
-[2025-01-25 09:13:06,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.27 | bwd: 4648.53 | bwd_inner: 4643.78 | bwd_allreduce: 4.67 | step: 40.77
-  4%|▎         | 217/5800 [26:36<10:43:42,  6.92s/it]                                                     {'loss': 0.0811, 'grad_norm': 2.3855974674224854, 'learning_rate': 3.999423477402399e-05, 'epoch': 1.87}
-  4%|▎         | 217/5800 [26:36<10:43:42,  6.92s/it]score1 tensor([[0.5586],
-        [0.3125],
-        [0.3867],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.3398, 0.4590, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:13,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.26 | optimizer_step: 4.36
-[2025-01-25 09:13:13,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.00 | bwd_microstep: 4646.46 | bwd_inner_microstep: 4640.03 | bwd_allreduce_microstep: 6.34 | step_microstep: 43.30
-[2025-01-25 09:13:13,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.97 | bwd: 4646.49 | bwd_inner: 4640.03 | bwd_allreduce: 6.38 | step: 43.31
-  4%|▍         | 218/5800 [26:43<10:43:58,  6.92s/it]                                                     {'loss': 0.0693, 'grad_norm': 8.64902400970459, 'learning_rate': 3.999396351960148e-05, 'epoch': 1.88}
-  4%|▍         | 218/5800 [26:43<10:43:58,  6.92s/it]score1 tensor([[0.4805],
-        [0.4375],
-        [0.4297],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5195, 0.4570, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:20,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 09:13:20,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.38 | bwd_microstep: 4646.68 | bwd_inner_microstep: 4642.10 | bwd_allreduce_microstep: 4.51 | step_microstep: 42.62
-[2025-01-25 09:13:20,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.35 | bwd: 4646.70 | bwd_inner: 4642.10 | bwd_allreduce: 4.54 | step: 42.62
-  4%|▍         | 219/5800 [26:49<10:43:47,  6.92s/it]                                                     {'loss': 0.042, 'grad_norm': 8.882965087890625, 'learning_rate': 3.999368603071423e-05, 'epoch': 1.89}
-  4%|▍         | 219/5800 [26:49<10:43:47,  6.92s/it]score1 tensor([[0.5234],
-        [0.5391],
-        [0.6250],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5391, 0.5391, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:26,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.18 | optimizer_step: 4.36
-[2025-01-25 09:13:26,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.17 | bwd_microstep: 4595.89 | bwd_inner_microstep: 4589.96 | bwd_allreduce_microstep: 5.78 | step_microstep: 61.35
-[2025-01-25 09:13:26,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.15 | bwd: 4595.94 | bwd_inner: 4589.96 | bwd_allreduce: 5.86 | step: 61.34
-  4%|▍         | 220/5800 [26:56<10:43:34,  6.92s/it]                                                     {'loss': 0.0435, 'grad_norm': 4.96452522277832, 'learning_rate': 3.999340230744875e-05, 'epoch': 1.9}
-  4%|▍         | 220/5800 [26:56<10:43:34,  6.92s/it]score1 tensor([[0.4492],
-        [0.6445],
-        [0.5508],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.6836, 0.4473, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:33,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.76 | optimizer_step: 4.36
-[2025-01-25 09:13:33,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.77 | bwd_microstep: 4646.03 | bwd_inner_microstep: 4640.74 | bwd_allreduce_microstep: 5.19 | step_microstep: 56.73
-[2025-01-25 09:13:33,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.70 | bwd: 4646.07 | bwd_inner: 4640.74 | bwd_allreduce: 5.24 | step: 56.74
-  4%|▍         | 221/5800 [27:03<10:45:29,  6.94s/it]                                                     {'loss': 0.0659, 'grad_norm': 8.870025634765625, 'learning_rate': 3.9993112349893516e-05, 'epoch': 1.91}
-  4%|▍         | 221/5800 [27:03<10:45:29,  6.94s/it]score1 tensor([[0.6055],
-        [0.5195],
-        [0.5781],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.3945, 0.4590, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1226, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:40,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.36
-[2025-01-25 09:13:40,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.15 | bwd_microstep: 4646.71 | bwd_inner_microstep: 4642.76 | bwd_allreduce_microstep: 3.88 | step_microstep: 34.95
-[2025-01-25 09:13:40,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.11 | bwd: 4646.73 | bwd_inner: 4642.76 | bwd_allreduce: 3.91 | step: 34.96
-  4%|▍         | 222/5800 [27:10<10:45:06,  6.94s/it]                                                     {'loss': 0.1226, 'grad_norm': 17.566774368286133, 'learning_rate': 3.9992816158138935e-05, 'epoch': 1.91}
-  4%|▍         | 222/5800 [27:10<10:45:06,  6.94s/it]score1 tensor([[0.5000],
-        [0.5312],
-        [0.5664],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4121, 0.7070, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0815, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:47,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.29 | optimizer_step: 4.37
-[2025-01-25 09:13:47,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.52 | bwd_microstep: 4642.41 | bwd_inner_microstep: 4638.54 | bwd_allreduce_microstep: 3.81 | step_microstep: 52.76
-[2025-01-25 09:13:47,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.50 | bwd: 4642.43 | bwd_inner: 4638.54 | bwd_allreduce: 3.84 | step: 52.78
-  4%|▍         | 223/5800 [27:17<10:45:08,  6.94s/it]                                                     {'loss': 0.0815, 'grad_norm': 8.896605491638184, 'learning_rate': 3.999251373227738e-05, 'epoch': 1.92}
-  4%|▍         | 223/5800 [27:17<10:45:08,  6.94s/it]score1 tensor([[0.4863],
-        [0.5586],
-        [0.4668],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5000, 0.4434, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:13:54,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.23 | optimizer_step: 4.36
-[2025-01-25 09:13:54,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.39 | bwd_microstep: 4635.64 | bwd_inner_microstep: 4631.96 | bwd_allreduce_microstep: 3.63 | step_microstep: 30.30
-[2025-01-25 09:13:54,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.36 | bwd: 4635.66 | bwd_inner: 4631.96 | bwd_allreduce: 3.65 | step: 30.30
-  4%|▍         | 224/5800 [27:24<10:43:59,  6.93s/it]                                                     {'loss': 0.062, 'grad_norm': 8.886368751525879, 'learning_rate': 3.9992205072403136e-05, 'epoch': 1.93}
-  4%|▍         | 224/5800 [27:24<10:43:59,  6.93s/it]score1 tensor([[0.5234],
-        [0.5156],
-        [0.5430],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.5430, 0.5391, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:14:01,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 09:14:01,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.47 | bwd_microstep: 4638.20 | bwd_inner_microstep: 4633.99 | bwd_allreduce_microstep: 4.13 | step_microstep: 33.48
-[2025-01-25 09:14:01,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4638.22 | bwd_inner: 4633.99 | bwd_allreduce: 4.16 | step: 33.48
-  4%|▍         | 225/5800 [27:31<10:43:08,  6.92s/it]                                                     {'loss': 0.0537, 'grad_norm': 1.301935076713562, 'learning_rate': 3.9991890178612454e-05, 'epoch': 1.94}
-  4%|▍         | 225/5800 [27:31<10:43:08,  6.92s/it]score1 tensor([[0.4355],
-        [0.4570],
-        [0.4395],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6641, 0.4316, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0962, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:14:08,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.46 | optimizer_step: 4.37
-[2025-01-25 09:14:08,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.04 | bwd_microstep: 4642.70 | bwd_inner_microstep: 4638.92 | bwd_allreduce_microstep: 3.72 | step_microstep: 32.03
-[2025-01-25 09:14:08,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.02 | bwd: 4642.71 | bwd_inner: 4638.92 | bwd_allreduce: 3.74 | step: 32.04
-  4%|▍         | 226/5800 [27:38<10:42:16,  6.91s/it]                                                     {'loss': 0.0962, 'grad_norm': 8.6328125, 'learning_rate': 3.999156905100353e-05, 'epoch': 1.95}
-  4%|▍         | 226/5800 [27:38<10:42:16,  6.91s/it]score1 tensor([[0.3555],
-        [0.4902],
-        [0.4492],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.5469, 0.5117, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:14:15,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.23 | optimizer_step: 4.36
-[2025-01-25 09:14:15,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.76 | bwd_microstep: 4636.73 | bwd_inner_microstep: 4632.84 | bwd_allreduce_microstep: 3.83 | step_microstep: 31.01
-[2025-01-25 09:14:15,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.75 | bwd: 4636.75 | bwd_inner: 4632.84 | bwd_allreduce: 3.85 | step: 31.01
-  4%|▍         | 227/5800 [27:45<10:41:19,  6.90s/it]                                                     {'loss': 0.0659, 'grad_norm': 17.4647274017334, 'learning_rate': 3.999124168967649e-05, 'epoch': 1.96}
-  4%|▍         | 227/5800 [27:45<10:41:19,  6.90s/it]score1 tensor([[0.5391],
-        [0.5234],
-        [0.4551],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4473, 0.3613, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:14:22,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 09:14:22,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.04 | bwd_microstep: 4638.44 | bwd_inner_microstep: 4634.09 | bwd_allreduce_microstep: 4.29 | step_microstep: 41.32
-[2025-01-25 09:14:22,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4638.46 | bwd_inner: 4634.08 | bwd_allreduce: 4.32 | step: 41.33
-  4%|▍         | 228/5800 [27:52<10:41:53,  6.91s/it]                                                     {'loss': 0.0957, 'grad_norm': 8.870973587036133, 'learning_rate': 3.999090809473341e-05, 'epoch': 1.97}
-  4%|▍         | 228/5800 [27:52<10:41:53,  6.91s/it]score1 tensor([[0.5820],
-        [0.4707],
-        [0.5820],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.3457, 0.4922, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0718, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:14:29,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 09:14:29,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.38 | bwd_microstep: 4636.89 | bwd_inner_microstep: 4632.96 | bwd_allreduce_microstep: 3.87 | step_microstep: 35.98
-[2025-01-25 09:14:29,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.35 | bwd: 4636.91 | bwd_inner: 4632.96 | bwd_allreduce: 3.90 | step: 36.00
-  4%|▍         | 229/5800 [27:59<10:41:50,  6.91s/it]                                                     {'loss': 0.0718, 'grad_norm': 10.609457969665527, 'learning_rate': 3.999056826627832e-05, 'epoch': 1.97}
-  4%|▍         | 229/5800 [27:59<10:41:50,  6.91s/it]score1 tensor([[0.5273],
-        [0.3770],
-        [0.4395],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4492, 0.4531, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:14:36,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 09:14:36,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.48 | bwd_microstep: 4636.08 | bwd_inner_microstep: 4631.23 | bwd_allreduce_microstep: 4.76 | step_microstep: 47.20
-[2025-01-25 09:14:36,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.45 | bwd: 4636.10 | bwd_inner: 4631.24 | bwd_allreduce: 4.80 | step: 47.21
-  4%|▍         | 230/5800 [28:06<10:41:53,  6.91s/it]                                                     {'loss': 0.0703, 'grad_norm': 17.148338317871094, 'learning_rate': 3.9990222204417174e-05, 'epoch': 1.98}
-  4%|▍         | 230/5800 [28:06<10:41:53,  6.91s/it]evaluate!
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3926]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6445]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1797, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1758, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1914, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6641]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2188, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2520, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1973, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3809]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6719]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3828]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2129, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3652]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1777, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1895, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3535]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2129, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2148, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1855, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2168, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3594]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.4782608695652174
-SRCC_score: 0.34391341496999106
-PLCC_score: 0.3107022155221436
-KRCC_score: 0.24173032384784016
-SRCC_level: 0.34391341496999106
-PLCC_level: 0.3107022155221436
-KRCC_level: 0.24173032384784016
-New best SRCC_score: 0.34391341496999106. Saving model...
-[INFO|trainer.py:3705] 2025-01-25 09:25:09,118 >> Saving model checkpoint to /DATA/env/wjr/newtrain/stage2/mos3
-[INFO|configuration_utils.py:410] 2025-01-25 09:25:09,127 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/config.json
-[INFO|configuration_utils.py:868] 2025-01-25 09:25:09,128 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/generation_config.json
-[INFO|modeling_utils.py:2844] 2025-01-25 09:25:56,370 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /DATA/env/wjr/newtrain/stage2/mos3/model.safetensors.index.json.
-[INFO|tokenization_utils_base.py:2641] 2025-01-25 09:25:56,373 >> tokenizer config file saved in /DATA/env/wjr/newtrain/stage2/mos3/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2650] 2025-01-25 09:25:56,373 >> Special tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/special_tokens_map.json
-[INFO|tokenization_utils_base.py:2701] 2025-01-25 09:25:56,374 >> added tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/added_tokens.json
-01/25/2025 09:26:08 - INFO - __main__ - Saved LoRA weights to /DATA/env/wjr/newtrain/stage2/mos3/lora_weights.pth
-score1 tensor([[0.4473],
-        [0.4668],
-        [0.4648],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3984, 0.5430, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0791, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:26:15,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 09:26:15,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2126.68 | bwd_microstep: 4564.60 | bwd_inner_microstep: 4558.04 | bwd_allreduce_microstep: 6.42 | step_microstep: 54.84
-[2025-01-25 09:26:15,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2126.65 | bwd: 4564.65 | bwd_inner: 4558.04 | bwd_allreduce: 6.49 | step: 54.83
-  4%|▍         | 231/5800 [39:45<332:05:40, 214.68s/it]                                                       {'loss': 0.0791, 'grad_norm': 1.7782686948776245, 'learning_rate': 3.998986990925788e-05, 'epoch': 1.99}
-  4%|▍         | 231/5800 [39:45<332:05:40, 214.68s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:26:21,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 09:26:21,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 565.54 | bwd_microstep: 1213.01 | bwd_inner_microstep: 1208.50 | bwd_allreduce_microstep: 4.44 | step_microstep: 41.68
-[2025-01-25 09:26:21,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 565.51 | bwd: 1213.03 | bwd_inner: 1208.50 | bwd_allreduce: 4.48 | step: 41.70
-  4%|▍         | 232/5800 [39:51<235:11:38, 152.07s/it]                                                       {'loss': 0.0625, 'grad_norm': 18.045799255371094, 'learning_rate': 3.998951138091031e-05, 'epoch': 2.0}
-  4%|▍         | 232/5800 [39:51<235:11:38, 152.07s/it][2025-01-25 09:26:26,212] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:26:37,908] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:26:48,435] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:26:58,655] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4258],
-        [0.5469],
-        [0.4238],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5625, 0.4785, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:27:16,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.55 | optimizer_step: 4.37
-[2025-01-25 09:27:16,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2129.76 | bwd_microstep: 4582.01 | bwd_inner_microstep: 4574.72 | bwd_allreduce_microstep: 7.17 | step_microstep: 55.33
-[2025-01-25 09:27:16,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2129.72 | bwd: 4582.04 | bwd_inner: 4574.72 | bwd_allreduce: 7.23 | step: 55.23
-  4%|▍         | 233/5800 [40:46<189:53:34, 122.80s/it]                                                       {'loss': 0.0601, 'grad_norm': 17.611806869506836, 'learning_rate': 3.9989146619486225e-05, 'epoch': 2.01}
-  4%|▍         | 233/5800 [40:46<189:53:34, 122.80s/it]score1 tensor([[0.4941],
-        [0.5195],
-        [0.4434],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5273, 0.5586, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:27:22,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.60 | optimizer_step: 4.36
-[2025-01-25 09:27:22,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.90 | bwd_microstep: 4594.35 | bwd_inner_microstep: 4587.17 | bwd_allreduce_microstep: 6.99 | step_microstep: 64.08
-[2025-01-25 09:27:22,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.87 | bwd: 4594.41 | bwd_inner: 4587.17 | bwd_allreduce: 7.09 | step: 64.05
-  4%|▍         | 234/5800 [40:53<136:07:49, 88.05s/it]                                                       {'loss': 0.0498, 'grad_norm': 1.5247385501861572, 'learning_rate': 3.998877562509939e-05, 'epoch': 2.02}
-  4%|▍         | 234/5800 [40:53<136:07:49, 88.05s/it]score1 tensor([[0.4531],
-        [0.3535],
-        [0.4766],
-        [0.3047]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.3984, 0.6055, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0952, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:27:29,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.10 | optimizer_step: 4.37
-[2025-01-25 09:27:29,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.00 | bwd_microstep: 4578.07 | bwd_inner_microstep: 4570.92 | bwd_allreduce_microstep: 6.97 | step_microstep: 51.82
-[2025-01-25 09:27:29,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.86 | bwd: 4578.10 | bwd_inner: 4570.92 | bwd_allreduce: 7.10 | step: 51.83
-  4%|▍         | 235/5800 [40:59<98:28:31, 63.70s/it]                                                      {'loss': 0.0952, 'grad_norm': 8.545784950256348, 'learning_rate': 3.9988398397865476e-05, 'epoch': 2.03}
-  4%|▍         | 235/5800 [40:59<98:28:31, 63.70s/it]score1 tensor([[0.5000],
-        [0.4336],
-        [0.4785],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4180, 0.5508, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:27:36,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.34 | optimizer_step: 4.36
-[2025-01-25 09:27:36,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.31 | bwd_microstep: 4584.44 | bwd_inner_microstep: 4578.40 | bwd_allreduce_microstep: 5.94 | step_microstep: 45.19
-[2025-01-25 09:27:36,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.26 | bwd: 4584.47 | bwd_inner: 4578.40 | bwd_allreduce: 5.99 | step: 45.19
-  4%|▍         | 236/5800 [41:06<72:06:24, 46.65s/it]                                                     {'loss': 0.0293, 'grad_norm': 2.3790156841278076, 'learning_rate': 3.998801493790211e-05, 'epoch': 2.03}
-  4%|▍         | 236/5800 [41:06<72:06:24, 46.65s/it]score1 tensor([[0.5703],
-        [0.6367],
-        [0.4668],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6523, 0.4004, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:27:43,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.52 | optimizer_step: 4.36
-[2025-01-25 09:27:43,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.64 | bwd_microstep: 4591.94 | bwd_inner_microstep: 4585.61 | bwd_allreduce_microstep: 6.20 | step_microstep: 58.83
-[2025-01-25 09:27:43,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.60 | bwd: 4591.97 | bwd_inner: 4585.61 | bwd_allreduce: 6.26 | step: 58.84
-  4%|▍         | 237/5800 [41:13<53:39:48, 34.73s/it]                                                     {'loss': 0.0479, 'grad_norm': 9.170100212097168, 'learning_rate': 3.9987625245328875e-05, 'epoch': 2.04}
-  4%|▍         | 237/5800 [41:13<53:39:48, 34.73s/it]score1 tensor([[0.7227],
-        [0.6641],
-        [0.6836],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4648, 0.6484, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1260, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:27:50,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.57 | optimizer_step: 4.36
-[2025-01-25 09:27:50,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.30 | bwd_microstep: 4609.36 | bwd_inner_microstep: 4603.19 | bwd_allreduce_microstep: 6.09 | step_microstep: 48.23
-[2025-01-25 09:27:50,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.27 | bwd: 4609.38 | bwd_inner: 4603.19 | bwd_allreduce: 6.13 | step: 48.24
-  4%|▍         | 238/5800 [41:20<40:45:54, 26.39s/it]                                                     {'loss': 0.126, 'grad_norm': 18.005802154541016, 'learning_rate': 3.9987229320267265e-05, 'epoch': 2.05}
-  4%|▍         | 238/5800 [41:20<40:45:54, 26.39s/it]score1 tensor([[0.6719],
-        [0.6250],
-        [0.6719],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4668, 0.4062, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:27:57,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.37
-[2025-01-25 09:27:57,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.03 | bwd_microstep: 4606.21 | bwd_inner_microstep: 4600.86 | bwd_allreduce_microstep: 5.25 | step_microstep: 45.49
-[2025-01-25 09:27:57,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.99 | bwd: 4606.24 | bwd_inner: 4600.86 | bwd_allreduce: 5.30 | step: 45.50
-  4%|▍         | 239/5800 [41:27<31:43:59, 20.54s/it]                                                     {'loss': 0.1387, 'grad_norm': 17.99540901184082, 'learning_rate': 3.9986827162840744e-05, 'epoch': 2.06}
-  4%|▍         | 239/5800 [41:27<31:43:59, 20.54s/it]score1 tensor([[0.5391],
-        [0.5508],
-        [0.6484],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4004, 0.3809, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1582, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:04,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.38 | optimizer_step: 4.36
-[2025-01-25 09:28:04,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.29 | bwd_microstep: 4616.12 | bwd_inner_microstep: 4611.01 | bwd_allreduce_microstep: 5.03 | step_microstep: 46.15
-[2025-01-25 09:28:04,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.25 | bwd: 4616.14 | bwd_inner: 4611.01 | bwd_allreduce: 5.07 | step: 46.16
-  4%|▍         | 240/5800 [41:34<25:24:53, 16.46s/it]                                                     {'loss': 0.1582, 'grad_norm': 17.859657287597656, 'learning_rate': 3.998641877317471e-05, 'epoch': 2.07}
-  4%|▍         | 240/5800 [41:34<25:24:53, 16.46s/it]score1 tensor([[0.5781],
-        [0.4707],
-        [0.4453],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5000, 0.4180, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:11,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 09:28:11,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.31 | bwd_microstep: 4598.92 | bwd_inner_microstep: 4593.77 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.47
-[2025-01-25 09:28:11,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.26 | bwd: 4598.95 | bwd_inner: 4593.77 | bwd_allreduce: 5.11 | step: 43.48
-  4%|▍         | 241/5800 [41:41<20:58:28, 13.58s/it]                                                     {'loss': 0.0674, 'grad_norm': 2.1891660690307617, 'learning_rate': 3.998600415139652e-05, 'epoch': 2.08}
-  4%|▍         | 241/5800 [41:41<20:58:28, 13.58s/it]score1 tensor([[0.4375],
-        [0.3730],
-        [0.4531],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4336, 0.4961, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:18,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 09:28:18,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.93 | bwd_microstep: 4608.91 | bwd_inner_microstep: 4603.51 | bwd_allreduce_microstep: 5.28 | step_microstep: 42.19
-[2025-01-25 09:28:18,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.90 | bwd: 4608.94 | bwd_inner: 4603.51 | bwd_allreduce: 5.34 | step: 42.20
-  4%|▍         | 242/5800 [41:48<17:52:31, 11.58s/it]                                                     {'loss': 0.0547, 'grad_norm': 17.112550735473633, 'learning_rate': 3.998558329763544e-05, 'epoch': 2.09}
-  4%|▍         | 242/5800 [41:48<17:52:31, 11.58s/it]score1 tensor([[0.4668],
-        [0.5195],
-        [0.4414],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5078, 0.5195, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:25,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 09:28:25,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.39 | bwd_microstep: 4613.55 | bwd_inner_microstep: 4608.75 | bwd_allreduce_microstep: 4.67 | step_microstep: 41.41
-[2025-01-25 09:28:25,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.35 | bwd: 4613.58 | bwd_inner: 4608.75 | bwd_allreduce: 4.73 | step: 41.42
-  4%|▍         | 243/5800 [41:55<15:42:19, 10.17s/it]                                                     {'loss': 0.0513, 'grad_norm': 8.820696830749512, 'learning_rate': 3.9985156212022705e-05, 'epoch': 2.09}
-  4%|▍         | 243/5800 [41:55<15:42:19, 10.17s/it]score1 tensor([[0.6602],
-        [0.5469],
-        [0.3496],
-        [0.2266]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4824, 0.4434, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:32,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 09:28:32,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4609.36 | bwd_inner_microstep: 4604.90 | bwd_allreduce_microstep: 4.40 | step_microstep: 40.40
-[2025-01-25 09:28:32,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.56 | bwd: 4609.38 | bwd_inner: 4604.90 | bwd_allreduce: 4.43 | step: 40.41
-  4%|▍         | 244/5800 [42:02<14:10:57,  9.19s/it]                                                     {'loss': 0.0879, 'grad_norm': 2.9072623252868652, 'learning_rate': 3.99847228946915e-05, 'epoch': 2.1}
-  4%|▍         | 244/5800 [42:02<14:10:57,  9.19s/it]score1 tensor([[0.4941],
-        [0.4023],
-        [0.4902],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.3555, 0.4395, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:38,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 09:28:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.02 | bwd_microstep: 4620.27 | bwd_inner_microstep: 4615.26 | bwd_allreduce_microstep: 4.92 | step_microstep: 41.04
-[2025-01-25 09:28:38,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.00 | bwd: 4620.30 | bwd_inner: 4615.26 | bwd_allreduce: 4.97 | step: 41.05
-  4%|▍         | 245/5800 [42:08<13:07:18,  8.50s/it]                                                     {'loss': 0.0435, 'grad_norm': 17.483642578125, 'learning_rate': 3.9984283345776924e-05, 'epoch': 2.11}
-  4%|▍         | 245/5800 [42:08<13:07:18,  8.50s/it]score1 tensor([[0.5508],
-        [0.5508],
-        [0.4629],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5508, 0.5664, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:45,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 09:28:45,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4569.94 | bwd_inner_microstep: 4564.79 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.54
-[2025-01-25 09:28:45,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.35 | bwd: 4569.96 | bwd_inner: 4564.79 | bwd_allreduce: 5.10 | step: 44.55
-  4%|▍         | 246/5800 [42:15<12:21:17,  8.01s/it]                                                     {'loss': 0.0552, 'grad_norm': 4.329246520996094, 'learning_rate': 3.9983837565416044e-05, 'epoch': 2.12}
-  4%|▍         | 246/5800 [42:15<12:21:17,  8.01s/it]score1 tensor([[0.6289],
-        [0.5078],
-        [0.5234],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4531, 0.3613, 0.3086], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0767, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:52,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.16 | optimizer_step: 4.36
-[2025-01-25 09:28:52,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.67 | bwd_microstep: 4639.73 | bwd_inner_microstep: 4634.00 | bwd_allreduce_microstep: 5.61 | step_microstep: 56.05
-[2025-01-25 09:28:52,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.64 | bwd: 4639.76 | bwd_inner: 4634.00 | bwd_allreduce: 5.68 | step: 56.06
-  4%|▍         | 247/5800 [42:22<11:52:28,  7.70s/it]                                                     {'loss': 0.0767, 'grad_norm': 17.18313217163086, 'learning_rate': 3.998338555374786e-05, 'epoch': 2.13}
-  4%|▍         | 247/5800 [42:22<11:52:28,  7.70s/it]score1 tensor([[0.3711],
-        [0.4609],
-        [0.4844],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.1787, 0.3730, 0.5625, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:28:59,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 09:28:59,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.54 | bwd_microstep: 4648.81 | bwd_inner_microstep: 4642.34 | bwd_allreduce_microstep: 6.30 | step_microstep: 49.53
-[2025-01-25 09:28:59,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.51 | bwd: 4648.86 | bwd_inner: 4642.34 | bwd_allreduce: 6.38 | step: 49.54
-  4%|▍         | 248/5800 [42:29<11:32:14,  7.48s/it]                                                     {'loss': 0.1191, 'grad_norm': 8.862905502319336, 'learning_rate': 3.998292731091332e-05, 'epoch': 2.14}
-  4%|▍         | 248/5800 [42:29<11:32:14,  7.48s/it]score1 tensor([[0.4375],
-        [0.4844],
-        [0.5312],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.6250, 0.5820, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:06,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.24 | optimizer_step: 4.36
-[2025-01-25 09:29:06,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.72 | bwd_microstep: 4648.24 | bwd_inner_microstep: 4642.97 | bwd_allreduce_microstep: 5.19 | step_microstep: 43.33
-[2025-01-25 09:29:06,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.69 | bwd: 4648.26 | bwd_inner: 4642.97 | bwd_allreduce: 5.23 | step: 43.33
-  4%|▍         | 249/5800 [42:36<11:17:32,  7.32s/it]                                                     {'loss': 0.0562, 'grad_norm': 8.741289138793945, 'learning_rate': 3.9982462837055314e-05, 'epoch': 2.15}
-  4%|▍         | 249/5800 [42:36<11:17:32,  7.32s/it]score1 tensor([[0.4785],
-        [0.4824],
-        [0.4785],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.3789, 0.6172, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0786, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:13,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:29:13,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.35 | bwd_microstep: 4644.40 | bwd_inner_microstep: 4638.94 | bwd_allreduce_microstep: 5.38 | step_microstep: 42.81
-[2025-01-25 09:29:13,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.33 | bwd: 4644.44 | bwd_inner: 4638.94 | bwd_allreduce: 5.42 | step: 42.81
-  4%|▍         | 250/5800 [42:43<11:06:48,  7.21s/it]                                                     {'loss': 0.0786, 'grad_norm': 8.605073928833008, 'learning_rate': 3.9981992132318665e-05, 'epoch': 2.16}
-  4%|▍         | 250/5800 [42:43<11:06:48,  7.21s/it]score1 tensor([[0.4609],
-        [0.5508],
-        [0.5938],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.4668, 0.6445, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0713, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:20,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.24 | optimizer_step: 4.37
-[2025-01-25 09:29:20,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.79 | bwd_microstep: 4648.59 | bwd_inner_microstep: 4643.78 | bwd_allreduce_microstep: 4.73 | step_microstep: 45.31
-[2025-01-25 09:29:20,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.77 | bwd: 4648.61 | bwd_inner: 4643.78 | bwd_allreduce: 4.76 | step: 45.32
-  4%|▍         | 251/5800 [42:50<10:59:24,  7.13s/it]                                                     {'loss': 0.0713, 'grad_norm': 1.8679332733154297, 'learning_rate': 3.9981515196850156e-05, 'epoch': 2.16}
-  4%|▍         | 251/5800 [42:50<10:59:24,  7.13s/it]score1 tensor([[0.5703],
-        [0.5234],
-        [0.6055],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4844, 0.6055, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:27,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 09:29:27,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.77 | bwd_microstep: 4589.37 | bwd_inner_microstep: 4584.77 | bwd_allreduce_microstep: 4.51 | step_microstep: 42.25
-[2025-01-25 09:29:27,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.74 | bwd: 4589.40 | bwd_inner: 4584.77 | bwd_allreduce: 4.56 | step: 42.25
-  4%|▍         | 252/5800 [42:57<10:53:03,  7.06s/it]                                                     {'loss': 0.0698, 'grad_norm': 12.765166282653809, 'learning_rate': 3.9981032030798494e-05, 'epoch': 2.17}
-  4%|▍         | 252/5800 [42:57<10:53:03,  7.06s/it]score1 tensor([[0.6406],
-        [0.5391],
-        [0.5820],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.3750, 0.4590, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:34,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.05 | optimizer_step: 4.61
-[2025-01-25 09:29:34,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.27 | bwd_microstep: 4642.05 | bwd_inner_microstep: 4637.20 | bwd_allreduce_microstep: 4.76 | step_microstep: 56.77
-[2025-01-25 09:29:34,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.24 | bwd: 4642.07 | bwd_inner: 4637.20 | bwd_allreduce: 4.81 | step: 56.78
-  4%|▍         | 253/5800 [43:04<10:50:07,  7.03s/it]                                                     {'loss': 0.0879, 'grad_norm': 8.355828285217285, 'learning_rate': 3.998054263431435e-05, 'epoch': 2.18}
-  4%|▍         | 253/5800 [43:04<10:50:07,  7.03s/it]score1 tensor([[0.3164],
-        [0.5312],
-        [0.4316],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5469, 0.4238, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:41,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 09:29:41,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2177.76 | bwd_microstep: 4650.67 | bwd_inner_microstep: 4645.66 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.39
-[2025-01-25 09:29:41,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2177.70 | bwd: 4650.70 | bwd_inner: 4645.66 | bwd_allreduce: 4.97 | step: 42.39
-  4%|▍         | 254/5800 [43:11<10:48:22,  7.01s/it]                                                     {'loss': 0.0259, 'grad_norm': 1.3934792280197144, 'learning_rate': 3.998004700755031e-05, 'epoch': 2.19}
-  4%|▍         | 254/5800 [43:11<10:48:22,  7.01s/it]score1 tensor([[0.5000],
-        [0.3770],
-        [0.3203],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4141, 0.4961, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0942, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:48,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 09:29:48,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.49 | bwd_microstep: 4641.24 | bwd_inner_microstep: 4636.77 | bwd_allreduce_microstep: 4.40 | step_microstep: 42.99
-[2025-01-25 09:29:48,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.46 | bwd: 4641.26 | bwd_inner: 4636.77 | bwd_allreduce: 4.43 | step: 43.00
-  4%|▍         | 255/5800 [43:18<10:46:17,  6.99s/it]                                                     {'loss': 0.0942, 'grad_norm': 16.485271453857422, 'learning_rate': 3.997954515066094e-05, 'epoch': 2.2}
-  4%|▍         | 255/5800 [43:18<10:46:17,  6.99s/it]score1 tensor([[0.3633],
-        [0.5312],
-        [0.5078],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6797, 0.5508, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1216, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:29:55,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 09:29:55,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.70 | bwd_microstep: 4643.17 | bwd_inner_microstep: 4637.92 | bwd_allreduce_microstep: 5.16 | step_microstep: 42.96
-[2025-01-25 09:29:55,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.66 | bwd: 4643.22 | bwd_inner: 4637.92 | bwd_allreduce: 5.21 | step: 42.95
-  4%|▍         | 256/5800 [43:25<10:44:36,  6.98s/it]                                                     {'loss': 0.1216, 'grad_norm': 16.32749366760254, 'learning_rate': 3.997903706380271e-05, 'epoch': 2.21}
-  4%|▍         | 256/5800 [43:25<10:44:36,  6.98s/it]score1 tensor([[0.5352],
-        [0.2852],
-        [0.3750],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3652, 0.5586, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0835, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:02,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.19 | optimizer_step: 4.37
-[2025-01-25 09:30:02,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.62 | bwd_microstep: 4643.31 | bwd_inner_microstep: 4638.07 | bwd_allreduce_microstep: 5.16 | step_microstep: 43.70
-[2025-01-25 09:30:02,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.58 | bwd: 4643.33 | bwd_inner: 4638.07 | bwd_allreduce: 5.20 | step: 43.71
-  4%|▍         | 257/5800 [43:32<10:43:29,  6.97s/it]                                                     {'loss': 0.0835, 'grad_norm': 7.9637370109558105, 'learning_rate': 3.997852274713406e-05, 'epoch': 2.22}
-  4%|▍         | 257/5800 [43:32<10:43:29,  6.97s/it]score1 tensor([[0.4512],
-        [0.5547],
-        [0.4980],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6484, 0.5781, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:09,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 09:30:09,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4641.43 | bwd_inner_microstep: 4636.86 | bwd_allreduce_microstep: 4.48 | step_microstep: 41.57
-[2025-01-25 09:30:09,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.81 | bwd: 4641.46 | bwd_inner: 4636.86 | bwd_allreduce: 4.53 | step: 41.58
-  4%|▍         | 258/5800 [43:39<10:42:15,  6.95s/it]                                                     {'loss': 0.0879, 'grad_norm': 8.161086082458496, 'learning_rate': 3.997800220081535e-05, 'epoch': 2.22}
-  4%|▍         | 258/5800 [43:39<10:42:15,  6.95s/it]score1 tensor([[0.8125],
-        [0.5352],
-        [0.6562],
-        [0.7344]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3457, 0.4941, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2344, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:16,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 09:30:16,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4646.46 | bwd_inner_microstep: 4641.43 | bwd_allreduce_microstep: 4.95 | step_microstep: 46.89
-[2025-01-25 09:30:16,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.19 | bwd: 4646.48 | bwd_inner: 4641.43 | bwd_allreduce: 4.99 | step: 46.90
-  4%|▍         | 259/5800 [43:46<10:41:57,  6.95s/it]                                                     {'loss': 0.2344, 'grad_norm': 16.946165084838867, 'learning_rate': 3.997747542500892e-05, 'epoch': 2.23}
-  4%|▍         | 259/5800 [43:46<10:41:57,  6.95s/it]score1 tensor([[0.6719],
-        [0.8164],
-        [0.7773],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5781, 0.4863, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2188, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:23,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.37
-[2025-01-25 09:30:23,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.38 | bwd_microstep: 4651.24 | bwd_inner_microstep: 4646.21 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.83
-[2025-01-25 09:30:23,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.36 | bwd: 4651.28 | bwd_inner: 4646.21 | bwd_allreduce: 4.96 | step: 42.84
-  4%|▍         | 260/5800 [43:53<10:41:42,  6.95s/it]                                                     {'loss': 0.2188, 'grad_norm': 16.984386444091797, 'learning_rate': 3.997694241987901e-05, 'epoch': 2.24}
-  4%|▍         | 260/5800 [43:53<10:41:42,  6.95s/it]score1 tensor([[0.7109],
-        [0.8203],
-        [0.8516],
-        [0.7344]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4062, 0.3262, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3496, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:30,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.37 | optimizer_step: 4.37
-[2025-01-25 09:30:30,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.68 | bwd_microstep: 4653.21 | bwd_inner_microstep: 4646.97 | bwd_allreduce_microstep: 6.05 | step_microstep: 44.01
-[2025-01-25 09:30:30,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.64 | bwd: 4653.27 | bwd_inner: 4646.97 | bwd_allreduce: 6.15 | step: 43.99
-  4%|▍         | 261/5800 [43:59<10:41:58,  6.95s/it]                                                     {'loss': 0.3496, 'grad_norm': 15.97628116607666, 'learning_rate': 3.997640318559182e-05, 'epoch': 2.25}
-  4%|▍         | 261/5800 [44:00<10:41:58,  6.95s/it]score1 tensor([[0.7227],
-        [0.6797],
-        [0.7578],
-        [0.7344]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4707, 0.3906, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2578, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:36,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.92 | optimizer_step: 4.37
-[2025-01-25 09:30:36,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.08 | bwd_microstep: 4647.08 | bwd_inner_microstep: 4640.08 | bwd_allreduce_microstep: 6.83 | step_microstep: 79.30
-[2025-01-25 09:30:37,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.06 | bwd: 4647.12 | bwd_inner: 4640.08 | bwd_allreduce: 6.92 | step: 79.32
-  5%|▍         | 262/5800 [44:07<10:43:20,  6.97s/it]                                                     {'loss': 0.2578, 'grad_norm': 15.5725736618042, 'learning_rate': 3.997585772231549e-05, 'epoch': 2.26}
-  5%|▍         | 262/5800 [44:07<10:43:20,  6.97s/it]score1 tensor([[0.7734],
-        [0.6680],
-        [0.7227],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4375, 0.5039, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2471, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:43,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 09:30:43,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.22 | bwd_microstep: 4647.59 | bwd_inner_microstep: 4642.53 | bwd_allreduce_microstep: 4.96 | step_microstep: 40.92
-[2025-01-25 09:30:43,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.14 | bwd: 4647.62 | bwd_inner: 4642.53 | bwd_allreduce: 5.01 | step: 40.92
-  5%|▍         | 263/5800 [44:13<10:42:41,  6.96s/it]                                                     {'loss': 0.2471, 'grad_norm': 15.280916213989258, 'learning_rate': 3.997530603022012e-05, 'epoch': 2.27}
-  5%|▍         | 263/5800 [44:13<10:42:41,  6.96s/it]score1 tensor([[0.5625],
-        [0.5938],
-        [0.5078],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4785, 0.5117, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:50,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.58 | optimizer_step: 4.37
-[2025-01-25 09:30:50,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.77 | bwd_microstep: 4651.60 | bwd_inner_microstep: 4645.97 | bwd_allreduce_microstep: 5.52 | step_microstep: 50.59
-[2025-01-25 09:30:50,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.74 | bwd: 4651.63 | bwd_inner: 4645.97 | bwd_allreduce: 5.57 | step: 50.60
-  5%|▍         | 264/5800 [44:20<10:42:34,  6.96s/it]                                                     {'loss': 0.0596, 'grad_norm': 2.0986363887786865, 'learning_rate': 3.997474810947773e-05, 'epoch': 2.28}
-  5%|▍         | 264/5800 [44:20<10:42:34,  6.96s/it]score1 tensor([[0.5039],
-        [0.5352],
-        [0.3984],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6328, 0.4570, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0825, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:30:57,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.21 | optimizer_step: 4.37
-[2025-01-25 09:30:57,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.33 | bwd_microstep: 4644.52 | bwd_inner_microstep: 4638.60 | bwd_allreduce_microstep: 5.84 | step_microstep: 45.16
-[2025-01-25 09:30:57,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.29 | bwd: 4644.55 | bwd_inner: 4638.60 | bwd_allreduce: 5.88 | step: 45.16
-  5%|▍         | 265/5800 [44:27<10:42:16,  6.96s/it]                                                     {'loss': 0.0825, 'grad_norm': 6.968156814575195, 'learning_rate': 3.997418396026228e-05, 'epoch': 2.28}
-  5%|▍         | 265/5800 [44:27<10:42:16,  6.96s/it]score1 tensor([[0.5195],
-        [0.4102],
-        [0.5117],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3750, 0.5391, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:04,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 09:31:04,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.58 | bwd_microstep: 4644.01 | bwd_inner_microstep: 4639.03 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.40
-[2025-01-25 09:31:04,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.55 | bwd: 4644.03 | bwd_inner: 4639.03 | bwd_allreduce: 4.94 | step: 41.40
-  5%|▍         | 266/5800 [44:34<10:41:43,  6.96s/it]                                                     {'loss': 0.0586, 'grad_norm': 1.7795144319534302, 'learning_rate': 3.997361358274969e-05, 'epoch': 2.29}
-  5%|▍         | 266/5800 [44:34<10:41:43,  6.96s/it]score1 tensor([[0.3926],
-        [0.4180],
-        [0.5117],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5664, 0.6094, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:11,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.60 | optimizer_step: 4.36
-[2025-01-25 09:31:11,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2175.11 | bwd_microstep: 4646.27 | bwd_inner_microstep: 4639.61 | bwd_allreduce_microstep: 6.53 | step_microstep: 51.59
-[2025-01-25 09:31:11,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2175.08 | bwd: 4646.31 | bwd_inner: 4639.61 | bwd_allreduce: 6.60 | step: 51.56
-  5%|▍         | 267/5800 [44:41<10:41:47,  6.96s/it]                                                     {'loss': 0.1484, 'grad_norm': 13.917196273803711, 'learning_rate': 3.997303697711782e-05, 'epoch': 2.3}
-  5%|▍         | 267/5800 [44:41<10:41:47,  6.96s/it]score1 tensor([[0.3555],
-        [0.5156],
-        [0.3613],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5820, 0.4160, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:18,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 09:31:18,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.47 | bwd_microstep: 4649.60 | bwd_inner_microstep: 4645.13 | bwd_allreduce_microstep: 4.38 | step_microstep: 40.50
-[2025-01-25 09:31:18,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.42 | bwd: 4649.62 | bwd_inner: 4645.13 | bwd_allreduce: 4.42 | step: 40.51
-  5%|▍         | 268/5800 [44:48<10:41:22,  6.96s/it]                                                     {'loss': 0.0903, 'grad_norm': 13.82066535949707, 'learning_rate': 3.997245414354645e-05, 'epoch': 2.31}
-  5%|▍         | 268/5800 [44:48<10:41:22,  6.96s/it]score1 tensor([[0.5312],
-        [0.5273],
-        [0.4746],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5312, 0.5195, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:25,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:31:25,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.40 | bwd_microstep: 4651.21 | bwd_inner_microstep: 4646.77 | bwd_allreduce_microstep: 4.36 | step_microstep: 40.22
-[2025-01-25 09:31:25,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.37 | bwd: 4651.24 | bwd_inner: 4646.77 | bwd_allreduce: 4.40 | step: 40.23
-  5%|▍         | 269/5800 [44:55<10:40:56,  6.95s/it]                                                     {'loss': 0.0347, 'grad_norm': 7.41732120513916, 'learning_rate': 3.997186508221733e-05, 'epoch': 2.32}
-  5%|▍         | 269/5800 [44:55<10:40:56,  6.95s/it]score1 tensor([[0.6562],
-        [0.5469],
-        [0.5820],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4648, 0.5000, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0913, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:32,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 09:31:32,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.21 | bwd_microstep: 4642.44 | bwd_inner_microstep: 4637.60 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.84
-[2025-01-25 09:31:32,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.14 | bwd: 4642.46 | bwd_inner: 4637.60 | bwd_allreduce: 4.79 | step: 41.84
-  5%|▍         | 270/5800 [45:02<10:40:25,  6.95s/it]                                                     {'loss': 0.0913, 'grad_norm': 14.327885627746582, 'learning_rate': 3.997126979331413e-05, 'epoch': 2.33}
-  5%|▍         | 270/5800 [45:02<10:40:25,  6.95s/it]score1 tensor([[0.7188],
-        [0.8672],
-        [0.6094],
-        [0.8359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5312, 0.5156, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2119, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:39,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.09 | optimizer_step: 4.37
-[2025-01-25 09:31:39,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4647.09 | bwd_inner_microstep: 4639.99 | bwd_allreduce_microstep: 6.92 | step_microstep: 76.08
-[2025-01-25 09:31:39,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.03 | bwd: 4647.13 | bwd_inner: 4639.99 | bwd_allreduce: 7.02 | step: 76.08
-  5%|▍         | 271/5800 [45:09<10:41:31,  6.96s/it]                                                     {'loss': 0.2119, 'grad_norm': 16.567913055419922, 'learning_rate': 3.997066827702248e-05, 'epoch': 2.34}
-  5%|▍         | 271/5800 [45:09<10:41:31,  6.96s/it]score1 tensor([[0.6484],
-        [0.6094],
-        [0.6523],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5312, 0.5352, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:46,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 09:31:46,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.50 | bwd_microstep: 4652.29 | bwd_inner_microstep: 4647.34 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.52
-[2025-01-25 09:31:46,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.42 | bwd: 4652.32 | bwd_inner: 4647.34 | bwd_allreduce: 4.90 | step: 42.53
-  5%|▍         | 272/5800 [45:16<10:41:19,  6.96s/it]                                                     {'loss': 0.1035, 'grad_norm': 14.903491020202637, 'learning_rate': 3.997006053352994e-05, 'epoch': 2.34}
-  5%|▍         | 272/5800 [45:16<10:41:19,  6.96s/it]score1 tensor([[0.4316],
-        [0.6133],
-        [0.4277],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4180, 0.4727, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1006, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:31:53,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:31:53,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4648.61 | bwd_inner_microstep: 4644.31 | bwd_allreduce_microstep: 4.22 | step_microstep: 40.60
-[2025-01-25 09:31:53,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4648.64 | bwd_inner: 4644.31 | bwd_allreduce: 4.26 | step: 40.61
-  5%|▍         | 273/5800 [45:23<10:40:28,  6.95s/it]                                                     {'loss': 0.1006, 'grad_norm': 1.7395572662353516, 'learning_rate': 3.9969446563026015e-05, 'epoch': 2.35}
-  5%|▍         | 273/5800 [45:23<10:40:28,  6.95s/it]score1 tensor([[0.6211],
-        [0.5469],
-        [0.6328],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4902, 0.5977, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:00,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:32:00,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.84 | bwd_microstep: 4645.99 | bwd_inner_microstep: 4641.52 | bwd_allreduce_microstep: 4.39 | step_microstep: 41.14
-[2025-01-25 09:32:00,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.82 | bwd: 4646.01 | bwd_inner: 4641.52 | bwd_allreduce: 4.42 | step: 41.15
-  5%|▍         | 274/5800 [45:30<10:39:52,  6.95s/it]                                                     {'loss': 0.0562, 'grad_norm': 14.58797550201416, 'learning_rate': 3.996882636570215e-05, 'epoch': 2.36}
-  5%|▍         | 274/5800 [45:30<10:39:52,  6.95s/it]score1 tensor([[0.5117],
-        [0.5234],
-        [0.3945],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.5000, 0.4121, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:07,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:32:07,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.26 | bwd_microstep: 4644.33 | bwd_inner_microstep: 4639.71 | bwd_allreduce_microstep: 4.55 | step_microstep: 40.84
-[2025-01-25 09:32:07,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.19 | bwd: 4644.36 | bwd_inner: 4639.71 | bwd_allreduce: 4.59 | step: 40.85
-  5%|▍         | 275/5800 [45:37<10:39:07,  6.94s/it]                                                     {'loss': 0.0347, 'grad_norm': 7.035111904144287, 'learning_rate': 3.996819994175174e-05, 'epoch': 2.37}
-  5%|▍         | 275/5800 [45:37<10:39:07,  6.94s/it]score1 tensor([[0.3750],
-        [0.3516],
-        [0.4531],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4980, 0.5273, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:14,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 09:32:14,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.52 | bwd_microstep: 4650.44 | bwd_inner_microstep: 4646.01 | bwd_allreduce_microstep: 4.35 | step_microstep: 39.70
-[2025-01-25 09:32:14,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.48 | bwd: 4650.46 | bwd_inner: 4646.02 | bwd_allreduce: 4.38 | step: 39.71
-  5%|▍         | 276/5800 [45:44<10:38:57,  6.94s/it]                                                     {'loss': 0.1172, 'grad_norm': 6.963194847106934, 'learning_rate': 3.996756729137011e-05, 'epoch': 2.38}
-  5%|▍         | 276/5800 [45:44<10:38:57,  6.94s/it]score1 tensor([[0.4590],
-        [0.4668],
-        [0.4727],
-        [0.3184]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4590, 0.3984, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:21,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 09:32:21,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.36 | bwd_microstep: 4650.92 | bwd_inner_microstep: 4646.34 | bwd_allreduce_microstep: 4.50 | step_microstep: 44.44
-[2025-01-25 09:32:21,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.33 | bwd: 4650.94 | bwd_inner: 4646.34 | bwd_allreduce: 4.53 | step: 44.45
-  5%|▍         | 277/5800 [45:51<10:38:30,  6.94s/it]                                                     {'loss': 0.0659, 'grad_norm': 1.2012250423431396, 'learning_rate': 3.996692841475453e-05, 'epoch': 2.39}
-  5%|▍         | 277/5800 [45:51<10:38:30,  6.94s/it]score1 tensor([[0.4121],
-        [0.4961],
-        [0.4121],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4941, 0.4492, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:28,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:32:28,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4648.07 | bwd_inner_microstep: 4643.62 | bwd_allreduce_microstep: 4.38 | step_microstep: 40.16
-[2025-01-25 09:32:28,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.19 | bwd: 4648.09 | bwd_inner: 4643.62 | bwd_allreduce: 4.41 | step: 40.16
-  5%|▍         | 278/5800 [45:58<10:38:10,  6.93s/it]                                                     {'loss': 0.0376, 'grad_norm': 6.908035755157471, 'learning_rate': 3.996628331210421e-05, 'epoch': 2.4}
-  5%|▍         | 278/5800 [45:58<10:38:10,  6.93s/it]score1 tensor([[0.5000],
-        [0.5234],
-        [0.4629],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4844, 0.4492, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:35,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.85 | optimizer_step: 4.36
-[2025-01-25 09:32:35,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.96 | bwd_microstep: 4644.05 | bwd_inner_microstep: 4638.34 | bwd_allreduce_microstep: 5.59 | step_microstep: 68.26
-[2025-01-25 09:32:35,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4644.10 | bwd_inner: 4638.34 | bwd_allreduce: 5.65 | step: 68.26
-  5%|▍         | 279/5800 [46:05<10:39:43,  6.95s/it]                                                     {'loss': 0.0264, 'grad_norm': 13.828591346740723, 'learning_rate': 3.9965631983620304e-05, 'epoch': 2.41}
-  5%|▍         | 279/5800 [46:05<10:39:43,  6.95s/it]score1 tensor([[0.5156],
-        [0.6836],
-        [0.4922],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6641, 0.4082, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:42,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.10 | optimizer_step: 4.36
-[2025-01-25 09:32:42,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.07 | bwd_microstep: 4640.39 | bwd_inner_microstep: 4634.09 | bwd_allreduce_microstep: 6.21 | step_microstep: 68.61
-[2025-01-25 09:32:42,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.03 | bwd: 4640.42 | bwd_inner: 4634.09 | bwd_allreduce: 6.26 | step: 68.62
-  5%|▍         | 280/5800 [46:12<10:41:51,  6.98s/it]                                                     {'loss': 0.0581, 'grad_norm': 7.237619876861572, 'learning_rate': 3.996497442950592e-05, 'epoch': 2.41}
-  5%|▍         | 280/5800 [46:12<10:41:51,  6.98s/it]score1 tensor([[0.4629],
-        [0.4688],
-        [0.5195],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3945, 0.5352, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:49,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:32:49,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.22 | bwd_microstep: 4637.47 | bwd_inner_microstep: 4632.91 | bwd_allreduce_microstep: 4.48 | step_microstep: 48.33
-[2025-01-25 09:32:49,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.17 | bwd: 4637.49 | bwd_inner: 4632.91 | bwd_allreduce: 4.52 | step: 48.36
-  5%|▍         | 281/5800 [46:19<10:41:36,  6.98s/it]                                                     {'loss': 0.0557, 'grad_norm': 7.001344203948975, 'learning_rate': 3.9964310649966084e-05, 'epoch': 2.42}
-  5%|▍         | 281/5800 [46:19<10:41:36,  6.98s/it]score1 tensor([[0.5781],
-        [0.5039],
-        [0.5000],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5625, 0.3105, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:32:56,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.22 | optimizer_step: 4.36
-[2025-01-25 09:32:56,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.50 | bwd_microstep: 4640.69 | bwd_inner_microstep: 4636.99 | bwd_allreduce_microstep: 3.64 | step_microstep: 30.77
-[2025-01-25 09:32:56,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.47 | bwd: 4640.71 | bwd_inner: 4636.99 | bwd_allreduce: 3.67 | step: 30.77
-  5%|▍         | 282/5800 [46:26<10:39:40,  6.96s/it]                                                     {'loss': 0.0776, 'grad_norm': 1.9384227991104126, 'learning_rate': 3.996364064520777e-05, 'epoch': 2.43}
-  5%|▍         | 282/5800 [46:26<10:39:40,  6.96s/it]score1 tensor([[0.4980],
-        [0.5820],
-        [0.5195],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5391, 0.5391, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:02,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 09:33:02,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.00 | bwd_microstep: 4637.79 | bwd_inner_microstep: 4632.23 | bwd_allreduce_microstep: 5.45 | step_microstep: 36.75
-[2025-01-25 09:33:02,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.99 | bwd: 4637.83 | bwd_inner: 4632.23 | bwd_allreduce: 5.50 | step: 36.75
-  5%|▍         | 283/5800 [46:33<10:38:48,  6.95s/it]                                                     {'loss': 0.0396, 'grad_norm': 0.9459404945373535, 'learning_rate': 3.996296441543992e-05, 'epoch': 2.44}
-  5%|▍         | 283/5800 [46:33<10:38:48,  6.95s/it]score1 tensor([[0.5273],
-        [0.5508],
-        [0.5391],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5703, 0.4355, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:09,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.34 | optimizer_step: 4.36
-[2025-01-25 09:33:09,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.57 | bwd_microstep: 4636.38 | bwd_inner_microstep: 4632.47 | bwd_allreduce_microstep: 3.85 | step_microstep: 35.51
-[2025-01-25 09:33:09,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.53 | bwd: 4636.40 | bwd_inner: 4632.47 | bwd_allreduce: 3.87 | step: 35.51
-  5%|▍         | 284/5800 [46:39<10:37:41,  6.94s/it]                                                     {'loss': 0.0469, 'grad_norm': 0.738709568977356, 'learning_rate': 3.9962281960873366e-05, 'epoch': 2.45}
-  5%|▍         | 284/5800 [46:39<10:37:41,  6.94s/it]score1 tensor([[0.5391],
-        [0.5156],
-        [0.5117],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4395, 0.4043, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:16,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 09:33:16,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.06 | bwd_microstep: 4637.26 | bwd_inner_microstep: 4632.75 | bwd_allreduce_microstep: 4.43 | step_microstep: 41.57
-[2025-01-25 09:33:16,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.05 | bwd: 4637.28 | bwd_inner: 4632.75 | bwd_allreduce: 4.47 | step: 41.58
-  5%|▍         | 285/5800 [46:46<10:36:51,  6.93s/it]                                                     {'loss': 0.1152, 'grad_norm': 13.772743225097656, 'learning_rate': 3.996159328172092e-05, 'epoch': 2.46}
-  5%|▍         | 285/5800 [46:46<10:36:51,  6.93s/it]score1 tensor([[0.4863],
-        [0.5117],
-        [0.4727],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4883, 0.3652, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:23,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:33:23,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.79 | bwd_microstep: 4636.60 | bwd_inner_microstep: 4632.14 | bwd_allreduce_microstep: 4.39 | step_microstep: 39.46
-[2025-01-25 09:33:23,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.71 | bwd: 4636.63 | bwd_inner: 4632.14 | bwd_allreduce: 4.42 | step: 39.47
-  5%|▍         | 286/5800 [46:53<10:36:18,  6.92s/it]                                                     {'loss': 0.0483, 'grad_norm': 6.960116863250732, 'learning_rate': 3.9960898378197324e-05, 'epoch': 2.47}
-  5%|▍         | 286/5800 [46:53<10:36:18,  6.92s/it]score1 tensor([[0.5312],
-        [0.4121],
-        [0.5039],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4512, 0.6055, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:30,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:33:30,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.42 | bwd_microstep: 4637.55 | bwd_inner_microstep: 4632.93 | bwd_allreduce_microstep: 4.54 | step_microstep: 40.05
-[2025-01-25 09:33:30,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.39 | bwd: 4637.57 | bwd_inner: 4632.93 | bwd_allreduce: 4.58 | step: 40.06
-  5%|▍         | 287/5800 [47:00<10:35:56,  6.92s/it]                                                     {'loss': 0.0654, 'grad_norm': 6.846502780914307, 'learning_rate': 3.9960197250519267e-05, 'epoch': 2.47}
-  5%|▍         | 287/5800 [47:00<10:35:56,  6.92s/it]score1 tensor([[0.5117],
-        [0.5430],
-        [0.4199],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.6172, 0.6094, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:37,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 09:33:37,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.49 | bwd_microstep: 4647.61 | bwd_inner_microstep: 4643.07 | bwd_allreduce_microstep: 4.47 | step_microstep: 43.90
-[2025-01-25 09:33:37,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.46 | bwd: 4647.63 | bwd_inner: 4643.06 | bwd_allreduce: 4.51 | step: 43.91
-  5%|▍         | 288/5800 [47:07<10:35:55,  6.92s/it]                                                     {'loss': 0.1289, 'grad_norm': 13.590271949768066, 'learning_rate': 3.9959489898905366e-05, 'epoch': 2.48}
-  5%|▍         | 288/5800 [47:07<10:35:55,  6.92s/it]score1 tensor([[0.4512],
-        [0.5703],
-        [0.4316],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.5469, 0.4844, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:44,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 09:33:44,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4646.74 | bwd_inner_microstep: 4641.35 | bwd_allreduce_microstep: 5.30 | step_microstep: 51.90
-[2025-01-25 09:33:44,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.97 | bwd: 4646.76 | bwd_inner: 4641.35 | bwd_allreduce: 5.34 | step: 51.90
-  5%|▍         | 289/5800 [47:14<10:36:48,  6.93s/it]                                                     {'loss': 0.0269, 'grad_norm': 7.1366777420043945, 'learning_rate': 3.995877632357619e-05, 'epoch': 2.49}
-  5%|▍         | 289/5800 [47:14<10:36:48,  6.93s/it]score1 tensor([[0.4043],
-        [0.4023],
-        [0.4121],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.4434, 0.3398, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:51,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.37 | optimizer_step: 4.36
-[2025-01-25 09:33:51,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.52 | bwd_microstep: 4647.77 | bwd_inner_microstep: 4642.52 | bwd_allreduce_microstep: 5.09 | step_microstep: 47.30
-[2025-01-25 09:33:51,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.50 | bwd: 4647.80 | bwd_inner: 4642.52 | bwd_allreduce: 5.19 | step: 47.31
-  5%|▌         | 290/5800 [47:21<10:37:21,  6.94s/it]                                                     {'loss': 0.0376, 'grad_norm': 0.6000106930732727, 'learning_rate': 3.995805652475424e-05, 'epoch': 2.5}
-  5%|▌         | 290/5800 [47:21<10:37:21,  6.94s/it]score1 tensor([[0.4434],
-        [0.4453],
-        [0.3770],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5703, 0.4414, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:33:58,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.36
-[2025-01-25 09:33:58,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.05 | bwd_microstep: 4640.25 | bwd_inner_microstep: 4634.71 | bwd_allreduce_microstep: 5.42 | step_microstep: 51.54
-[2025-01-25 09:33:58,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.02 | bwd: 4640.28 | bwd_inner: 4634.71 | bwd_allreduce: 5.48 | step: 51.55
-  5%|▌         | 291/5800 [47:28<10:37:31,  6.94s/it]                                                     {'loss': 0.0518, 'grad_norm': 13.34670639038086, 'learning_rate': 3.995733050266396e-05, 'epoch': 2.51}
-  5%|▌         | 291/5800 [47:28<10:37:31,  6.94s/it]score1 tensor([[0.5547],
-        [0.4141],
-        [0.4277],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.4297, 0.5508, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0933, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:05,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 09:34:05,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4644.50 | bwd_inner_microstep: 4639.43 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.78
-[2025-01-25 09:34:05,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4644.53 | bwd_inner: 4639.43 | bwd_allreduce: 5.01 | step: 46.78
-  5%|▌         | 292/5800 [47:35<10:37:49,  6.95s/it]                                                     {'loss': 0.0933, 'grad_norm': 13.309504508972168, 'learning_rate': 3.995659825753174e-05, 'epoch': 2.52}
-  5%|▌         | 292/5800 [47:35<10:37:49,  6.95s/it]score1 tensor([[0.5039],
-        [0.4727],
-        [0.5195],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4141, 0.6172, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:12,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 09:34:12,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.74 | bwd_microstep: 4643.46 | bwd_inner_microstep: 4638.76 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.01
-[2025-01-25 09:34:12,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.71 | bwd: 4643.48 | bwd_inner: 4638.76 | bwd_allreduce: 4.66 | step: 42.02
-  5%|▌         | 293/5800 [47:42<10:37:12,  6.94s/it]                                                     {'loss': 0.0527, 'grad_norm': 0.6819134950637817, 'learning_rate': 3.995585978958591e-05, 'epoch': 2.53}
-  5%|▌         | 293/5800 [47:42<10:37:12,  6.94s/it]score1 tensor([[0.5039],
-        [0.6055],
-        [0.6133],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.6094, 0.5625, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:19,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 09:34:19,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.57 | bwd_microstep: 4641.73 | bwd_inner_microstep: 4636.76 | bwd_allreduce_microstep: 4.87 | step_microstep: 42.18
-[2025-01-25 09:34:19,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4641.75 | bwd_inner: 4636.76 | bwd_allreduce: 4.92 | step: 42.18
-  5%|▌         | 294/5800 [47:49<10:36:34,  6.94s/it]                                                     {'loss': 0.0566, 'grad_norm': 6.85309362411499, 'learning_rate': 3.995511509905673e-05, 'epoch': 2.53}
-  5%|▌         | 294/5800 [47:49<10:36:34,  6.94s/it]score1 tensor([[0.5312],
-        [0.4590],
-        [0.5352],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.3691, 0.5664, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:26,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.95 | optimizer_step: 5.65
-[2025-01-25 09:34:26,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.67 | bwd_microstep: 4647.73 | bwd_inner_microstep: 4641.31 | bwd_allreduce_microstep: 6.30 | step_microstep: 58.29
-[2025-01-25 09:34:26,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.64 | bwd: 4647.75 | bwd_inner: 4641.31 | bwd_allreduce: 6.37 | step: 58.31
-  5%|▌         | 295/5800 [47:56<10:37:12,  6.95s/it]                                                     {'loss': 0.0742, 'grad_norm': 6.793201446533203, 'learning_rate': 3.995436418617641e-05, 'epoch': 2.54}
-  5%|▌         | 295/5800 [47:56<10:37:12,  6.95s/it]score1 tensor([[0.5547],
-        [0.5625],
-        [0.4961],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6211, 0.4043, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:33,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 09:34:33,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.12 | bwd_microstep: 4643.37 | bwd_inner_microstep: 4636.88 | bwd_allreduce_microstep: 6.35 | step_microstep: 51.99
-[2025-01-25 09:34:33,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.00 | bwd: 4643.40 | bwd_inner: 4636.88 | bwd_allreduce: 6.42 | step: 51.99
-  5%|▌         | 296/5800 [48:03<10:37:30,  6.95s/it]                                                     {'loss': 0.0454, 'grad_norm': 7.341552734375, 'learning_rate': 3.99536070511791e-05, 'epoch': 2.55}
-  5%|▌         | 296/5800 [48:03<10:37:30,  6.95s/it]score1 tensor([[0.4531],
-        [0.5156],
-        [0.4492],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6445, 0.3672, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:40,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 09:34:40,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.54 | bwd_microstep: 4647.21 | bwd_inner_microstep: 4641.63 | bwd_allreduce_microstep: 5.48 | step_microstep: 51.28
-[2025-01-25 09:34:40,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.51 | bwd: 4647.24 | bwd_inner: 4641.63 | bwd_allreduce: 5.54 | step: 51.26
-  5%|▌         | 297/5800 [48:10<10:37:29,  6.95s/it]                                                     {'loss': 0.0781, 'grad_norm': 6.936427593231201, 'learning_rate': 3.9952843694300885e-05, 'epoch': 2.56}
-  5%|▌         | 297/5800 [48:10<10:37:29,  6.95s/it]score1 tensor([[0.5547],
-        [0.5039],
-        [0.3750],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.5156, 0.3516, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:47,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.49
-[2025-01-25 09:34:47,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.83 | bwd_microstep: 4644.25 | bwd_inner_microstep: 4637.86 | bwd_allreduce_microstep: 6.22 | step_microstep: 54.92
-[2025-01-25 09:34:47,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.80 | bwd: 4644.28 | bwd_inner: 4637.86 | bwd_allreduce: 6.32 | step: 54.92
-  5%|▌         | 298/5800 [48:17<10:37:18,  6.95s/it]                                                     {'loss': 0.0605, 'grad_norm': 7.120260715484619, 'learning_rate': 3.99520741157798e-05, 'epoch': 2.57}
-  5%|▌         | 298/5800 [48:17<10:37:18,  6.95s/it]score1 tensor([[0.4062],
-        [0.4883],
-        [0.5156],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3340, 0.4805, 0.5508, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:34:54,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.84 | optimizer_step: 5.02
-[2025-01-25 09:34:54,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2174.85 | bwd_microstep: 4658.04 | bwd_inner_microstep: 4651.37 | bwd_allreduce_microstep: 6.57 | step_microstep: 95.05
-[2025-01-25 09:34:54,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.89 | bwd: 4658.07 | bwd_inner: 4651.37 | bwd_allreduce: 6.62 | step: 95.11
-  5%|▌         | 299/5800 [48:24<10:39:14,  6.97s/it]                                                     {'loss': 0.042, 'grad_norm': 6.600547790527344, 'learning_rate': 3.99512983158558e-05, 'epoch': 2.58}
-  5%|▌         | 299/5800 [48:24<10:39:14,  6.97s/it]score1 tensor([[0.5273],
-        [0.5156],
-        [0.4668],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.4375, 0.5781, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1050, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:01,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 09:35:01,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.96 | bwd_microstep: 4644.66 | bwd_inner_microstep: 4639.81 | bwd_allreduce_microstep: 4.71 | step_microstep: 45.24
-[2025-01-25 09:35:01,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.89 | bwd: 4644.68 | bwd_inner: 4639.81 | bwd_allreduce: 4.78 | step: 45.23
-  5%|▌         | 300/5800 [48:31<10:38:48,  6.97s/it]                                                     {'loss': 0.105, 'grad_norm': 6.888860702514648, 'learning_rate': 3.99505162947708e-05, 'epoch': 2.59}
-  5%|▌         | 300/5800 [48:31<10:38:48,  6.97s/it]score1 tensor([[0.4531],
-        [0.4199],
-        [0.4629],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.3867, 0.4727, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:08,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:35:08,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.00 | bwd_microstep: 4646.35 | bwd_inner_microstep: 4641.10 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.95
-[2025-01-25 09:35:08,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.95 | bwd: 4646.38 | bwd_inner: 4641.10 | bwd_allreduce: 5.21 | step: 43.95
-  5%|▌         | 301/5800 [48:38<10:37:57,  6.96s/it]                                                     {'loss': 0.0215, 'grad_norm': 6.813492774963379, 'learning_rate': 3.9949728052768646e-05, 'epoch': 2.59}
-  5%|▌         | 301/5800 [48:38<10:37:57,  6.96s/it]score1 tensor([[0.5312],
-        [0.6836],
-        [0.5703],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.6445, 0.3457, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1123, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:14,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 09:35:14,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.84 | bwd_microstep: 4634.69 | bwd_inner_microstep: 4629.95 | bwd_allreduce_microstep: 4.60 | step_microstep: 45.90
-[2025-01-25 09:35:14,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.81 | bwd: 4634.71 | bwd_inner: 4629.95 | bwd_allreduce: 4.68 | step: 45.91
-  5%|▌         | 302/5800 [48:44<10:36:59,  6.95s/it]                                                     {'loss': 0.1123, 'grad_norm': 14.096909523010254, 'learning_rate': 3.9948933590095135e-05, 'epoch': 2.6}
-  5%|▌         | 302/5800 [48:44<10:36:59,  6.95s/it]score1 tensor([[0.5234],
-        [0.5430],
-        [0.7266],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4297, 0.6484, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:21,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 09:35:21,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.02 | bwd_microstep: 4638.38 | bwd_inner_microstep: 4632.99 | bwd_allreduce_microstep: 5.28 | step_microstep: 52.24
-[2025-01-25 09:35:21,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.98 | bwd: 4638.41 | bwd_inner: 4632.99 | bwd_allreduce: 5.36 | step: 52.25
-  5%|▌         | 303/5800 [48:51<10:36:20,  6.95s/it]                                                     {'loss': 0.0596, 'grad_norm': 7.220113277435303, 'learning_rate': 3.9948132906997986e-05, 'epoch': 2.61}
-  5%|▌         | 303/5800 [48:51<10:36:20,  6.95s/it]score1 tensor([[0.4941],
-        [0.5039],
-        [0.5977],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.4297, 0.5664, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0825, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:28,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 09:35:28,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.55 | bwd_microstep: 4638.69 | bwd_inner_microstep: 4634.07 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.46
-[2025-01-25 09:35:28,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.51 | bwd: 4638.72 | bwd_inner: 4634.06 | bwd_allreduce: 4.58 | step: 41.47
-  5%|▌         | 304/5800 [48:58<10:35:36,  6.94s/it]                                                     {'loss': 0.0825, 'grad_norm': 7.00277042388916, 'learning_rate': 3.994732600372686e-05, 'epoch': 2.62}
-  5%|▌         | 304/5800 [48:58<10:35:36,  6.94s/it]score1 tensor([[0.5430],
-        [0.4395],
-        [0.5039],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3438, 0.5547, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:35,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 09:35:35,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.62 | bwd_microstep: 4638.33 | bwd_inner_microstep: 4633.07 | bwd_allreduce_microstep: 5.17 | step_microstep: 46.05
-[2025-01-25 09:35:35,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.58 | bwd: 4638.35 | bwd_inner: 4633.07 | bwd_allreduce: 5.21 | step: 46.05
-  5%|▌         | 305/5800 [49:05<10:35:02,  6.93s/it]                                                     {'loss': 0.0518, 'grad_norm': 6.87230110168457, 'learning_rate': 3.994651288053337e-05, 'epoch': 2.63}
-  5%|▌         | 305/5800 [49:05<10:35:02,  6.93s/it]score1 tensor([[0.5469],
-        [0.4688],
-        [0.3418],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.6602, 0.4160, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1079, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:42,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 09:35:42,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.10 | bwd_microstep: 4634.37 | bwd_inner_microstep: 4629.08 | bwd_allreduce_microstep: 5.19 | step_microstep: 45.22
-[2025-01-25 09:35:42,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.07 | bwd: 4634.40 | bwd_inner: 4629.08 | bwd_allreduce: 5.25 | step: 45.14
-  5%|▌         | 306/5800 [49:12<10:34:35,  6.93s/it]                                                     {'loss': 0.1079, 'grad_norm': 6.805199146270752, 'learning_rate': 3.994569353767107e-05, 'epoch': 2.64}
-  5%|▌         | 306/5800 [49:12<10:34:35,  6.93s/it]score1 tensor([[0.3945],
-        [0.4004],
-        [0.3555],
-        [0.3145]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4844, 0.4570, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1084, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:49,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.38 | optimizer_step: 4.37
-[2025-01-25 09:35:49,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.10 | bwd_microstep: 4634.37 | bwd_inner_microstep: 4627.97 | bwd_allreduce_microstep: 6.31 | step_microstep: 61.52
-[2025-01-25 09:35:49,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.07 | bwd: 4634.39 | bwd_inner: 4627.97 | bwd_allreduce: 6.35 | step: 61.54
-  5%|▌         | 307/5800 [49:19<10:34:43,  6.93s/it]                                                     {'loss': 0.1084, 'grad_norm': 13.288627624511719, 'learning_rate': 3.994486797539543e-05, 'epoch': 2.65}
-  5%|▌         | 307/5800 [49:19<10:34:43,  6.93s/it]score1 tensor([[0.3164],
-        [0.3066],
-        [0.4121],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4551, 0.5352, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1318, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:35:56,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.80 | optimizer_step: 5.35
-[2025-01-25 09:35:56,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.61 | bwd_microstep: 4639.74 | bwd_inner_microstep: 4631.36 | bwd_allreduce_microstep: 8.06 | step_microstep: 87.03
-[2025-01-25 09:35:56,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.58 | bwd: 4639.82 | bwd_inner: 4631.36 | bwd_allreduce: 8.25 | step: 87.05
-  5%|▌         | 308/5800 [49:26<10:36:11,  6.95s/it]                                                     {'loss': 0.1318, 'grad_norm': 13.360769271850586, 'learning_rate': 3.9944036193963885e-05, 'epoch': 2.66}
-  5%|▌         | 308/5800 [49:26<10:36:11,  6.95s/it]score1 tensor([[0.4395],
-        [0.4941],
-        [0.3203],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5312, 0.3809, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:03,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.91 | optimizer_step: 4.36
-[2025-01-25 09:36:03,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.48 | bwd_microstep: 4636.29 | bwd_inner_microstep: 4627.56 | bwd_allreduce_microstep: 8.47 | step_microstep: 107.64
-[2025-01-25 09:36:03,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.44 | bwd: 4636.34 | bwd_inner: 4627.56 | bwd_allreduce: 8.64 | step: 107.67
-  5%|▌         | 309/5800 [49:33<10:37:58,  6.97s/it]                                                     {'loss': 0.0532, 'grad_norm': 13.463871955871582, 'learning_rate': 3.994319819363579e-05, 'epoch': 2.66}
-  5%|▌         | 309/5800 [49:33<10:37:58,  6.97s/it]score1 tensor([[0.5430],
-        [0.5156],
-        [0.5078],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5078, 0.4648, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:10,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 09:36:10,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2175.31 | bwd_microstep: 4635.94 | bwd_inner_microstep: 4630.77 | bwd_allreduce_microstep: 5.05 | step_microstep: 46.72
-[2025-01-25 09:36:10,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2175.24 | bwd: 4635.97 | bwd_inner: 4630.77 | bwd_allreduce: 5.11 | step: 46.72
-  5%|▌         | 310/5800 [49:40<10:37:37,  6.97s/it]                                                     {'loss': 0.0308, 'grad_norm': 13.849044799804688, 'learning_rate': 3.994235397467246e-05, 'epoch': 2.67}
-  5%|▌         | 310/5800 [49:40<10:37:37,  6.97s/it]score1 tensor([[0.4570],
-        [0.3633],
-        [0.5078],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3711, 0.4512, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:17,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 09:36:17,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.68 | bwd_microstep: 4637.06 | bwd_inner_microstep: 4632.16 | bwd_allreduce_microstep: 4.78 | step_microstep: 51.61
-[2025-01-25 09:36:17,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.65 | bwd: 4637.09 | bwd_inner: 4632.16 | bwd_allreduce: 4.85 | step: 51.62
-  5%|▌         | 311/5800 [49:47<10:36:43,  6.96s/it]                                                     {'loss': 0.0386, 'grad_norm': 6.9920973777771, 'learning_rate': 3.9941503537337134e-05, 'epoch': 2.68}
-  5%|▌         | 311/5800 [49:47<10:36:43,  6.96s/it]score1 tensor([[0.6250],
-        [0.4316],
-        [0.6094],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4512, 0.4863, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0864, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:24,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.20 | optimizer_step: 4.36
-[2025-01-25 09:36:24,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.55 | bwd_microstep: 4635.35 | bwd_inner_microstep: 4626.44 | bwd_allreduce_microstep: 8.60 | step_microstep: 55.16
-[2025-01-25 09:36:24,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.52 | bwd: 4635.38 | bwd_inner: 4626.44 | bwd_allreduce: 8.82 | step: 55.16
-  5%|▌         | 312/5800 [49:54<10:36:31,  6.96s/it]                                                     {'loss': 0.0864, 'grad_norm': 7.462128639221191, 'learning_rate': 3.994064688189498e-05, 'epoch': 2.69}
-  5%|▌         | 312/5800 [49:54<10:36:31,  6.96s/it]score1 tensor([[0.6445],
-        [0.5664],
-        [0.5234],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.4336, 0.4688, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:31,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.65 | optimizer_step: 4.41
-[2025-01-25 09:36:31,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.39 | bwd_microstep: 4637.79 | bwd_inner_microstep: 4630.09 | bwd_allreduce_microstep: 7.48 | step_microstep: 72.01
-[2025-01-25 09:36:31,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.36 | bwd: 4637.85 | bwd_inner: 4630.09 | bwd_allreduce: 7.60 | step: 72.03
-  5%|▌         | 313/5800 [50:01<10:36:25,  6.96s/it]                                                     {'loss': 0.084, 'grad_norm': 14.016160011291504, 'learning_rate': 3.9939784008613135e-05, 'epoch': 2.7}
-  5%|▌         | 313/5800 [50:01<10:36:25,  6.96s/it]score1 tensor([[0.5156],
-        [0.5391],
-        [0.5742],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5469, 0.4492, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0640, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:38,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.63 | optimizer_step: 4.37
-[2025-01-25 09:36:38,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.27 | bwd_microstep: 4648.86 | bwd_inner_microstep: 4640.13 | bwd_allreduce_microstep: 8.42 | step_microstep: 73.59
-[2025-01-25 09:36:38,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.24 | bwd: 4648.92 | bwd_inner: 4640.13 | bwd_allreduce: 8.59 | step: 73.57
-  5%|▌         | 314/5800 [50:08<10:36:59,  6.97s/it]                                                     {'loss': 0.064, 'grad_norm': 7.030267238616943, 'learning_rate': 3.993891491776065e-05, 'epoch': 2.71}
-  5%|▌         | 314/5800 [50:08<10:36:59,  6.97s/it]score1 tensor([[0.5117],
-        [0.5625],
-        [0.5820],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.4609, 0.5664, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:45,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 09:36:45,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.47 | bwd_microstep: 4641.25 | bwd_inner_microstep: 4635.99 | bwd_allreduce_microstep: 5.17 | step_microstep: 47.69
-[2025-01-25 09:36:45,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4641.28 | bwd_inner: 4635.99 | bwd_allreduce: 5.21 | step: 47.70
-  5%|▌         | 315/5800 [50:15<10:36:01,  6.96s/it]                                                     {'loss': 0.0488, 'grad_norm': 13.680968284606934, 'learning_rate': 3.993803960960852e-05, 'epoch': 2.72}
-  5%|▌         | 315/5800 [50:15<10:36:01,  6.96s/it]score1 tensor([[0.4121],
-        [0.5273],
-        [0.4375],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.5352, 0.4980, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:52,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 09:36:52,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.91 | bwd_microstep: 4645.09 | bwd_inner_microstep: 4640.04 | bwd_allreduce_microstep: 4.96 | step_microstep: 50.18
-[2025-01-25 09:36:52,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.87 | bwd: 4645.11 | bwd_inner: 4640.04 | bwd_allreduce: 5.01 | step: 50.19
-  5%|▌         | 316/5800 [50:22<10:35:22,  6.95s/it]                                                     {'loss': 0.0288, 'grad_norm': 13.457956314086914, 'learning_rate': 3.993715808442968e-05, 'epoch': 2.72}
-  5%|▌         | 316/5800 [50:22<10:35:22,  6.95s/it]score1 tensor([[0.3496],
-        [0.4922],
-        [0.3320],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6016, 0.5117, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:36:59,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 09:36:59,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4636.38 | bwd_inner_microstep: 4630.64 | bwd_allreduce_microstep: 5.62 | step_microstep: 46.02
-[2025-01-25 09:36:59,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.49 | bwd: 4636.42 | bwd_inner: 4630.64 | bwd_allreduce: 5.68 | step: 46.02
-  5%|▌         | 317/5800 [50:29<10:34:49,  6.95s/it]                                                     {'loss': 0.1465, 'grad_norm': 13.122235298156738, 'learning_rate': 3.9936270342499016e-05, 'epoch': 2.73}
-  5%|▌         | 317/5800 [50:29<10:34:49,  6.95s/it]score1 tensor([[0.4512],
-        [0.3203],
-        [0.2988],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.2812, 0.4551, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1011, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:06,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 09:37:06,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.96 | bwd_microstep: 4638.30 | bwd_inner_microstep: 4632.90 | bwd_allreduce_microstep: 5.31 | step_microstep: 50.48
-[2025-01-25 09:37:06,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.93 | bwd: 4638.32 | bwd_inner: 4632.90 | bwd_allreduce: 5.35 | step: 50.50
-  5%|▌         | 318/5800 [50:36<10:34:23,  6.94s/it]                                                     {'loss': 0.1011, 'grad_norm': 6.745631694793701, 'learning_rate': 3.9935376384093336e-05, 'epoch': 2.74}
-  5%|▌         | 318/5800 [50:36<10:34:23,  6.94s/it]score1 tensor([[0.4121],
-        [0.4199],
-        [0.3906],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4863, 0.4395, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:13,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 09:37:13,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.01 | bwd_microstep: 4629.79 | bwd_inner_microstep: 4624.19 | bwd_allreduce_microstep: 5.50 | step_microstep: 47.55
-[2025-01-25 09:37:13,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.97 | bwd: 4629.82 | bwd_inner: 4624.19 | bwd_allreduce: 5.55 | step: 47.56
-  6%|▌         | 319/5800 [50:43<10:33:38,  6.94s/it]                                                     {'loss': 0.0605, 'grad_norm': 13.377069473266602, 'learning_rate': 3.993447620949139e-05, 'epoch': 2.75}
-  6%|▌         | 319/5800 [50:43<10:33:38,  6.94s/it]score1 tensor([[0.6484],
-        [0.4727],
-        [0.5039],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5078, 0.4473, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:19,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.31 | optimizer_step: 4.37
-[2025-01-25 09:37:19,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.53 | bwd_microstep: 4633.12 | bwd_inner_microstep: 4627.62 | bwd_allreduce_microstep: 5.40 | step_microstep: 44.30
-[2025-01-25 09:37:19,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.50 | bwd: 4633.15 | bwd_inner: 4627.62 | bwd_allreduce: 5.45 | step: 44.30
-  6%|▌         | 320/5800 [50:49<10:32:53,  6.93s/it]                                                     {'loss': 0.0513, 'grad_norm': 7.046546459197998, 'learning_rate': 3.993356981897387e-05, 'epoch': 2.76}
-  6%|▌         | 320/5800 [50:49<10:32:53,  6.93s/it]score1 tensor([[0.6445],
-        [0.5547],
-        [0.5898],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5664, 0.6562, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:26,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 09:37:26,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4629.87 | bwd_inner_microstep: 4624.80 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.70
-[2025-01-25 09:37:26,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4629.90 | bwd_inner: 4624.80 | bwd_allreduce: 5.02 | step: 44.70
-  6%|▌         | 321/5800 [50:56<10:32:17,  6.92s/it]                                                     {'loss': 0.0293, 'grad_norm': 6.859526634216309, 'learning_rate': 3.99326572128234e-05, 'epoch': 2.77}
-  6%|▌         | 321/5800 [50:56<10:32:17,  6.92s/it]score1 tensor([[0.5430],
-        [0.5312],
-        [0.5430],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.5156, 0.4746, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:33,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 09:37:33,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.65 | bwd_microstep: 4643.84 | bwd_inner_microstep: 4637.25 | bwd_allreduce_microstep: 6.49 | step_microstep: 46.23
-[2025-01-25 09:37:33,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.62 | bwd: 4643.86 | bwd_inner: 4637.25 | bwd_allreduce: 6.55 | step: 46.27
-  6%|▌         | 322/5800 [51:03<10:33:03,  6.93s/it]                                                     {'loss': 0.0571, 'grad_norm': 6.99367094039917, 'learning_rate': 3.993173839132455e-05, 'epoch': 2.78}
-  6%|▌         | 322/5800 [51:03<10:33:03,  6.93s/it]score1 tensor([[0.5664],
-        [0.6016],
-        [0.6094],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4980, 0.4258, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1201, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:40,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:37:40,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.96 | bwd_microstep: 4647.81 | bwd_inner_microstep: 4642.30 | bwd_allreduce_microstep: 5.40 | step_microstep: 48.96
-[2025-01-25 09:37:40,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.93 | bwd: 4647.84 | bwd_inner: 4642.30 | bwd_allreduce: 5.45 | step: 49.05
-  6%|▌         | 323/5800 [51:10<10:33:37,  6.94s/it]                                                     {'loss': 0.1201, 'grad_norm': 13.822351455688477, 'learning_rate': 3.993081335476382e-05, 'epoch': 2.78}
-  6%|▌         | 323/5800 [51:10<10:33:37,  6.94s/it]score1 tensor([[0.6602],
-        [0.6523],
-        [0.6602],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.5781, 0.5742, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:47,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.37
-[2025-01-25 09:37:47,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2176.71 | bwd_microstep: 4632.56 | bwd_inner_microstep: 4626.62 | bwd_allreduce_microstep: 5.85 | step_microstep: 43.96
-[2025-01-25 09:37:47,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2176.68 | bwd: 4632.59 | bwd_inner: 4626.62 | bwd_allreduce: 5.90 | step: 43.97
-  6%|▌         | 324/5800 [51:17<10:34:18,  6.95s/it]                                                     {'loss': 0.0527, 'grad_norm': 7.159776210784912, 'learning_rate': 3.992988210342966e-05, 'epoch': 2.79}
-  6%|▌         | 324/5800 [51:17<10:34:18,  6.95s/it]score1 tensor([[0.4922],
-        [0.5859],
-        [0.5430],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5742, 0.5430, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:37:54,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 09:37:54,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.64 | bwd_microstep: 4596.33 | bwd_inner_microstep: 4591.14 | bwd_allreduce_microstep: 5.10 | step_microstep: 49.04
-[2025-01-25 09:37:54,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.60 | bwd: 4596.35 | bwd_inner: 4591.14 | bwd_allreduce: 5.14 | step: 49.01
-  6%|▌         | 325/5800 [51:24<10:32:30,  6.93s/it]                                                     {'loss': 0.0356, 'grad_norm': 3.2499423027038574, 'learning_rate': 3.992894463761245e-05, 'epoch': 2.8}
-  6%|▌         | 325/5800 [51:24<10:32:30,  6.93s/it]score1 tensor([[0.5977],
-        [0.5195],
-        [0.5312],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4922, 0.4746, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:01,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.15 | optimizer_step: 4.62
-[2025-01-25 09:38:01,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.82 | bwd_microstep: 4642.56 | bwd_inner_microstep: 4633.09 | bwd_allreduce_microstep: 9.20 | step_microstep: 80.44
-[2025-01-25 09:38:01,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.77 | bwd: 4642.63 | bwd_inner: 4633.09 | bwd_allreduce: 9.35 | step: 80.44
-  6%|▌         | 326/5800 [51:31<10:34:01,  6.95s/it]                                                     {'loss': 0.0752, 'grad_norm': 13.496038436889648, 'learning_rate': 3.99280009576045e-05, 'epoch': 2.81}
-  6%|▌         | 326/5800 [51:31<10:34:01,  6.95s/it]score1 tensor([[0.4727],
-        [0.4922],
-        [0.4980],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4785, 0.5117, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:08,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 09:38:08,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.66 | bwd_microstep: 4636.21 | bwd_inner_microstep: 4630.96 | bwd_allreduce_microstep: 5.15 | step_microstep: 44.78
-[2025-01-25 09:38:08,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.60 | bwd: 4636.25 | bwd_inner: 4630.96 | bwd_allreduce: 5.20 | step: 44.78
-  6%|▌         | 327/5800 [51:38<10:33:22,  6.94s/it]                                                     {'loss': 0.04, 'grad_norm': 6.9390387535095215, 'learning_rate': 3.9927051063700075e-05, 'epoch': 2.82}
-  6%|▌         | 327/5800 [51:38<10:33:22,  6.94s/it]score1 tensor([[0.3984],
-        [0.3828],
-        [0.4004],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4922, 0.4004, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1064, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:15,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 09:38:15,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4594.55 | bwd_inner_microstep: 4588.81 | bwd_allreduce_microstep: 5.65 | step_microstep: 45.95
-[2025-01-25 09:38:15,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.72 | bwd: 4594.58 | bwd_inner: 4588.81 | bwd_allreduce: 5.70 | step: 45.91
-  6%|▌         | 328/5800 [51:45<10:31:36,  6.93s/it]                                                     {'loss': 0.1064, 'grad_norm': 9.789416313171387, 'learning_rate': 3.9926094956195356e-05, 'epoch': 2.83}
-  6%|▌         | 328/5800 [51:45<10:31:36,  6.93s/it]score1 tensor([[0.4199],
-        [0.3535],
-        [0.3926],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4609, 0.5508, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1494, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:22,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 09:38:22,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.49 | bwd_microstep: 4637.58 | bwd_inner_microstep: 4632.51 | bwd_allreduce_microstep: 4.96 | step_microstep: 49.25
-[2025-01-25 09:38:22,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.46 | bwd: 4637.61 | bwd_inner: 4632.51 | bwd_allreduce: 5.02 | step: 49.26
-  6%|▌         | 329/5800 [51:52<10:31:41,  6.93s/it]                                                     {'loss': 0.1494, 'grad_norm': 12.96607780456543, 'learning_rate': 3.9925132635388486e-05, 'epoch': 2.84}
-  6%|▌         | 329/5800 [51:52<10:31:41,  6.93s/it]score1 tensor([[0.4570],
-        [0.4590],
-        [0.3184],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6094, 0.4258, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1157, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:29,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.36 | optimizer_step: 4.36
-[2025-01-25 09:38:29,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.64 | bwd_microstep: 4648.67 | bwd_inner_microstep: 4643.63 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.97
-[2025-01-25 09:38:29,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.61 | bwd: 4648.69 | bwd_inner: 4643.63 | bwd_allreduce: 4.99 | step: 43.97
-  6%|▌         | 330/5800 [51:59<10:31:53,  6.93s/it]                                                     {'loss': 0.1157, 'grad_norm': 13.022541046142578, 'learning_rate': 3.992416410157953e-05, 'epoch': 2.84}
-  6%|▌         | 330/5800 [51:59<10:31:53,  6.93s/it]score1 tensor([[0.4961],
-        [0.4434],
-        [0.4219],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.6211, 0.6172, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1201, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:36,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 09:38:36,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.05 | bwd_microstep: 4634.99 | bwd_inner_microstep: 4630.14 | bwd_allreduce_microstep: 4.75 | step_microstep: 47.98
-[2025-01-25 09:38:36,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.00 | bwd: 4635.01 | bwd_inner: 4630.14 | bwd_allreduce: 4.81 | step: 47.99
-  6%|▌         | 331/5800 [52:06<10:32:05,  6.93s/it]                                                     {'loss': 0.1201, 'grad_norm': 13.044503211975098, 'learning_rate': 3.992318935507049e-05, 'epoch': 2.85}
-  6%|▌         | 331/5800 [52:06<10:32:05,  6.93s/it]score1 tensor([[0.4727],
-        [0.5781],
-        [0.4941],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6055, 0.5117, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:43,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.06 | optimizer_step: 4.36
-[2025-01-25 09:38:43,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.90 | bwd_microstep: 4637.95 | bwd_inner_microstep: 4632.85 | bwd_allreduce_microstep: 4.98 | step_microstep: 58.91
-[2025-01-25 09:38:43,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.87 | bwd: 4637.98 | bwd_inner: 4632.85 | bwd_allreduce: 5.04 | step: 58.92
-  6%|▌         | 332/5800 [52:13<10:32:17,  6.94s/it]                                                     {'loss': 0.0225, 'grad_norm': 6.866084575653076, 'learning_rate': 3.9922208396165316e-05, 'epoch': 2.86}
-  6%|▌         | 332/5800 [52:13<10:32:17,  6.94s/it]score1 tensor([[0.5820],
-        [0.6484],
-        [0.5586],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6562, 0.4961, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:50,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:38:50,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.06 | bwd_microstep: 4645.53 | bwd_inner_microstep: 4640.44 | bwd_allreduce_microstep: 4.96 | step_microstep: 60.07
-[2025-01-25 09:38:50,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.03 | bwd: 4645.57 | bwd_inner: 4640.44 | bwd_allreduce: 5.02 | step: 60.08
-  6%|▌         | 333/5800 [52:20<10:33:45,  6.96s/it]                                                     {'loss': 0.0659, 'grad_norm': 6.641707420349121, 'learning_rate': 3.992122122516988e-05, 'epoch': 2.87}
-  6%|▌         | 333/5800 [52:20<10:33:45,  6.96s/it]score1 tensor([[0.6250],
-        [0.6836],
-        [0.6328],
-        [0.7109]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5000, 0.4785, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1245, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:38:57,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 09:38:57,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.73 | bwd_microstep: 4637.49 | bwd_inner_microstep: 4631.35 | bwd_allreduce_microstep: 6.06 | step_microstep: 66.92
-[2025-01-25 09:38:57,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.70 | bwd: 4637.51 | bwd_inner: 4631.35 | bwd_allreduce: 6.09 | step: 66.93
-  6%|▌         | 334/5800 [52:27<10:34:04,  6.96s/it]                                                     {'loss': 0.1245, 'grad_norm': 13.763984680175781, 'learning_rate': 3.992022784239201e-05, 'epoch': 2.88}
-  6%|▌         | 334/5800 [52:27<10:34:04,  6.96s/it]score1 tensor([[0.6406],
-        [0.6602],
-        [0.6367],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5898, 0.5195, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1089, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:04,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 09:39:04,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.36 | bwd_microstep: 4636.07 | bwd_inner_microstep: 4630.37 | bwd_allreduce_microstep: 5.58 | step_microstep: 51.29
-[2025-01-25 09:39:04,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.33 | bwd: 4636.09 | bwd_inner: 4630.37 | bwd_allreduce: 5.64 | step: 51.30
-  6%|▌         | 335/5800 [52:34<10:33:19,  6.95s/it]                                                     {'loss': 0.1089, 'grad_norm': 13.462910652160645, 'learning_rate': 3.991922824814145e-05, 'epoch': 2.89}
-  6%|▌         | 335/5800 [52:34<10:33:19,  6.95s/it]score1 tensor([[0.6445],
-        [0.6445],
-        [0.6133],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5586, 0.5039, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:11,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 09:39:11,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.34 | bwd_microstep: 4640.70 | bwd_inner_microstep: 4635.47 | bwd_allreduce_microstep: 5.12 | step_microstep: 46.87
-[2025-01-25 09:39:11,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.29 | bwd: 4640.72 | bwd_inner: 4635.48 | bwd_allreduce: 5.17 | step: 46.88
-  6%|▌         | 336/5800 [52:41<10:32:19,  6.94s/it]                                                     {'loss': 0.1074, 'grad_norm': 13.71038818359375, 'learning_rate': 3.9918222442729885e-05, 'epoch': 2.9}
-  6%|▌         | 336/5800 [52:41<10:32:19,  6.94s/it]score1 tensor([[0.5430],
-        [0.6055],
-        [0.5742],
-        [0.6875]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4863, 0.4688, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:18,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 09:39:18,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.16 | bwd_microstep: 4644.62 | bwd_inner_microstep: 4639.14 | bwd_allreduce_microstep: 5.29 | step_microstep: 47.68
-[2025-01-25 09:39:18,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.11 | bwd: 4644.64 | bwd_inner: 4639.14 | bwd_allreduce: 5.39 | step: 47.69
-  6%|▌         | 337/5800 [52:47<10:32:14,  6.94s/it]                                                     {'loss': 0.0845, 'grad_norm': 13.365718841552734, 'learning_rate': 3.991721042647096e-05, 'epoch': 2.91}
-  6%|▌         | 337/5800 [52:47<10:32:14,  6.94s/it]score1 tensor([[0.5039],
-        [0.4863],
-        [0.5039],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4941, 0.4023, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:24,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:39:24,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.83 | bwd_microstep: 4645.73 | bwd_inner_microstep: 4640.65 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.35
-[2025-01-25 09:39:24,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.80 | bwd: 4645.75 | bwd_inner: 4640.65 | bwd_allreduce: 5.03 | step: 45.36
-  6%|▌         | 338/5800 [52:54<10:31:51,  6.94s/it]                                                     {'loss': 0.063, 'grad_norm': 6.827338218688965, 'learning_rate': 3.9916192199680225e-05, 'epoch': 2.91}
-  6%|▌         | 338/5800 [52:54<10:31:51,  6.94s/it]score1 tensor([[0.3809],
-        [0.4727],
-        [0.3984],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5938, 0.5000, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0767, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:31,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 09:39:31,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.83 | bwd_microstep: 4635.60 | bwd_inner_microstep: 4630.65 | bwd_allreduce_microstep: 4.84 | step_microstep: 46.86
-[2025-01-25 09:39:31,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.79 | bwd: 4635.63 | bwd_inner: 4630.65 | bwd_allreduce: 4.90 | step: 46.87
-  6%|▌         | 339/5800 [53:01<10:31:13,  6.94s/it]                                                     {'loss': 0.0767, 'grad_norm': 12.623700141906738, 'learning_rate': 3.991516776267519e-05, 'epoch': 2.92}
-  6%|▌         | 339/5800 [53:01<10:31:13,  6.94s/it]score1 tensor([[0.3652],
-        [0.3984],
-        [0.3477],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5547, 0.4512, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0967, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:38,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 09:39:38,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.23 | bwd_microstep: 4642.10 | bwd_inner_microstep: 4637.13 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.50
-[2025-01-25 09:39:38,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.19 | bwd: 4642.12 | bwd_inner: 4637.13 | bwd_allreduce: 4.92 | step: 44.51
-  6%|▌         | 340/5800 [53:08<10:30:38,  6.93s/it]                                                     {'loss': 0.0967, 'grad_norm': 12.500273704528809, 'learning_rate': 3.991413711577529e-05, 'epoch': 2.93}
-  6%|▌         | 340/5800 [53:08<10:30:38,  6.93s/it]score1 tensor([[0.4375],
-        [0.3320],
-        [0.3379],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4512, 0.4629, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1157, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:45,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 09:39:45,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4637.15 | bwd_inner_microstep: 4630.09 | bwd_allreduce_microstep: 6.97 | step_microstep: 46.36
-[2025-01-25 09:39:45,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.79 | bwd: 4637.18 | bwd_inner: 4630.09 | bwd_allreduce: 7.01 | step: 46.36
-  6%|▌         | 341/5800 [53:15<10:30:07,  6.93s/it]                                                     {'loss': 0.1157, 'grad_norm': 12.445155143737793, 'learning_rate': 3.9913100259301894e-05, 'epoch': 2.94}
-  6%|▌         | 341/5800 [53:15<10:30:07,  6.93s/it]score1 tensor([[0.4355],
-        [0.3594],
-        [0.3711],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4902, 0.5703, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1826, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:52,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 09:39:52,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.95 | bwd_microstep: 4638.83 | bwd_inner_microstep: 4633.63 | bwd_allreduce_microstep: 5.06 | step_microstep: 85.41
-[2025-01-25 09:39:52,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.85 | bwd: 4638.85 | bwd_inner: 4633.63 | bwd_allreduce: 5.14 | step: 85.42
-  6%|▌         | 342/5800 [53:22<10:31:22,  6.94s/it]                                                     {'loss': 0.1826, 'grad_norm': 12.583065032958984, 'learning_rate': 3.991205719357831e-05, 'epoch': 2.95}
-  6%|▌         | 342/5800 [53:22<10:31:22,  6.94s/it]score1 tensor([[0.4707],
-        [0.4414],
-        [0.4551],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6211, 0.4492, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0874, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:39:59,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 09:39:59,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.13 | bwd_microstep: 4636.70 | bwd_inner_microstep: 4631.61 | bwd_allreduce_microstep: 4.98 | step_microstep: 42.49
-[2025-01-25 09:39:59,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.08 | bwd: 4636.73 | bwd_inner: 4631.61 | bwd_allreduce: 5.04 | step: 42.51
-  6%|▌         | 343/5800 [53:29<10:31:08,  6.94s/it]                                                     {'loss': 0.0874, 'grad_norm': 6.393149375915527, 'learning_rate': 3.991100791892979e-05, 'epoch': 2.96}
-  6%|▌         | 343/5800 [53:29<10:31:08,  6.94s/it]score1 tensor([[0.5156],
-        [0.5078],
-        [0.5156],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5117, 0.5195, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:40:06,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 09:40:06,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.38 | bwd_microstep: 4634.88 | bwd_inner_microstep: 4630.10 | bwd_allreduce_microstep: 4.70 | step_microstep: 52.69
-[2025-01-25 09:40:06,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.26 | bwd: 4634.91 | bwd_inner: 4630.10 | bwd_allreduce: 4.74 | step: 52.70
-  6%|▌         | 344/5800 [53:36<10:31:12,  6.94s/it]                                                     {'loss': 0.0371, 'grad_norm': 12.839221000671387, 'learning_rate': 3.9909952435683524e-05, 'epoch': 2.97}
-  6%|▌         | 344/5800 [53:36<10:31:12,  6.94s/it]score1 tensor([[0.5586],
-        [0.5781],
-        [0.6602],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4258, 0.6641, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:40:13,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 09:40:13,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.32 | bwd_microstep: 4636.62 | bwd_inner_microstep: 4631.41 | bwd_allreduce_microstep: 5.05 | step_microstep: 45.30
-[2025-01-25 09:40:13,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.27 | bwd: 4636.65 | bwd_inner: 4631.41 | bwd_allreduce: 5.12 | step: 45.31
-  6%|▌         | 345/5800 [53:43<10:30:32,  6.94s/it]                                                     {'loss': 0.0732, 'grad_norm': 6.5324883460998535, 'learning_rate': 3.9908890744168615e-05, 'epoch': 2.97}
-  6%|▌         | 345/5800 [53:43<10:30:32,  6.94s/it]score1 tensor([[0.6367],
-        [0.5508],
-        [0.6211],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3789, 0.4082, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1699, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:40:20,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 09:40:20,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.00 | bwd_microstep: 4641.23 | bwd_inner_microstep: 4636.02 | bwd_allreduce_microstep: 5.09 | step_microstep: 48.04
-[2025-01-25 09:40:20,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.97 | bwd: 4641.26 | bwd_inner: 4636.02 | bwd_allreduce: 5.15 | step: 48.05
-  6%|▌         | 346/5800 [53:50<10:30:06,  6.93s/it]                                                     {'loss': 0.1699, 'grad_norm': 13.211094856262207, 'learning_rate': 3.990782284471612e-05, 'epoch': 2.98}
-  6%|▌         | 346/5800 [53:50<10:30:06,  6.93s/it]score1 tensor([[0.6094],
-        [0.7148],
-        [0.6797],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6016, 0.4277, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:40:27,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.49 | optimizer_step: 4.54
-[2025-01-25 09:40:27,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.95 | bwd_microstep: 4649.52 | bwd_inner_microstep: 4641.47 | bwd_allreduce_microstep: 7.85 | step_microstep: 64.70
-[2025-01-25 09:40:27,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.92 | bwd: 4649.58 | bwd_inner: 4641.47 | bwd_allreduce: 7.98 | step: 64.72
-  6%|▌         | 347/5800 [53:57<10:31:42,  6.95s/it]                                                     {'loss': 0.1484, 'grad_norm': 13.468965530395508, 'learning_rate': 3.990674873765904e-05, 'epoch': 2.99}
-  6%|▌         | 347/5800 [53:57<10:31:42,  6.95s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:40:33,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 09:40:33,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 576.39 | bwd_microstep: 1223.49 | bwd_inner_microstep: 1218.27 | bwd_allreduce_microstep: 5.03 | step_microstep: 44.24
-[2025-01-25 09:40:33,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 576.35 | bwd: 1223.49 | bwd_inner: 1218.25 | bwd_allreduce: 5.10 | step: 44.21
-  6%|▌         | 348/5800 [54:03<10:10:58,  6.72s/it]                                                     {'loss': 0.1816, 'grad_norm': 13.215475082397461, 'learning_rate': 3.990566842333228e-05, 'epoch': 3.0}
-  6%|▌         | 348/5800 [54:03<10:10:58,  6.72s/it][2025-01-25 09:40:38,512] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:40:49,484] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:41:01,461] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 09:41:13,153] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.6016],
-        [0.6562],
-        [0.6016],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6875, 0.4805, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:41:34,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 09:41:34,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.10 | bwd_microstep: 4601.63 | bwd_inner_microstep: 4594.23 | bwd_allreduce_microstep: 7.06 | step_microstep: 63.28
-[2025-01-25 09:41:34,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.06 | bwd: 4601.70 | bwd_inner: 4594.23 | bwd_allreduce: 7.27 | step: 63.25
-  6%|▌         | 349/5800 [55:04<34:49:49, 23.00s/it]                                                     {'loss': 0.0845, 'grad_norm': 6.52205228805542, 'learning_rate': 3.990458190207272e-05, 'epoch': 3.01}
-  6%|▌         | 349/5800 [55:04<34:49:49, 23.00s/it]score1 tensor([[0.6094],
-        [0.5547],
-        [0.6328],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.5352, 0.6016, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:41:41,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:41:41,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.86 | bwd_microstep: 4580.95 | bwd_inner_microstep: 4575.63 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.95
-[2025-01-25 09:41:41,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.83 | bwd: 4580.97 | bwd_inner: 4575.63 | bwd_allreduce: 5.27 | step: 43.96
-  6%|▌         | 350/5800 [55:11<27:29:17, 18.16s/it]                                                     {'loss': 0.0513, 'grad_norm': 6.471190452575684, 'learning_rate': 3.9903489174219144e-05, 'epoch': 3.02}
-  6%|▌         | 350/5800 [55:11<27:29:17, 18.16s/it]score1 tensor([[0.4883],
-        [0.4453],
-        [0.4492],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4941, 0.3750, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:41:48,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:41:48,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.20 | bwd_microstep: 4598.45 | bwd_inner_microstep: 4593.76 | bwd_allreduce_microstep: 4.61 | step_microstep: 44.81
-[2025-01-25 09:41:48,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.17 | bwd: 4598.48 | bwd_inner: 4593.76 | bwd_allreduce: 4.65 | step: 44.82
-  6%|▌         | 351/5800 [55:18<22:21:25, 14.77s/it]                                                     {'loss': 0.0903, 'grad_norm': 6.208306789398193, 'learning_rate': 3.990239024011229e-05, 'epoch': 3.03}
-  6%|▌         | 351/5800 [55:18<22:21:25, 14.77s/it]score1 tensor([[0.4180],
-        [0.3730],
-        [0.3809],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3672, 0.4570, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0791, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:41:55,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:41:55,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.81 | bwd_microstep: 4604.51 | bwd_inner_microstep: 4599.56 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.34
-[2025-01-25 09:41:55,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.77 | bwd: 4604.56 | bwd_inner: 4599.56 | bwd_allreduce: 4.90 | step: 45.35
-  6%|▌         | 352/5800 [55:25<18:45:57, 12.40s/it]                                                     {'loss': 0.0791, 'grad_norm': 6.58590841293335, 'learning_rate': 3.990128510009482e-05, 'epoch': 3.03}
-  6%|▌         | 352/5800 [55:25<18:45:57, 12.40s/it]score1 tensor([[0.3828],
-        [0.3848],
-        [0.4258],
-        [0.3066]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.3789, 0.4277, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:02,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 09:42:02,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.15 | bwd_microstep: 4610.92 | bwd_inner_microstep: 4604.14 | bwd_allreduce_microstep: 6.70 | step_microstep: 46.66
-[2025-01-25 09:42:02,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.11 | bwd: 4610.94 | bwd_inner: 4604.14 | bwd_allreduce: 6.73 | step: 46.68
-  6%|▌         | 353/5800 [55:32<16:15:22, 10.74s/it]                                                     {'loss': 0.0332, 'grad_norm': 6.044503688812256, 'learning_rate': 3.990017375451134e-05, 'epoch': 3.04}
-  6%|▌         | 353/5800 [55:32<16:15:22, 10.74s/it]score1 tensor([[0.4375],
-        [0.3594],
-        [0.4160],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5156, 0.4277, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:08,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.51
-[2025-01-25 09:42:08,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.99 | bwd_microstep: 4616.50 | bwd_inner_microstep: 4610.59 | bwd_allreduce_microstep: 5.80 | step_microstep: 49.72
-[2025-01-25 09:42:08,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.96 | bwd: 4616.53 | bwd_inner: 4610.59 | bwd_allreduce: 5.86 | step: 49.72
-  6%|▌         | 354/5800 [55:38<14:30:28,  9.59s/it]                                                     {'loss': 0.0547, 'grad_norm': 12.19159984588623, 'learning_rate': 3.989905620370839e-05, 'epoch': 3.05}
-  6%|▌         | 354/5800 [55:38<14:30:28,  9.59s/it]score1 tensor([[0.5273],
-        [0.4258],
-        [0.4902],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5469, 0.5156, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:15,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 09:42:15,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.28 | bwd_microstep: 4622.02 | bwd_inner_microstep: 4615.44 | bwd_allreduce_microstep: 6.49 | step_microstep: 49.40
-[2025-01-25 09:42:15,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.25 | bwd: 4622.04 | bwd_inner: 4615.44 | bwd_allreduce: 6.53 | step: 49.40
-  6%|▌         | 355/5800 [55:45<13:17:21,  8.79s/it]                                                     {'loss': 0.0488, 'grad_norm': 6.0943074226379395, 'learning_rate': 3.9897932448034436e-05, 'epoch': 3.06}
-  6%|▌         | 355/5800 [55:45<13:17:21,  8.79s/it]score1 tensor([[0.5859],
-        [0.4512],
-        [0.6055],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.3223, 0.6484, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:22,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 09:42:22,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.90 | bwd_microstep: 4602.89 | bwd_inner_microstep: 4597.60 | bwd_allreduce_microstep: 5.17 | step_microstep: 48.22
-[2025-01-25 09:42:22,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.85 | bwd: 4602.91 | bwd_inner: 4597.60 | bwd_allreduce: 5.23 | step: 48.22
-  6%|▌         | 356/5800 [55:52<12:25:18,  8.21s/it]                                                     {'loss': 0.0508, 'grad_norm': 0.9094560146331787, 'learning_rate': 3.9896802487839884e-05, 'epoch': 3.07}
-  6%|▌         | 356/5800 [55:52<12:25:18,  8.21s/it]score1 tensor([[0.6055],
-        [0.5781],
-        [0.5977],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4922, 0.4688, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1182, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:29,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:42:29,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.04 | bwd_microstep: 4605.44 | bwd_inner_microstep: 4600.64 | bwd_allreduce_microstep: 4.70 | step_microstep: 44.27
-[2025-01-25 09:42:29,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.00 | bwd: 4605.46 | bwd_inner: 4600.64 | bwd_allreduce: 4.75 | step: 44.29
-  6%|▌         | 357/5800 [55:59<11:48:41,  7.81s/it]                                                     {'loss': 0.1182, 'grad_norm': 12.864053726196289, 'learning_rate': 3.989566632347708e-05, 'epoch': 3.08}
-  6%|▌         | 357/5800 [55:59<11:48:41,  7.81s/it]score1 tensor([[0.6914],
-        [0.5938],
-        [0.7344],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4688, 0.6719, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0913, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:36,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 09:42:36,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.56 | bwd_microstep: 4604.36 | bwd_inner_microstep: 4599.24 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.98
-[2025-01-25 09:42:36,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.50 | bwd: 4604.39 | bwd_inner: 4599.24 | bwd_allreduce: 5.07 | step: 47.00
-  6%|▌         | 358/5800 [56:06<11:23:26,  7.54s/it]                                                     {'loss': 0.0913, 'grad_norm': 13.179926872253418, 'learning_rate': 3.9894523955300284e-05, 'epoch': 3.09}
-  6%|▌         | 358/5800 [56:06<11:23:26,  7.54s/it]score1 tensor([[0.6719],
-        [0.6484],
-        [0.4785],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5273, 0.4844, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1021, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:43,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 09:42:43,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.45 | bwd_microstep: 4608.79 | bwd_inner_microstep: 4600.69 | bwd_allreduce_microstep: 8.00 | step_microstep: 44.16
-[2025-01-25 09:42:43,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.40 | bwd: 4608.82 | bwd_inner: 4600.69 | bwd_allreduce: 8.05 | step: 44.17
-  6%|▌         | 359/5800 [56:13<11:05:36,  7.34s/it]                                                     {'loss': 0.1021, 'grad_norm': 10.469208717346191, 'learning_rate': 3.9893375383665726e-05, 'epoch': 3.09}
-  6%|▌         | 359/5800 [56:13<11:05:36,  7.34s/it]score1 tensor([[0.6172],
-        [0.5820],
-        [0.5859],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4980, 0.4941, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1084, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:50,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.71
-[2025-01-25 09:42:50,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.11 | bwd_microstep: 4623.48 | bwd_inner_microstep: 4618.86 | bwd_allreduce_microstep: 4.53 | step_microstep: 48.29
-[2025-01-25 09:42:50,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.07 | bwd: 4623.50 | bwd_inner: 4618.87 | bwd_allreduce: 4.57 | step: 48.30
-  6%|▌         | 360/5800 [56:20<10:53:40,  7.21s/it]                                                     {'loss': 0.1084, 'grad_norm': 13.005948066711426, 'learning_rate': 3.9892220608931544e-05, 'epoch': 3.1}
-  6%|▌         | 360/5800 [56:20<10:53:40,  7.21s/it]score1 tensor([[0.5781],
-        [0.6328],
-        [0.4766],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5703, 0.4707, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:42:57,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 09:42:57,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.49 | bwd_microstep: 4617.98 | bwd_inner_microstep: 4613.04 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.74
-[2025-01-25 09:42:57,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.43 | bwd: 4618.01 | bwd_inner: 4613.04 | bwd_allreduce: 4.88 | step: 44.75
-  6%|▌         | 361/5800 [56:27<10:45:00,  7.12s/it]                                                     {'loss': 0.0537, 'grad_norm': 12.547198295593262, 'learning_rate': 3.989105963145782e-05, 'epoch': 3.11}
-  6%|▌         | 361/5800 [56:27<10:45:00,  7.12s/it]score1 tensor([[0.3887],
-        [0.5469],
-        [0.4727],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5508, 0.5195, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:04,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 09:43:04,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.72 | bwd_microstep: 4609.49 | bwd_inner_microstep: 4604.44 | bwd_allreduce_microstep: 4.95 | step_microstep: 45.22
-[2025-01-25 09:43:04,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.69 | bwd: 4609.52 | bwd_inner: 4604.44 | bwd_allreduce: 5.01 | step: 45.23
-  6%|▌         | 362/5800 [56:34<10:38:48,  7.05s/it]                                                     {'loss': 0.0464, 'grad_norm': 6.134671211242676, 'learning_rate': 3.9889892451606557e-05, 'epoch': 3.12}
-  6%|▌         | 362/5800 [56:34<10:38:48,  7.05s/it]score1 tensor([[0.5352],
-        [0.4766],
-        [0.5508],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5664, 0.6055, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:10,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 09:43:10,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.67 | bwd_microstep: 4623.06 | bwd_inner_microstep: 4617.88 | bwd_allreduce_microstep: 5.08 | step_microstep: 45.15
-[2025-01-25 09:43:10,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.64 | bwd: 4623.08 | bwd_inner: 4617.88 | bwd_allreduce: 5.13 | step: 45.17
-  6%|▋         | 363/5800 [56:40<10:34:35,  7.00s/it]                                                     {'loss': 0.0898, 'grad_norm': 12.331872940063477, 'learning_rate': 3.988871906974171e-05, 'epoch': 3.13}
-  6%|▋         | 363/5800 [56:40<10:34:35,  7.00s/it]score1 tensor([[0.4688],
-        [0.3711],
-        [0.4082],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4180, 0.4570, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:17,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 09:43:17,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4614.35 | bwd_inner_microstep: 4608.92 | bwd_allreduce_microstep: 5.31 | step_microstep: 46.45
-[2025-01-25 09:43:17,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.43 | bwd: 4614.38 | bwd_inner: 4608.92 | bwd_allreduce: 5.37 | step: 46.46
-  6%|▋         | 364/5800 [56:47<10:31:39,  6.97s/it]                                                     {'loss': 0.0605, 'grad_norm': 5.942004680633545, 'learning_rate': 3.988753948622916e-05, 'epoch': 3.14}
-  6%|▋         | 364/5800 [56:47<10:31:39,  6.97s/it]score1 tensor([[0.5742],
-        [0.4375],
-        [0.5430],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4062, 0.5508, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:24,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.08 | optimizer_step: 4.37
-[2025-01-25 09:43:24,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4613.74 | bwd_inner_microstep: 4608.45 | bwd_allreduce_microstep: 5.17 | step_microstep: 49.00
-[2025-01-25 09:43:24,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4613.76 | bwd_inner: 4608.45 | bwd_allreduce: 5.23 | step: 49.01
-  6%|▋         | 365/5800 [56:54<10:29:25,  6.95s/it]                                                     {'loss': 0.0239, 'grad_norm': 0.9149106740951538, 'learning_rate': 3.988635370143672e-05, 'epoch': 3.15}
-  6%|▋         | 365/5800 [56:54<10:29:25,  6.95s/it]score1 tensor([[0.6211],
-        [0.4590],
-        [0.5742],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4727, 0.5195, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:31,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 09:43:31,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.45 | bwd_microstep: 4614.59 | bwd_inner_microstep: 4610.02 | bwd_allreduce_microstep: 4.48 | step_microstep: 44.24
-[2025-01-25 09:43:31,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.42 | bwd: 4614.61 | bwd_inner: 4610.02 | bwd_allreduce: 4.53 | step: 44.25
-  6%|▋         | 366/5800 [57:01<10:27:37,  6.93s/it]                                                     {'loss': 0.0317, 'grad_norm': 6.285831451416016, 'learning_rate': 3.9885161715734135e-05, 'epoch': 3.16}
-  6%|▋         | 366/5800 [57:01<10:27:37,  6.93s/it]score1 tensor([[0.5625],
-        [0.4844],
-        [0.4824],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4668, 0.4492, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:38,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 09:43:38,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.16 | bwd_microstep: 4616.83 | bwd_inner_microstep: 4611.98 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.90
-[2025-01-25 09:43:38,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.11 | bwd: 4616.86 | bwd_inner: 4611.98 | bwd_allreduce: 4.80 | step: 45.91
-  6%|▋         | 367/5800 [57:08<10:26:30,  6.92s/it]                                                     {'loss': 0.0293, 'grad_norm': 6.287962913513184, 'learning_rate': 3.988396352949309e-05, 'epoch': 3.16}
-  6%|▋         | 367/5800 [57:08<10:26:30,  6.92s/it]score1 tensor([[0.7031],
-        [0.5820],
-        [0.5977],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.6094, 0.4395, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:45,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 09:43:45,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.55 | bwd_microstep: 4621.93 | bwd_inner_microstep: 4616.93 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.49
-[2025-01-25 09:43:45,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.51 | bwd: 4621.95 | bwd_inner: 4616.93 | bwd_allreduce: 4.96 | step: 44.50
-  6%|▋         | 368/5800 [57:15<10:25:52,  6.91s/it]                                                     {'loss': 0.0659, 'grad_norm': 1.5219194889068604, 'learning_rate': 3.9882759143087194e-05, 'epoch': 3.17}
-  6%|▋         | 368/5800 [57:15<10:25:52,  6.91s/it]score1 tensor([[0.6328],
-        [0.5156],
-        [0.5273],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4277, 0.4863, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1138, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:52,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 09:43:52,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.71 | bwd_microstep: 4623.62 | bwd_inner_microstep: 4618.90 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.77
-[2025-01-25 09:43:52,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.66 | bwd: 4623.64 | bwd_inner: 4618.90 | bwd_allreduce: 4.67 | step: 43.78
-  6%|▋         | 369/5800 [57:22<10:25:30,  6.91s/it]                                                     {'loss': 0.1138, 'grad_norm': 12.588117599487305, 'learning_rate': 3.9881548556892e-05, 'epoch': 3.18}
-  6%|▋         | 369/5800 [57:22<10:25:30,  6.91s/it]score1 tensor([[0.5273],
-        [0.5312],
-        [0.5508],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5625, 0.4453, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:43:59,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.65
-[2025-01-25 09:43:59,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.03 | bwd_microstep: 4626.52 | bwd_inner_microstep: 4621.91 | bwd_allreduce_microstep: 4.53 | step_microstep: 47.24
-[2025-01-25 09:43:59,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.99 | bwd: 4626.55 | bwd_inner: 4621.91 | bwd_allreduce: 4.57 | step: 47.25
-  6%|▋         | 370/5800 [57:29<10:25:18,  6.91s/it]                                                     {'loss': 0.0723, 'grad_norm': 6.21781587600708, 'learning_rate': 3.9880331771285e-05, 'epoch': 3.19}
-  6%|▋         | 370/5800 [57:29<10:25:18,  6.91s/it]score1 tensor([[0.4570],
-        [0.4414],
-        [0.5312],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.3555, 0.5742, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:06,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 09:44:06,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4617.22 | bwd_inner_microstep: 4612.37 | bwd_allreduce_microstep: 4.77 | step_microstep: 45.66
-[2025-01-25 09:44:06,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4617.25 | bwd_inner: 4612.37 | bwd_allreduce: 4.81 | step: 45.66
-  6%|▋         | 371/5800 [57:36<10:24:54,  6.91s/it]                                                     {'loss': 0.0518, 'grad_norm': 0.8499332070350647, 'learning_rate': 3.98791087866456e-05, 'epoch': 3.2}
-  6%|▋         | 371/5800 [57:36<10:24:54,  6.91s/it]score1 tensor([[0.4727],
-        [0.4922],
-        [0.5898],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5273, 0.5781, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:13,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 09:44:13,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.00 | bwd_microstep: 4617.24 | bwd_inner_microstep: 4612.45 | bwd_allreduce_microstep: 4.69 | step_microstep: 44.36
-[2025-01-25 09:44:13,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.96 | bwd: 4617.26 | bwd_inner: 4612.45 | bwd_allreduce: 4.74 | step: 44.37
-  6%|▋         | 372/5800 [57:43<10:24:34,  6.90s/it]                                                     {'loss': 0.0449, 'grad_norm': 5.839444160461426, 'learning_rate': 3.9877879603355144e-05, 'epoch': 3.21}
-  6%|▋         | 372/5800 [57:43<10:24:34,  6.90s/it]score1 tensor([[0.3711],
-        [0.4590],
-        [0.5078],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4395, 0.4492, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:19,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 09:44:19,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4626.65 | bwd_inner_microstep: 4621.71 | bwd_allreduce_microstep: 4.83 | step_microstep: 46.17
-[2025-01-25 09:44:19,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4626.67 | bwd_inner: 4621.71 | bwd_allreduce: 4.89 | step: 46.18
-  6%|▋         | 373/5800 [57:49<10:24:30,  6.90s/it]                                                     {'loss': 0.0298, 'grad_norm': 0.8572362065315247, 'learning_rate': 3.987664422179692e-05, 'epoch': 3.22}
-  6%|▋         | 373/5800 [57:49<10:24:30,  6.90s/it]score1 tensor([[0.4375],
-        [0.4570],
-        [0.4590],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.3809, 0.5430, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:26,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 09:44:26,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.02 | bwd_microstep: 4625.93 | bwd_inner_microstep: 4620.74 | bwd_allreduce_microstep: 5.08 | step_microstep: 51.20
-[2025-01-25 09:44:26,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.99 | bwd: 4625.96 | bwd_inner: 4620.74 | bwd_allreduce: 5.14 | step: 51.21
-  6%|▋         | 374/5800 [57:56<10:24:37,  6.91s/it]                                                     {'loss': 0.0547, 'grad_norm': 0.6047017574310303, 'learning_rate': 3.9875402642356136e-05, 'epoch': 3.22}
-  6%|▋         | 374/5800 [57:56<10:24:37,  6.91s/it]score1 tensor([[0.4785],
-        [0.4961],
-        [0.3594],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.6406, 0.3418, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0806, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:33,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 09:44:33,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.29 | bwd_microstep: 4618.54 | bwd_inner_microstep: 4613.88 | bwd_allreduce_microstep: 4.57 | step_microstep: 42.19
-[2025-01-25 09:44:33,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.26 | bwd: 4618.57 | bwd_inner: 4613.88 | bwd_allreduce: 4.61 | step: 42.19
-  6%|▋         | 375/5800 [58:03<10:24:14,  6.90s/it]                                                     {'loss': 0.0806, 'grad_norm': 6.158967971801758, 'learning_rate': 3.987415486541994e-05, 'epoch': 3.23}
-  6%|▋         | 375/5800 [58:03<10:24:14,  6.90s/it]score1 tensor([[0.4746],
-        [0.4277],
-        [0.4668],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.2812, 0.4004, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:40,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 09:44:40,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.14 | bwd_microstep: 4628.26 | bwd_inner_microstep: 4623.29 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.05
-[2025-01-25 09:44:40,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.11 | bwd: 4628.29 | bwd_inner: 4623.29 | bwd_allreduce: 4.92 | step: 44.05
-  6%|▋         | 376/5800 [58:10<10:24:19,  6.91s/it]                                                     {'loss': 0.0938, 'grad_norm': 5.910329818725586, 'learning_rate': 3.987290089137741e-05, 'epoch': 3.24}
-  6%|▋         | 376/5800 [58:10<10:24:19,  6.91s/it]score1 tensor([[0.4590],
-        [0.4902],
-        [0.5195],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3945, 0.6445, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:47,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 09:44:47,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4629.07 | bwd_inner_microstep: 4624.10 | bwd_allreduce_microstep: 4.84 | step_microstep: 41.49
-[2025-01-25 09:44:47,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.04 | bwd: 4629.10 | bwd_inner: 4624.10 | bwd_allreduce: 4.92 | step: 41.49
-  6%|▋         | 377/5800 [58:17<10:24:20,  6.91s/it]                                                     {'loss': 0.0723, 'grad_norm': 0.4209686815738678, 'learning_rate': 3.987164072061956e-05, 'epoch': 3.25}
-  6%|▋         | 377/5800 [58:17<10:24:20,  6.91s/it]score1 tensor([[0.4590],
-        [0.5664],
-        [0.4023],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.7070, 0.3945, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:44:54,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 09:44:54,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.82 | bwd_microstep: 4624.99 | bwd_inner_microstep: 4620.08 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.55
-[2025-01-25 09:44:54,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.78 | bwd: 4625.01 | bwd_inner: 4620.08 | bwd_allreduce: 4.87 | step: 43.56
-  7%|▋         | 378/5800 [58:24<10:24:05,  6.91s/it]                                                     {'loss': 0.0757, 'grad_norm': 6.302854061126709, 'learning_rate': 3.987037435353933e-05, 'epoch': 3.26}
-  7%|▋         | 378/5800 [58:24<10:24:05,  6.91s/it]score1 tensor([[0.4902],
-        [0.5039],
-        [0.5469],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4219, 0.5000, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:01,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 09:45:01,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.14 | bwd_microstep: 4621.79 | bwd_inner_microstep: 4617.02 | bwd_allreduce_microstep: 4.67 | step_microstep: 44.21
-[2025-01-25 09:45:01,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.10 | bwd: 4621.82 | bwd_inner: 4617.02 | bwd_allreduce: 4.72 | step: 44.21
-  7%|▋         | 379/5800 [58:31<10:23:56,  6.91s/it]                                                     {'loss': 0.061, 'grad_norm': 0.6078729033470154, 'learning_rate': 3.986910179053159e-05, 'epoch': 3.27}
-  7%|▋         | 379/5800 [58:31<10:23:56,  6.91s/it]score1 tensor([[0.5273],
-        [0.4668],
-        [0.5312],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4434, 0.5078, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:08,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 09:45:08,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.66 | bwd_microstep: 4637.50 | bwd_inner_microstep: 4632.25 | bwd_allreduce_microstep: 5.16 | step_microstep: 45.81
-[2025-01-25 09:45:08,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.62 | bwd: 4637.52 | bwd_inner: 4632.25 | bwd_allreduce: 5.21 | step: 45.82
-  7%|▋         | 380/5800 [58:38<10:24:07,  6.91s/it]                                                     {'loss': 0.0488, 'grad_norm': 6.023326396942139, 'learning_rate': 3.9867823031993154e-05, 'epoch': 3.28}
-  7%|▋         | 380/5800 [58:38<10:24:07,  6.91s/it]score1 tensor([[0.3926],
-        [0.4688],
-        [0.5078],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.5000, 0.5508, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:15,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 09:45:15,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.25 | bwd_microstep: 4636.56 | bwd_inner_microstep: 4631.73 | bwd_allreduce_microstep: 4.75 | step_microstep: 44.81
-[2025-01-25 09:45:15,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.22 | bwd: 4636.58 | bwd_inner: 4631.73 | bwd_allreduce: 4.79 | step: 44.82
-  7%|▋         | 381/5800 [58:45<10:24:36,  6.92s/it]                                                     {'loss': 0.0361, 'grad_norm': 0.5878281593322754, 'learning_rate': 3.986653807832276e-05, 'epoch': 3.28}
-  7%|▋         | 381/5800 [58:45<10:24:36,  6.92s/it]score1 tensor([[0.5078],
-        [0.5430],
-        [0.4199],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5625, 0.4238, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:22,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 09:45:22,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.61 | bwd_microstep: 4637.28 | bwd_inner_microstep: 4632.58 | bwd_allreduce_microstep: 4.59 | step_microstep: 44.23
-[2025-01-25 09:45:22,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.58 | bwd: 4637.31 | bwd_inner: 4632.58 | bwd_allreduce: 4.64 | step: 44.24
-  7%|▋         | 382/5800 [58:52<10:24:46,  6.92s/it]                                                     {'loss': 0.0107, 'grad_norm': 5.971628189086914, 'learning_rate': 3.9865246929921076e-05, 'epoch': 3.29}
-  7%|▋         | 382/5800 [58:52<10:24:46,  6.92s/it]score1 tensor([[0.5625],
-        [0.4258],
-        [0.5234],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.3340, 0.4844, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:29,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.32 | optimizer_step: 4.36
-[2025-01-25 09:45:29,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.59 | bwd_microstep: 4650.35 | bwd_inner_microstep: 4639.42 | bwd_allreduce_microstep: 10.82 | step_microstep: 49.81
-[2025-01-25 09:45:29,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.55 | bwd: 4650.41 | bwd_inner: 4639.42 | bwd_allreduce: 10.88 | step: 49.81
-  7%|▋         | 383/5800 [58:59<10:25:01,  6.92s/it]                                                     {'loss': 0.0415, 'grad_norm': 6.113086700439453, 'learning_rate': 3.9863949587190714e-05, 'epoch': 3.3}
-  7%|▋         | 383/5800 [58:59<10:25:01,  6.92s/it]score1 tensor([[0.4746],
-        [0.6055],
-        [0.5312],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5625, 0.6172, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:36,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:45:36,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.76 | bwd_microstep: 4635.22 | bwd_inner_microstep: 4630.16 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.65
-[2025-01-25 09:45:36,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.74 | bwd: 4635.24 | bwd_inner: 4630.16 | bwd_allreduce: 5.00 | step: 42.65
-  7%|▋         | 384/5800 [59:06<10:25:02,  6.92s/it]                                                     {'loss': 0.0859, 'grad_norm': 0.7569342851638794, 'learning_rate': 3.98626460505362e-05, 'epoch': 3.31}
-  7%|▋         | 384/5800 [59:06<10:25:02,  6.92s/it]score1 tensor([[0.5859],
-        [0.6016],
-        [0.5234],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6797, 0.5273, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:42,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 09:45:42,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.43 | bwd_microstep: 4636.53 | bwd_inner_microstep: 4631.53 | bwd_allreduce_microstep: 4.89 | step_microstep: 46.15
-[2025-01-25 09:45:42,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.38 | bwd: 4636.55 | bwd_inner: 4631.53 | bwd_allreduce: 4.95 | step: 46.16
-  7%|▋         | 385/5800 [59:12<10:24:50,  6.92s/it]                                                     {'loss': 0.0283, 'grad_norm': 6.260208606719971, 'learning_rate': 3.9861336320364004e-05, 'epoch': 3.32}
-  7%|▋         | 385/5800 [59:12<10:24:50,  6.92s/it]score1 tensor([[0.4805],
-        [0.4746],
-        [0.5742],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5781, 0.5039, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:49,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 09:45:49,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.61 | bwd_microstep: 4638.09 | bwd_inner_microstep: 4633.21 | bwd_allreduce_microstep: 4.77 | step_microstep: 44.75
-[2025-01-25 09:45:49,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.57 | bwd: 4638.12 | bwd_inner: 4633.21 | bwd_allreduce: 4.83 | step: 44.76
-  7%|▋         | 386/5800 [59:19<10:24:44,  6.92s/it]                                                     {'loss': 0.0771, 'grad_norm': 5.97385835647583, 'learning_rate': 3.9860020397082516e-05, 'epoch': 3.33}
-  7%|▋         | 386/5800 [59:19<10:24:44,  6.92s/it]score1 tensor([[0.5352],
-        [0.5352],
-        [0.4629],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5234, 0.4961, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:45:56,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:45:56,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.75 | bwd_microstep: 4636.34 | bwd_inner_microstep: 4631.21 | bwd_allreduce_microstep: 5.03 | step_microstep: 43.00
-[2025-01-25 09:45:56,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.70 | bwd: 4636.37 | bwd_inner: 4631.21 | bwd_allreduce: 5.08 | step: 43.01
-  7%|▋         | 387/5800 [59:26<10:24:34,  6.92s/it]                                                     {'loss': 0.0645, 'grad_norm': 6.496270656585693, 'learning_rate': 3.985869828110206e-05, 'epoch': 3.34}
-  7%|▋         | 387/5800 [59:26<10:24:34,  6.92s/it]score1 tensor([[0.5391],
-        [0.5898],
-        [0.6602],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4941, 0.6055, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:03,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 09:46:03,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.77 | bwd_microstep: 4639.23 | bwd_inner_microstep: 4634.40 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.76
-[2025-01-25 09:46:03,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.70 | bwd: 4639.25 | bwd_inner: 4634.40 | bwd_allreduce: 4.79 | step: 43.77
-  7%|▋         | 388/5800 [59:33<10:24:37,  6.92s/it]                                                     {'loss': 0.0688, 'grad_norm': 12.704859733581543, 'learning_rate': 3.985736997283491e-05, 'epoch': 3.34}
-  7%|▋         | 388/5800 [59:33<10:24:37,  6.92s/it]score1 tensor([[0.5430],
-        [0.5312],
-        [0.4941],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4453, 0.4551, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:10,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 09:46:10,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.72 | bwd_microstep: 4634.78 | bwd_inner_microstep: 4629.59 | bwd_allreduce_microstep: 5.08 | step_microstep: 48.10
-[2025-01-25 09:46:10,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.67 | bwd: 4634.81 | bwd_inner: 4629.59 | bwd_allreduce: 5.14 | step: 48.11
-  7%|▋         | 389/5800 [59:40<10:24:26,  6.92s/it]                                                     {'loss': 0.0776, 'grad_norm': 12.293010711669922, 'learning_rate': 3.985603547269524e-05, 'epoch': 3.35}
-  7%|▋         | 389/5800 [59:40<10:24:26,  6.92s/it]score1 tensor([[0.5234],
-        [0.5352],
-        [0.5195],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4883, 0.5273, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:17,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 09:46:17,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4635.54 | bwd_inner_microstep: 4630.56 | bwd_allreduce_microstep: 4.88 | step_microstep: 47.80
-[2025-01-25 09:46:17,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.72 | bwd: 4635.56 | bwd_inner: 4630.56 | bwd_allreduce: 4.93 | step: 47.81
-  7%|▋         | 390/5800 [59:47<10:24:03,  6.92s/it]                                                     {'loss': 0.0391, 'grad_norm': 6.085441589355469, 'learning_rate': 3.9854694781099184e-05, 'epoch': 3.36}
-  7%|▋         | 390/5800 [59:47<10:24:03,  6.92s/it]score1 tensor([[0.4258],
-        [0.4746],
-        [0.5312],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5352, 0.6133, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0718, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:24,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 09:46:24,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.07 | bwd_microstep: 4643.61 | bwd_inner_microstep: 4638.61 | bwd_allreduce_microstep: 4.90 | step_microstep: 44.56
-[2025-01-25 09:46:24,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4643.64 | bwd_inner: 4638.61 | bwd_allreduce: 4.96 | step: 44.57
-  7%|▋         | 391/5800 [59:54<10:24:00,  6.92s/it]                                                     {'loss': 0.0718, 'grad_norm': 11.987409591674805, 'learning_rate': 3.985334789846478e-05, 'epoch': 3.37}
-  7%|▋         | 391/5800 [59:54<10:24:00,  6.92s/it]score1 tensor([[0.5508],
-        [0.4512],
-        [0.3965],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4844, 0.4648, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:31,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 09:46:31,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.73 | bwd_microstep: 4640.51 | bwd_inner_microstep: 4635.14 | bwd_allreduce_microstep: 5.26 | step_microstep: 44.77
-[2025-01-25 09:46:31,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.67 | bwd: 4640.53 | bwd_inner: 4635.14 | bwd_allreduce: 5.31 | step: 44.77
-  7%|▋         | 392/5800 [1:00:01<10:23:49,  6.92s/it]                                                       {'loss': 0.0522, 'grad_norm': 6.1483330726623535, 'learning_rate': 3.9851994825212024e-05, 'epoch': 3.38}
-  7%|▋         | 392/5800 [1:00:01<10:23:49,  6.92s/it]score1 tensor([[0.3848],
-        [0.4102],
-        [0.3809],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4160, 0.4512, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:38,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 09:46:38,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.79 | bwd_microstep: 4638.16 | bwd_inner_microstep: 4633.59 | bwd_allreduce_microstep: 4.49 | step_microstep: 46.49
-[2025-01-25 09:46:38,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.75 | bwd: 4638.19 | bwd_inner: 4633.59 | bwd_allreduce: 4.52 | step: 46.50
-  7%|▋         | 393/5800 [1:00:08<10:24:01,  6.92s/it]                                                       {'loss': 0.0508, 'grad_norm': 11.554535865783691, 'learning_rate': 3.985063556176281e-05, 'epoch': 3.39}
-  7%|▋         | 393/5800 [1:00:08<10:24:01,  6.92s/it]score1 tensor([[0.4238],
-        [0.3887],
-        [0.5195],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.3984, 0.6289, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:45,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 09:46:45,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.38 | bwd_microstep: 4636.63 | bwd_inner_microstep: 4631.64 | bwd_allreduce_microstep: 4.89 | step_microstep: 45.01
-[2025-01-25 09:46:45,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.34 | bwd: 4636.65 | bwd_inner: 4631.64 | bwd_allreduce: 4.94 | step: 45.02
-  7%|▋         | 394/5800 [1:00:15<10:23:44,  6.92s/it]                                                       {'loss': 0.0474, 'grad_norm': 11.849567413330078, 'learning_rate': 3.984927010854099e-05, 'epoch': 3.4}
-  7%|▋         | 394/5800 [1:00:15<10:23:44,  6.92s/it]score1 tensor([[0.5117],
-        [0.5078],
-        [0.4668],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4961, 0.5898, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:52,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 09:46:52,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.41 | bwd_microstep: 4644.63 | bwd_inner_microstep: 4639.56 | bwd_allreduce_microstep: 4.93 | step_microstep: 47.69
-[2025-01-25 09:46:52,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4644.66 | bwd_inner: 4639.56 | bwd_allreduce: 5.01 | step: 47.70
-  7%|▋         | 395/5800 [1:00:22<10:23:50,  6.93s/it]                                                       {'loss': 0.0542, 'grad_norm': 6.1128926277160645, 'learning_rate': 3.984789846597234e-05, 'epoch': 3.41}
-  7%|▋         | 395/5800 [1:00:22<10:23:50,  6.93s/it]score1 tensor([[0.4531],
-        [0.4824],
-        [0.6289],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3750, 0.5625, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:46:59,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 09:46:59,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.55 | bwd_microstep: 4597.75 | bwd_inner_microstep: 4592.58 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.54
-[2025-01-25 09:46:59,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.51 | bwd: 4597.79 | bwd_inner: 4592.58 | bwd_allreduce: 5.10 | step: 44.54
-  7%|▋         | 396/5800 [1:00:29<10:22:44,  6.91s/it]                                                       {'loss': 0.0439, 'grad_norm': 3.3790457248687744, 'learning_rate': 3.9846520634484564e-05, 'epoch': 3.41}
-  7%|▋         | 396/5800 [1:00:29<10:22:44,  6.91s/it]score1 tensor([[0.5312],
-        [0.5508],
-        [0.4648],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4648, 0.4668, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:06,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 09:47:06,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.74 | bwd_microstep: 4638.58 | bwd_inner_microstep: 4634.73 | bwd_allreduce_microstep: 3.79 | step_microstep: 41.63
-[2025-01-25 09:47:06,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.69 | bwd: 4638.60 | bwd_inner: 4634.73 | bwd_allreduce: 3.82 | step: 41.64
-  7%|▋         | 397/5800 [1:00:35<10:22:37,  6.91s/it]                                                       {'loss': 0.0298, 'grad_norm': 5.822442054748535, 'learning_rate': 3.984513661450728e-05, 'epoch': 3.42}
-  7%|▋         | 397/5800 [1:00:35<10:22:37,  6.91s/it]score1 tensor([[0.5000],
-        [0.5273],
-        [0.5430],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3457, 0.6016, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1128, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:12,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:47:12,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4638.03 | bwd_inner_microstep: 4632.91 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.00
-[2025-01-25 09:47:12,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.69 | bwd: 4638.05 | bwd_inner: 4632.91 | bwd_allreduce: 5.06 | step: 44.02
-  7%|▋         | 398/5800 [1:00:42<10:22:44,  6.92s/it]                                                       {'loss': 0.1128, 'grad_norm': 0.5452026128768921, 'learning_rate': 3.9843746406472055e-05, 'epoch': 3.43}
-  7%|▋         | 398/5800 [1:00:42<10:22:44,  6.92s/it]score1 tensor([[0.5664],
-        [0.4258],
-        [0.5508],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.3652, 0.5156, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:19,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 09:47:19,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.45 | bwd_microstep: 4643.23 | bwd_inner_microstep: 4637.88 | bwd_allreduce_microstep: 5.26 | step_microstep: 48.00
-[2025-01-25 09:47:19,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.41 | bwd: 4643.26 | bwd_inner: 4637.88 | bwd_allreduce: 5.31 | step: 48.01
-  7%|▋         | 399/5800 [1:00:49<10:22:58,  6.92s/it]                                                       {'loss': 0.0332, 'grad_norm': 5.836393356323242, 'learning_rate': 3.984235001081238e-05, 'epoch': 3.44}
-  7%|▋         | 399/5800 [1:00:49<10:22:58,  6.92s/it]score1 tensor([[0.5273],
-        [0.5391],
-        [0.5508],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3613, 0.4590, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:26,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 09:47:26,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.82 | bwd_microstep: 4647.69 | bwd_inner_microstep: 4642.76 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.56
-[2025-01-25 09:47:26,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.76 | bwd: 4647.72 | bwd_inner: 4642.76 | bwd_allreduce: 4.89 | step: 44.56
-  7%|▋         | 400/5800 [1:00:56<10:23:07,  6.92s/it]                                                       {'loss': 0.0938, 'grad_norm': 6.1023736000061035, 'learning_rate': 3.984094742796368e-05, 'epoch': 3.45}
-  7%|▋         | 400/5800 [1:00:56<10:23:07,  6.92s/it]score1 tensor([[0.5742],
-        [0.4941],
-        [0.5234],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.4824, 0.5430, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:33,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 09:47:33,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.42 | bwd_microstep: 4642.79 | bwd_inner_microstep: 4637.90 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.27
-[2025-01-25 09:47:33,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.38 | bwd: 4642.81 | bwd_inner: 4637.90 | bwd_allreduce: 4.85 | step: 44.28
-  7%|▋         | 401/5800 [1:01:03<10:23:01,  6.92s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.6181584000587463, 'learning_rate': 3.9839538658363305e-05, 'epoch': 3.46}
-  7%|▋         | 401/5800 [1:01:03<10:23:01,  6.92s/it]score1 tensor([[0.4727],
-        [0.4316],
-        [0.4785],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4336, 0.4180, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:40,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 09:47:40,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.58 | bwd_microstep: 4633.25 | bwd_inner_microstep: 4628.08 | bwd_allreduce_microstep: 5.09 | step_microstep: 44.93
-[2025-01-25 09:47:40,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4633.27 | bwd_inner: 4628.08 | bwd_allreduce: 5.13 | step: 44.94
-  7%|▋         | 402/5800 [1:01:10<10:22:37,  6.92s/it]                                                       {'loss': 0.0547, 'grad_norm': 5.936216831207275, 'learning_rate': 3.9838123702450525e-05, 'epoch': 3.47}
-  7%|▋         | 402/5800 [1:01:10<10:22:37,  6.92s/it]score1 tensor([[0.4609],
-        [0.5391],
-        [0.4922],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5117, 0.5352, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:47,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 09:47:47,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.54 | bwd_microstep: 4635.14 | bwd_inner_microstep: 4630.37 | bwd_allreduce_microstep: 4.67 | step_microstep: 44.31
-[2025-01-25 09:47:47,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.51 | bwd: 4635.17 | bwd_inner: 4630.37 | bwd_allreduce: 4.72 | step: 44.32
-  7%|▋         | 403/5800 [1:01:17<10:22:28,  6.92s/it]                                                       {'loss': 0.0317, 'grad_norm': 0.5325890779495239, 'learning_rate': 3.9836702560666554e-05, 'epoch': 3.47}
-  7%|▋         | 403/5800 [1:01:17<10:22:28,  6.92s/it]score1 tensor([[0.4609],
-        [0.5117],
-        [0.4336],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5156, 0.6094, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:47:54,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 09:47:54,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.04 | bwd_microstep: 4637.83 | bwd_inner_microstep: 4632.75 | bwd_allreduce_microstep: 4.97 | step_microstep: 45.29
-[2025-01-25 09:47:54,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.00 | bwd: 4637.85 | bwd_inner: 4632.75 | bwd_allreduce: 5.03 | step: 45.28
-  7%|▋         | 404/5800 [1:01:24<10:22:32,  6.92s/it]                                                       {'loss': 0.0571, 'grad_norm': 11.862768173217773, 'learning_rate': 3.983527523345453e-05, 'epoch': 3.48}
-  7%|▋         | 404/5800 [1:01:24<10:22:32,  6.92s/it]score1 tensor([[0.4316],
-        [0.5234],
-        [0.5234],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6133, 0.5039, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:01,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 09:48:01,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.55 | bwd_microstep: 4634.58 | bwd_inner_microstep: 4629.82 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.53
-[2025-01-25 09:48:01,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.52 | bwd: 4634.60 | bwd_inner: 4629.82 | bwd_allreduce: 4.72 | step: 43.54
-  7%|▋         | 405/5800 [1:01:31<10:22:20,  6.92s/it]                                                       {'loss': 0.0542, 'grad_norm': 0.623187243938446, 'learning_rate': 3.9833841721259515e-05, 'epoch': 3.49}
-  7%|▋         | 405/5800 [1:01:31<10:22:20,  6.92s/it]score1 tensor([[0.5273],
-        [0.5195],
-        [0.5820],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5625, 0.6875, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:08,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 09:48:08,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.17 | bwd_microstep: 4638.38 | bwd_inner_microstep: 4633.26 | bwd_allreduce_microstep: 5.01 | step_microstep: 42.99
-[2025-01-25 09:48:08,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.12 | bwd: 4638.40 | bwd_inner: 4633.26 | bwd_allreduce: 5.07 | step: 42.99
-  7%|▋         | 406/5800 [1:01:38<10:22:29,  6.92s/it]                                                       {'loss': 0.0679, 'grad_norm': 0.7771206498146057, 'learning_rate': 3.983240202452851e-05, 'epoch': 3.5}
-  7%|▋         | 406/5800 [1:01:38<10:22:29,  6.92s/it]score1 tensor([[0.5195],
-        [0.5430],
-        [0.5781],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5469, 0.6875, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:15,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 09:48:15,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4645.82 | bwd_inner_microstep: 4640.75 | bwd_allreduce_microstep: 4.97 | step_microstep: 47.83
-[2025-01-25 09:48:15,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.31 | bwd: 4645.85 | bwd_inner: 4640.75 | bwd_allreduce: 5.04 | step: 47.83
-  7%|▋         | 407/5800 [1:01:45<10:22:36,  6.93s/it]                                                       {'loss': 0.0776, 'grad_norm': 0.5877869725227356, 'learning_rate': 3.983095614371042e-05, 'epoch': 3.51}
-  7%|▋         | 407/5800 [1:01:45<10:22:36,  6.93s/it]score1 tensor([[0.6484],
-        [0.5625],
-        [0.5625],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5117, 0.4121, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1226, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:22,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 09:48:22,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.62 | bwd_microstep: 4640.90 | bwd_inner_microstep: 4635.86 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.44
-[2025-01-25 09:48:22,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.58 | bwd: 4640.93 | bwd_inner: 4635.86 | bwd_allreduce: 4.98 | step: 42.45
-  7%|▋         | 408/5800 [1:01:52<10:22:19,  6.93s/it]                                                       {'loss': 0.1226, 'grad_norm': 12.472979545593262, 'learning_rate': 3.9829504079256114e-05, 'epoch': 3.52}
-  7%|▋         | 408/5800 [1:01:52<10:22:19,  6.93s/it]score1 tensor([[0.6680],
-        [0.6133],
-        [0.5820],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.5898, 0.4551, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:29,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 09:48:29,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.82 | bwd_microstep: 4632.31 | bwd_inner_microstep: 4627.41 | bwd_allreduce_microstep: 4.82 | step_microstep: 47.50
-[2025-01-25 09:48:29,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.78 | bwd: 4632.34 | bwd_inner: 4627.41 | bwd_allreduce: 4.86 | step: 47.50
-  7%|▋         | 409/5800 [1:01:59<10:22:05,  6.92s/it]                                                       {'loss': 0.0532, 'grad_norm': 6.274942398071289, 'learning_rate': 3.9828045831618366e-05, 'epoch': 3.53}
-  7%|▋         | 409/5800 [1:01:59<10:22:05,  6.92s/it]score1 tensor([[0.6055],
-        [0.6055],
-        [0.4648],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6328, 0.4512, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:36,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 09:48:36,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4632.37 | bwd_inner_microstep: 4627.68 | bwd_allreduce_microstep: 4.60 | step_microstep: 45.73
-[2025-01-25 09:48:36,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4632.39 | bwd_inner: 4627.68 | bwd_allreduce: 4.64 | step: 45.74
-  7%|▋         | 410/5800 [1:02:05<10:21:40,  6.92s/it]                                                       {'loss': 0.0347, 'grad_norm': 6.121956825256348, 'learning_rate': 3.982658140125188e-05, 'epoch': 3.53}
-  7%|▋         | 410/5800 [1:02:05<10:21:40,  6.92s/it]score1 tensor([[0.4570],
-        [0.5156],
-        [0.3906],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.6289, 0.4258, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:42,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 09:48:42,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.18 | bwd_microstep: 4634.94 | bwd_inner_microstep: 4630.23 | bwd_allreduce_microstep: 4.63 | step_microstep: 47.71
-[2025-01-25 09:48:42,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.14 | bwd: 4634.97 | bwd_inner: 4630.23 | bwd_allreduce: 4.67 | step: 47.72
-  7%|▋         | 411/5800 [1:02:12<10:21:22,  6.92s/it]                                                       {'loss': 0.0503, 'grad_norm': 5.9917311668396, 'learning_rate': 3.982511078861329e-05, 'epoch': 3.54}
-  7%|▋         | 411/5800 [1:02:12<10:21:22,  6.92s/it]score1 tensor([[0.5469],
-        [0.5078],
-        [0.4434],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5469, 0.4258, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:49,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 09:48:49,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.55 | bwd_microstep: 4645.47 | bwd_inner_microstep: 4640.57 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.73
-[2025-01-25 09:48:49,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.51 | bwd: 4645.49 | bwd_inner: 4640.57 | bwd_allreduce: 4.85 | step: 42.74
-  7%|▋         | 412/5800 [1:02:19<10:21:50,  6.92s/it]                                                       {'loss': 0.0273, 'grad_norm': 6.093668460845947, 'learning_rate': 3.982363399416116e-05, 'epoch': 3.55}
-  7%|▋         | 412/5800 [1:02:19<10:21:50,  6.92s/it]score1 tensor([[0.4609],
-        [0.4863],
-        [0.4512],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4727, 0.4844, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:48:56,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 09:48:56,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.19 | bwd_microstep: 4631.56 | bwd_inner_microstep: 4626.36 | bwd_allreduce_microstep: 5.05 | step_microstep: 46.51
-[2025-01-25 09:48:56,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.14 | bwd: 4631.58 | bwd_inner: 4626.36 | bwd_allreduce: 5.15 | step: 46.52
-  7%|▋         | 413/5800 [1:02:26<10:21:22,  6.92s/it]                                                       {'loss': 0.0415, 'grad_norm': 6.0475945472717285, 'learning_rate': 3.982215101835598e-05, 'epoch': 3.56}
-  7%|▋         | 413/5800 [1:02:26<10:21:22,  6.92s/it]score1 tensor([[0.5625],
-        [0.5273],
-        [0.4824],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6172, 0.5547, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:03,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 09:49:03,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.29 | bwd_microstep: 4631.31 | bwd_inner_microstep: 4626.21 | bwd_allreduce_microstep: 5.03 | step_microstep: 48.62
-[2025-01-25 09:49:03,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.23 | bwd: 4631.33 | bwd_inner: 4626.21 | bwd_allreduce: 5.07 | step: 48.63
-  7%|▋         | 414/5800 [1:02:33<10:21:03,  6.92s/it]                                                       {'loss': 0.0605, 'grad_norm': 12.204127311706543, 'learning_rate': 3.9820661861660166e-05, 'epoch': 3.57}
-  7%|▋         | 414/5800 [1:02:33<10:21:03,  6.92s/it]score1 tensor([[0.4961],
-        [0.4688],
-        [0.4023],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5547, 0.4395, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:10,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 09:49:10,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.05 | bwd_microstep: 4633.57 | bwd_inner_microstep: 4628.95 | bwd_allreduce_microstep: 4.52 | step_microstep: 40.71
-[2025-01-25 09:49:10,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4633.59 | bwd_inner: 4628.95 | bwd_allreduce: 4.57 | step: 40.71
-  7%|▋         | 415/5800 [1:02:40<10:20:47,  6.92s/it]                                                       {'loss': 0.0488, 'grad_norm': 11.816289901733398, 'learning_rate': 3.9819166524538075e-05, 'epoch': 3.58}
-  7%|▋         | 415/5800 [1:02:40<10:20:47,  6.92s/it]score1 tensor([[0.5078],
-        [0.4902],
-        [0.5508],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4922, 0.5664, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:17,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 09:49:17,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.63 | bwd_microstep: 4630.93 | bwd_inner_microstep: 4626.41 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.39
-[2025-01-25 09:49:17,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4630.95 | bwd_inner: 4626.41 | bwd_allreduce: 4.48 | step: 42.40
-  7%|▋         | 416/5800 [1:02:47<10:20:29,  6.91s/it]                                                       {'loss': 0.0259, 'grad_norm': 6.155540943145752, 'learning_rate': 3.9817665007455964e-05, 'epoch': 3.59}
-  7%|▋         | 416/5800 [1:02:47<10:20:29,  6.91s/it]score1 tensor([[0.6484],
-        [0.6250],
-        [0.5195],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5547, 0.4551, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:24,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.39 | optimizer_step: 4.36
-[2025-01-25 09:49:24,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.52 | bwd_microstep: 4639.50 | bwd_inner_microstep: 4634.67 | bwd_allreduce_microstep: 4.73 | step_microstep: 35.71
-[2025-01-25 09:49:24,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.49 | bwd: 4639.52 | bwd_inner: 4634.67 | bwd_allreduce: 4.78 | step: 35.72
-  7%|▋         | 417/5800 [1:02:54<10:19:45,  6.91s/it]                                                       {'loss': 0.082, 'grad_norm': 12.593482971191406, 'learning_rate': 3.981615731088203e-05, 'epoch': 3.59}
-  7%|▋         | 417/5800 [1:02:54<10:19:45,  6.91s/it]score1 tensor([[0.7305],
-        [0.8633],
-        [0.5234],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6641, 0.4453, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1299, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:31,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 09:49:31,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4639.78 | bwd_inner_microstep: 4634.60 | bwd_allreduce_microstep: 5.09 | step_microstep: 47.54
-[2025-01-25 09:49:31,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.06 | bwd: 4639.81 | bwd_inner: 4634.60 | bwd_allreduce: 5.13 | step: 47.55
-  7%|▋         | 418/5800 [1:03:01<10:19:52,  6.91s/it]                                                       {'loss': 0.1299, 'grad_norm': 13.222386360168457, 'learning_rate': 3.981464343528642e-05, 'epoch': 3.6}
-  7%|▋         | 418/5800 [1:03:01<10:19:52,  6.91s/it]score1 tensor([[0.6172],
-        [0.8789],
-        [0.5039],
-        [0.3340]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6953, 0.4688, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1050, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:38,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 09:49:38,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4645.71 | bwd_inner_microstep: 4640.84 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.84
-[2025-01-25 09:49:38,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.14 | bwd: 4645.74 | bwd_inner: 4640.84 | bwd_allreduce: 4.83 | step: 41.85
-  7%|▋         | 419/5800 [1:03:08<10:20:10,  6.92s/it]                                                       {'loss': 0.105, 'grad_norm': 7.409669876098633, 'learning_rate': 3.9813123381141167e-05, 'epoch': 3.61}
-  7%|▋         | 419/5800 [1:03:08<10:20:10,  6.92s/it]score1 tensor([[0.8477],
-        [0.6211],
-        [0.5977],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5508, 0.6094, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1025, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:45,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 09:49:45,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.57 | bwd_microstep: 4636.52 | bwd_inner_microstep: 4631.27 | bwd_allreduce_microstep: 5.15 | step_microstep: 44.49
-[2025-01-25 09:49:45,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.53 | bwd: 4636.54 | bwd_inner: 4631.27 | bwd_allreduce: 5.20 | step: 44.49
-  7%|▋         | 420/5800 [1:03:15<10:20:14,  6.92s/it]                                                       {'loss': 0.1025, 'grad_norm': 1.5434376001358032, 'learning_rate': 3.9811597148920246e-05, 'epoch': 3.62}
-  7%|▋         | 420/5800 [1:03:15<10:20:14,  6.92s/it]score1 tensor([[0.5938],
-        [0.5117],
-        [0.4980],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4473, 0.4473, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0635, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:52,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 09:49:52,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.80 | bwd_microstep: 4640.49 | bwd_inner_microstep: 4634.96 | bwd_allreduce_microstep: 5.39 | step_microstep: 50.56
-[2025-01-25 09:49:52,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.76 | bwd: 4640.52 | bwd_inner: 4634.96 | bwd_allreduce: 5.47 | step: 50.57
-  7%|▋         | 421/5800 [1:03:22<10:20:23,  6.92s/it]                                                       {'loss': 0.0635, 'grad_norm': 12.08456039428711, 'learning_rate': 3.9810064739099586e-05, 'epoch': 3.63}
-  7%|▋         | 421/5800 [1:03:22<10:20:23,  6.92s/it]score1 tensor([[0.4297],
-        [0.4551],
-        [0.4648],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5195, 0.5039, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:49:59,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 09:49:59,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.95 | bwd_microstep: 4644.93 | bwd_inner_microstep: 4640.07 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.71
-[2025-01-25 09:49:59,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4644.95 | bwd_inner: 4640.07 | bwd_allreduce: 4.81 | step: 43.72
-  7%|▋         | 422/5800 [1:03:28<10:20:27,  6.92s/it]                                                       {'loss': 0.0444, 'grad_norm': 0.9089236259460449, 'learning_rate': 3.980852615215701e-05, 'epoch': 3.64}
-  7%|▋         | 422/5800 [1:03:28<10:20:27,  6.92s/it]score1 tensor([[0.4727],
-        [0.4492],
-        [0.4023],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4023, 0.3926, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:05,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 09:50:05,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.35 | bwd_microstep: 4646.38 | bwd_inner_microstep: 4641.84 | bwd_allreduce_microstep: 4.47 | step_microstep: 42.81
-[2025-01-25 09:50:05,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.30 | bwd: 4646.41 | bwd_inner: 4641.84 | bwd_allreduce: 4.51 | step: 42.82
-  7%|▋         | 423/5800 [1:03:35<10:20:26,  6.92s/it]                                                       {'loss': 0.0278, 'grad_norm': 11.863357543945312, 'learning_rate': 3.980698138857226e-05, 'epoch': 3.65}
-  7%|▋         | 423/5800 [1:03:35<10:20:26,  6.92s/it]score1 tensor([[0.3906],
-        [0.3574],
-        [0.2852],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4141, 0.4414, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1123, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:12,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 09:50:12,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.63 | bwd_microstep: 4635.38 | bwd_inner_microstep: 4630.44 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.35
-[2025-01-25 09:50:12,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.60 | bwd: 4635.41 | bwd_inner: 4630.44 | bwd_allreduce: 4.89 | step: 45.36
-  7%|▋         | 424/5800 [1:03:42<10:20:07,  6.92s/it]                                                       {'loss': 0.1123, 'grad_norm': 11.310232162475586, 'learning_rate': 3.980543044882703e-05, 'epoch': 3.66}
-  7%|▋         | 424/5800 [1:03:42<10:20:07,  6.92s/it]score1 tensor([[0.4023],
-        [0.4316],
-        [0.3457],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5156, 0.5000, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1416, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:19,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 09:50:19,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.77 | bwd_microstep: 4634.74 | bwd_inner_microstep: 4630.04 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.09
-[2025-01-25 09:50:19,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.73 | bwd: 4634.76 | bwd_inner: 4630.04 | bwd_allreduce: 4.66 | step: 43.13
-  7%|▋         | 425/5800 [1:03:49<10:19:47,  6.92s/it]                                                       {'loss': 0.1416, 'grad_norm': 11.571310043334961, 'learning_rate': 3.980387333340494e-05, 'epoch': 3.66}
-  7%|▋         | 425/5800 [1:03:49<10:19:47,  6.92s/it]score1 tensor([[0.4766],
-        [0.3223],
-        [0.3496],
-        [0.2949]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4414, 0.3105, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:26,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.37
-[2025-01-25 09:50:26,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.66 | bwd_microstep: 4634.94 | bwd_inner_microstep: 4630.06 | bwd_allreduce_microstep: 4.80 | step_microstep: 41.72
-[2025-01-25 09:50:26,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.63 | bwd: 4634.96 | bwd_inner: 4630.06 | bwd_allreduce: 4.84 | step: 41.73
-  7%|▋         | 426/5800 [1:03:56<10:19:27,  6.92s/it]                                                       {'loss': 0.0918, 'grad_norm': 5.676902770996094, 'learning_rate': 3.980231004279151e-05, 'epoch': 3.67}
-  7%|▋         | 426/5800 [1:03:56<10:19:27,  6.92s/it]score1 tensor([[0.3047],
-        [0.3555],
-        [0.3164],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.4648, 0.4004, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1099, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:33,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 09:50:33,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.55 | bwd_microstep: 4641.48 | bwd_inner_microstep: 4636.32 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.45
-[2025-01-25 09:50:33,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.51 | bwd: 4641.51 | bwd_inner: 4636.32 | bwd_allreduce: 5.12 | step: 43.45
-  7%|▋         | 427/5800 [1:04:03<10:19:27,  6.92s/it]                                                       {'loss': 0.1099, 'grad_norm': 11.111123085021973, 'learning_rate': 3.980074057747422e-05, 'epoch': 3.68}
-  7%|▋         | 427/5800 [1:04:03<10:19:27,  6.92s/it]score1 tensor([[0.4023],
-        [0.4844],
-        [0.4785],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5391, 0.5625, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:40,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 09:50:40,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.29 | bwd_microstep: 4644.31 | bwd_inner_microstep: 4639.66 | bwd_allreduce_microstep: 4.56 | step_microstep: 43.71
-[2025-01-25 09:50:40,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.24 | bwd: 4644.34 | bwd_inner: 4639.66 | bwd_allreduce: 4.60 | step: 43.72
-  7%|▋         | 428/5800 [1:04:10<10:19:43,  6.92s/it]                                                       {'loss': 0.0396, 'grad_norm': 11.55932331085205, 'learning_rate': 3.979916493794244e-05, 'epoch': 3.69}
-  7%|▋         | 428/5800 [1:04:10<10:19:43,  6.92s/it]score1 tensor([[0.5156],
-        [0.5898],
-        [0.4590],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5352, 0.4648, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:47,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 09:50:47,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.23 | bwd_microstep: 4595.98 | bwd_inner_microstep: 4591.08 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.23
-[2025-01-25 09:50:47,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.19 | bwd: 4596.00 | bwd_inner: 4591.08 | bwd_allreduce: 4.85 | step: 44.24
-  7%|▋         | 429/5800 [1:04:17<10:18:30,  6.91s/it]                                                       {'loss': 0.0415, 'grad_norm': 3.022033929824829, 'learning_rate': 3.9797583124687485e-05, 'epoch': 3.7}
-  7%|▋         | 429/5800 [1:04:17<10:18:30,  6.91s/it]score1 tensor([[0.6289],
-        [0.5859],
-        [0.6289],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4316, 0.5195, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1226, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:50:54,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.37
-[2025-01-25 09:50:54,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4637.87 | bwd_inner_microstep: 4633.33 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.37
-[2025-01-25 09:50:54,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4637.89 | bwd_inner: 4633.33 | bwd_allreduce: 4.49 | step: 42.38
-  7%|▋         | 430/5800 [1:04:24<10:18:34,  6.91s/it]                                                       {'loss': 0.1226, 'grad_norm': 6.092610836029053, 'learning_rate': 3.9795995138202596e-05, 'epoch': 3.71}
-  7%|▋         | 430/5800 [1:04:24<10:18:34,  6.91s/it]score1 tensor([[0.6992],
-        [0.6523],
-        [0.6133],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3262, 0.3457, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2354, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:01,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 09:51:01,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.68 | bwd_microstep: 4639.66 | bwd_inner_microstep: 4634.88 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.93
-[2025-01-25 09:51:01,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.65 | bwd: 4639.69 | bwd_inner: 4634.88 | bwd_allreduce: 4.74 | step: 42.94
-  7%|▋         | 431/5800 [1:04:31<10:18:32,  6.91s/it]                                                       {'loss': 0.2354, 'grad_norm': 12.50498104095459, 'learning_rate': 3.9794400978982926e-05, 'epoch': 3.72}
-  7%|▋         | 431/5800 [1:04:31<10:18:32,  6.91s/it]score1 tensor([[0.6680],
-        [0.6797],
-        [0.7188],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4785, 0.6055, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:08,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 09:51:08,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.92 | bwd_microstep: 4642.97 | bwd_inner_microstep: 4638.22 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.63
-[2025-01-25 09:51:08,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4642.99 | bwd_inner: 4638.22 | bwd_allreduce: 4.70 | step: 41.64
-  7%|▋         | 432/5800 [1:04:38<10:18:45,  6.92s/it]                                                       {'loss': 0.1426, 'grad_norm': 12.485721588134766, 'learning_rate': 3.9792800647525575e-05, 'epoch': 3.72}
-  7%|▋         | 432/5800 [1:04:38<10:18:45,  6.92s/it]score1 tensor([[0.6758],
-        [0.5977],
-        [0.6758],
-        [0.7188]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3711, 0.4473, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1914, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:15,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:51:15,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4636.64 | bwd_inner_microstep: 4631.75 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.65
-[2025-01-25 09:51:15,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4636.67 | bwd_inner: 4631.75 | bwd_allreduce: 4.85 | step: 43.66
-  7%|▋         | 433/5800 [1:04:45<10:18:34,  6.92s/it]                                                       {'loss': 0.1914, 'grad_norm': 12.341293334960938, 'learning_rate': 3.9791194144329545e-05, 'epoch': 3.73}
-  7%|▋         | 433/5800 [1:04:45<10:18:34,  6.92s/it]score1 tensor([[0.6523],
-        [0.6719],
-        [0.6445],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6250, 0.5664, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:22,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 09:51:22,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.14 | bwd_microstep: 4646.52 | bwd_inner_microstep: 4642.03 | bwd_allreduce_microstep: 4.41 | step_microstep: 42.17
-[2025-01-25 09:51:22,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.11 | bwd: 4646.54 | bwd_inner: 4642.03 | bwd_allreduce: 4.45 | step: 42.18
-  7%|▋         | 434/5800 [1:04:51<10:18:39,  6.92s/it]                                                       {'loss': 0.043, 'grad_norm': 12.17960262298584, 'learning_rate': 3.978958146989578e-05, 'epoch': 3.74}
-  7%|▋         | 434/5800 [1:04:51<10:18:39,  6.92s/it]score1 tensor([[0.6016],
-        [0.5898],
-        [0.6133],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4941, 0.5977, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:28,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 09:51:28,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4636.36 | bwd_inner_microstep: 4631.83 | bwd_allreduce_microstep: 4.46 | step_microstep: 46.61
-[2025-01-25 09:51:28,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.69 | bwd: 4636.39 | bwd_inner: 4631.83 | bwd_allreduce: 4.50 | step: 46.61
-  8%|▊         | 435/5800 [1:04:58<10:18:22,  6.92s/it]                                                       {'loss': 0.0723, 'grad_norm': 11.9556884765625, 'learning_rate': 3.9787962624727126e-05, 'epoch': 3.75}
-  8%|▊         | 435/5800 [1:04:58<10:18:22,  6.92s/it]score1 tensor([[0.5391],
-        [0.5547],
-        [0.5938],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5430, 0.6211, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:35,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 09:51:35,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.54 | bwd_microstep: 4643.21 | bwd_inner_microstep: 4638.63 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.72
-[2025-01-25 09:51:35,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.51 | bwd: 4643.25 | bwd_inner: 4638.63 | bwd_allreduce: 4.53 | step: 42.72
-  8%|▊         | 436/5800 [1:05:05<10:18:25,  6.92s/it]                                                       {'loss': 0.0371, 'grad_norm': 5.676013469696045, 'learning_rate': 3.9786337609328374e-05, 'epoch': 3.76}
-  8%|▊         | 436/5800 [1:05:05<10:18:25,  6.92s/it]score1 tensor([[0.4258],
-        [0.4785],
-        [0.4590],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6172, 0.5391, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:42,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 09:51:42,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.46 | bwd_microstep: 4634.32 | bwd_inner_microstep: 4629.44 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.11
-[2025-01-25 09:51:42,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.42 | bwd: 4634.34 | bwd_inner: 4629.44 | bwd_allreduce: 4.83 | step: 43.12
-  8%|▊         | 437/5800 [1:05:12<10:18:17,  6.92s/it]                                                       {'loss': 0.0776, 'grad_norm': 11.170373916625977, 'learning_rate': 3.978470642420623e-05, 'epoch': 3.77}
-  8%|▊         | 437/5800 [1:05:12<10:18:17,  6.92s/it]score1 tensor([[0.4238],
-        [0.4023],
-        [0.4219],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4199, 0.6133, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:49,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 09:51:49,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.66 | bwd_microstep: 4646.38 | bwd_inner_microstep: 4641.27 | bwd_allreduce_microstep: 5.00 | step_microstep: 41.97
-[2025-01-25 09:51:49,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.63 | bwd: 4646.41 | bwd_inner: 4641.27 | bwd_allreduce: 5.06 | step: 41.97
-  8%|▊         | 438/5800 [1:05:19<10:18:17,  6.92s/it]                                                       {'loss': 0.1094, 'grad_norm': 11.001954078674316, 'learning_rate': 3.978306906986934e-05, 'epoch': 3.78}
-  8%|▊         | 438/5800 [1:05:19<10:18:17,  6.92s/it]score1 tensor([[0.4043],
-        [0.3887],
-        [0.3164],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4590, 0.3086, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:51:56,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 09:51:56,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.05 | bwd_microstep: 4636.22 | bwd_inner_microstep: 4631.73 | bwd_allreduce_microstep: 4.40 | step_microstep: 41.90
-[2025-01-25 09:51:56,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.99 | bwd: 4636.25 | bwd_inner: 4631.73 | bwd_allreduce: 4.45 | step: 41.91
-  8%|▊         | 439/5800 [1:05:26<10:18:02,  6.92s/it]                                                       {'loss': 0.0522, 'grad_norm': 5.545100688934326, 'learning_rate': 3.9781425546828235e-05, 'epoch': 3.78}
-  8%|▊         | 439/5800 [1:05:26<10:18:02,  6.92s/it]score1 tensor([[0.3828],
-        [0.4102],
-        [0.4141],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4785, 0.5664, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1050, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:03,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 09:52:03,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.08 | bwd_microstep: 4648.42 | bwd_inner_microstep: 4643.87 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.63
-[2025-01-25 09:52:03,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.03 | bwd: 4648.45 | bwd_inner: 4643.87 | bwd_allreduce: 4.50 | step: 41.64
-  8%|▊         | 440/5800 [1:05:33<10:18:07,  6.92s/it]                                                       {'loss': 0.105, 'grad_norm': 5.5511579513549805, 'learning_rate': 3.977977585559542e-05, 'epoch': 3.79}
-  8%|▊         | 440/5800 [1:05:33<10:18:07,  6.92s/it]score1 tensor([[0.4102],
-        [0.4355],
-        [0.4316],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5547, 0.4902, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1216, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:10,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 09:52:10,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.56 | bwd_microstep: 4641.93 | bwd_inner_microstep: 4637.21 | bwd_allreduce_microstep: 4.62 | step_microstep: 47.59
-[2025-01-25 09:52:10,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.53 | bwd: 4641.95 | bwd_inner: 4637.21 | bwd_allreduce: 4.67 | step: 47.60
-  8%|▊         | 441/5800 [1:05:40<10:18:03,  6.92s/it]                                                       {'loss': 0.1216, 'grad_norm': 10.817839622497559, 'learning_rate': 3.9778119996685275e-05, 'epoch': 3.8}
-  8%|▊         | 441/5800 [1:05:40<10:18:03,  6.92s/it]score1 tensor([[0.4727],
-        [0.4395],
-        [0.5039],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4512, 0.5977, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:17,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 09:52:17,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.38 | bwd_microstep: 4642.24 | bwd_inner_microstep: 4637.55 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.54
-[2025-01-25 09:52:17,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.34 | bwd: 4642.26 | bwd_inner: 4637.55 | bwd_allreduce: 4.64 | step: 42.55
-  8%|▊         | 442/5800 [1:05:47<10:18:09,  6.92s/it]                                                       {'loss': 0.0737, 'grad_norm': 11.029022216796875, 'learning_rate': 3.9776457970614136e-05, 'epoch': 3.81}
-  8%|▊         | 442/5800 [1:05:47<10:18:09,  6.92s/it]score1 tensor([[0.5469],
-        [0.5273],
-        [0.5234],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.3750, 0.4238, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0972, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:24,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 09:52:24,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.53 | bwd_microstep: 4633.54 | bwd_inner_microstep: 4629.00 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.48
-[2025-01-25 09:52:24,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.50 | bwd: 4633.57 | bwd_inner: 4629.00 | bwd_allreduce: 4.50 | step: 41.50
-  8%|▊         | 443/5800 [1:05:54<10:17:47,  6.92s/it]                                                       {'loss': 0.0972, 'grad_norm': 11.300838470458984, 'learning_rate': 3.9774789777900247e-05, 'epoch': 3.82}
-  8%|▊         | 443/5800 [1:05:54<10:17:47,  6.92s/it]score1 tensor([[0.5977],
-        [0.5820],
-        [0.5703],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4473, 0.5469, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1138, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:31,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:52:31,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.76 | bwd_microstep: 4644.69 | bwd_inner_microstep: 4640.14 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.14
-[2025-01-25 09:52:31,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.73 | bwd: 4644.71 | bwd_inner: 4640.14 | bwd_allreduce: 4.50 | step: 41.15
-  8%|▊         | 444/5800 [1:06:01<10:17:42,  6.92s/it]                                                       {'loss': 0.1138, 'grad_norm': 11.537907600402832, 'learning_rate': 3.977311541906379e-05, 'epoch': 3.83}
-  8%|▊         | 444/5800 [1:06:01<10:17:42,  6.92s/it]score1 tensor([[0.5703],
-        [0.5508],
-        [0.6250],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4043, 0.6836, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:38,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 09:52:38,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.95 | bwd_microstep: 4642.07 | bwd_inner_microstep: 4637.33 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.74
-[2025-01-25 09:52:38,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.92 | bwd: 4642.10 | bwd_inner: 4637.33 | bwd_allreduce: 4.69 | step: 42.74
-  8%|▊         | 445/5800 [1:06:08<10:17:34,  6.92s/it]                                                       {'loss': 0.0977, 'grad_norm': 5.677021026611328, 'learning_rate': 3.977143489462685e-05, 'epoch': 3.84}
-  8%|▊         | 445/5800 [1:06:08<10:17:34,  6.92s/it]score1 tensor([[0.5391],
-        [0.6328],
-        [0.5977],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.6133, 0.5117, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:45,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 09:52:45,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4641.98 | bwd_inner_microstep: 4637.36 | bwd_allreduce_microstep: 4.52 | step_microstep: 43.13
-[2025-01-25 09:52:45,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.82 | bwd: 4642.01 | bwd_inner: 4637.37 | bwd_allreduce: 4.56 | step: 43.13
-  8%|▊         | 446/5800 [1:06:15<10:17:32,  6.92s/it]                                                       {'loss': 0.0879, 'grad_norm': 11.496918678283691, 'learning_rate': 3.976974820511345e-05, 'epoch': 3.84}
-  8%|▊         | 446/5800 [1:06:15<10:17:32,  6.92s/it]score1 tensor([[0.5977],
-        [0.5273],
-        [0.4941],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5469, 0.3867, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:51,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 09:52:51,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.58 | bwd_microstep: 4633.38 | bwd_inner_microstep: 4628.41 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.65
-[2025-01-25 09:52:51,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.55 | bwd: 4633.41 | bwd_inner: 4628.41 | bwd_allreduce: 4.93 | step: 44.66
-  8%|▊         | 447/5800 [1:06:21<10:17:14,  6.92s/it]                                                       {'loss': 0.0527, 'grad_norm': 0.7733369469642639, 'learning_rate': 3.976805535104951e-05, 'epoch': 3.85}
-  8%|▊         | 447/5800 [1:06:21<10:17:14,  6.92s/it]score1 tensor([[0.5586],
-        [0.4961],
-        [0.5039],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4844, 0.5000, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:52:58,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.37
-[2025-01-25 09:52:58,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.10 | bwd_microstep: 4636.42 | bwd_inner_microstep: 4631.84 | bwd_allreduce_microstep: 4.46 | step_microstep: 46.08
-[2025-01-25 09:52:58,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.07 | bwd: 4636.45 | bwd_inner: 4631.84 | bwd_allreduce: 4.51 | step: 46.09
-  8%|▊         | 448/5800 [1:06:28<10:17:05,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.7653846740722656, 'learning_rate': 3.976635633296292e-05, 'epoch': 3.86}
-  8%|▊         | 448/5800 [1:06:28<10:17:05,  6.92s/it]score1 tensor([[0.5117],
-        [0.5312],
-        [0.4824],
-        [0.3574]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.6562, 0.4570, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:05,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 09:53:05,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.16 | bwd_microstep: 4644.55 | bwd_inner_microstep: 4639.96 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.70
-[2025-01-25 09:53:05,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.13 | bwd: 4644.58 | bwd_inner: 4639.96 | bwd_allreduce: 4.55 | step: 41.71
-  8%|▊         | 449/5800 [1:06:35<10:17:04,  6.92s/it]                                                       {'loss': 0.0977, 'grad_norm': 1.292911410331726, 'learning_rate': 3.9764651151383444e-05, 'epoch': 3.87}
-  8%|▊         | 449/5800 [1:06:35<10:17:04,  6.92s/it]score1 tensor([[0.3711],
-        [0.4512],
-        [0.4082],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.3477, 0.4922, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0850, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:12,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 09:53:12,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.17 | bwd_microstep: 4639.51 | bwd_inner_microstep: 4634.63 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.60
-[2025-01-25 09:53:12,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.13 | bwd: 4639.55 | bwd_inner: 4634.63 | bwd_allreduce: 4.83 | step: 41.60
-  8%|▊         | 450/5800 [1:06:42<10:16:50,  6.92s/it]                                                       {'loss': 0.085, 'grad_norm': 0.8805148005485535, 'learning_rate': 3.97629398068428e-05, 'epoch': 3.88}
-  8%|▊         | 450/5800 [1:06:42<10:16:50,  6.92s/it]score1 tensor([[0.3984],
-        [0.4141],
-        [0.4395],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4961, 0.4180, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:19,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 09:53:19,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.07 | bwd_microstep: 4633.00 | bwd_inner_microstep: 4628.56 | bwd_allreduce_microstep: 4.35 | step_microstep: 41.49
-[2025-01-25 09:53:19,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.04 | bwd: 4633.03 | bwd_inner: 4628.56 | bwd_allreduce: 4.40 | step: 41.49
-  8%|▊         | 451/5800 [1:06:49<10:16:25,  6.91s/it]                                                       {'loss': 0.0518, 'grad_norm': 5.3423967361450195, 'learning_rate': 3.9761222299874595e-05, 'epoch': 3.89}
-  8%|▊         | 451/5800 [1:06:49<10:16:25,  6.91s/it]score1 tensor([[0.3203],
-        [0.5469],
-        [0.3867],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.6172, 0.4492, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:26,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 09:53:26,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4640.03 | bwd_inner_microstep: 4635.22 | bwd_allreduce_microstep: 4.73 | step_microstep: 41.53
-[2025-01-25 09:53:26,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4640.05 | bwd_inner: 4635.22 | bwd_allreduce: 4.77 | step: 41.53
-  8%|▊         | 452/5800 [1:06:56<10:16:47,  6.92s/it]                                                       {'loss': 0.084, 'grad_norm': 10.741913795471191, 'learning_rate': 3.97594986310144e-05, 'epoch': 3.9}
-  8%|▊         | 452/5800 [1:06:56<10:16:47,  6.92s/it]score1 tensor([[0.4531],
-        [0.5195],
-        [0.5469],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3906, 0.5508, 0.6133, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:33,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 09:53:33,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.62 | bwd_microstep: 4634.35 | bwd_inner_microstep: 4629.68 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.97
-[2025-01-25 09:53:33,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.58 | bwd: 4634.38 | bwd_inner: 4629.68 | bwd_allreduce: 4.61 | step: 42.98
-  8%|▊         | 453/5800 [1:07:03<10:16:30,  6.92s/it]                                                       {'loss': 0.0493, 'grad_norm': 5.762066841125488, 'learning_rate': 3.9757768800799666e-05, 'epoch': 3.91}
-  8%|▊         | 453/5800 [1:07:03<10:16:30,  6.92s/it]score1 tensor([[0.5508],
-        [0.4980],
-        [0.6797],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5039, 0.6445, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:40,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 09:53:40,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.47 | bwd_microstep: 4642.26 | bwd_inner_microstep: 4637.52 | bwd_allreduce_microstep: 4.64 | step_microstep: 46.71
-[2025-01-25 09:53:40,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4642.28 | bwd_inner: 4637.52 | bwd_allreduce: 4.69 | step: 46.72
-  8%|▊         | 454/5800 [1:07:10<10:16:24,  6.92s/it]                                                       {'loss': 0.0381, 'grad_norm': 6.1919026374816895, 'learning_rate': 3.975603280976979e-05, 'epoch': 3.91}
-  8%|▊         | 454/5800 [1:07:10<10:16:24,  6.92s/it]score1 tensor([[0.5078],
-        [0.5078],
-        [0.4844],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4785, 0.4004, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:47,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 09:53:47,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.05 | bwd_microstep: 4641.36 | bwd_inner_microstep: 4636.54 | bwd_allreduce_microstep: 4.73 | step_microstep: 46.40
-[2025-01-25 09:53:47,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.01 | bwd: 4641.38 | bwd_inner: 4636.54 | bwd_allreduce: 4.77 | step: 46.41
-  8%|▊         | 455/5800 [1:07:17<10:16:22,  6.92s/it]                                                       {'loss': 0.0454, 'grad_norm': 5.705972671508789, 'learning_rate': 3.97542906584661e-05, 'epoch': 3.92}
-  8%|▊         | 455/5800 [1:07:17<10:16:22,  6.92s/it]score1 tensor([[0.5078],
-        [0.5156],
-        [0.4160],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4727, 0.4668, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:53:54,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 09:53:54,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.39 | bwd_microstep: 4643.00 | bwd_inner_microstep: 4637.58 | bwd_allreduce_microstep: 5.30 | step_microstep: 44.21
-[2025-01-25 09:53:54,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.35 | bwd: 4643.03 | bwd_inner: 4637.58 | bwd_allreduce: 5.37 | step: 44.22
-  8%|▊         | 456/5800 [1:07:24<10:16:21,  6.92s/it]                                                       {'loss': 0.0757, 'grad_norm': 5.92673921585083, 'learning_rate': 3.975254234743181e-05, 'epoch': 3.93}
-  8%|▊         | 456/5800 [1:07:24<10:16:21,  6.92s/it]score1 tensor([[0.5234],
-        [0.4043],
-        [0.4141],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4219, 0.5039, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:54:01,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 09:54:01,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.35 | bwd_microstep: 4633.64 | bwd_inner_microstep: 4629.17 | bwd_allreduce_microstep: 4.38 | step_microstep: 44.01
-[2025-01-25 09:54:01,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.31 | bwd: 4633.66 | bwd_inner: 4629.17 | bwd_allreduce: 4.43 | step: 44.03
-  8%|▊         | 457/5800 [1:07:31<10:15:58,  6.92s/it]                                                       {'loss': 0.0693, 'grad_norm': 5.337062835693359, 'learning_rate': 3.975078787721208e-05, 'epoch': 3.94}
-  8%|▊         | 457/5800 [1:07:31<10:15:58,  6.92s/it]score1 tensor([[0.4941],
-        [0.5430],
-        [0.4941],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5391, 0.5586, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:54:08,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 09:54:08,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.55 | bwd_microstep: 4642.19 | bwd_inner_microstep: 4637.35 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.20
-[2025-01-25 09:54:08,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4642.21 | bwd_inner: 4637.35 | bwd_allreduce: 4.80 | step: 45.21
-  8%|▊         | 458/5800 [1:07:38<10:15:52,  6.92s/it]                                                       {'loss': 0.0522, 'grad_norm': 5.751522541046143, 'learning_rate': 3.9749027248353986e-05, 'epoch': 3.95}
-  8%|▊         | 458/5800 [1:07:38<10:15:52,  6.92s/it]score1 tensor([[0.5195],
-        [0.5078],
-        [0.4883],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4863, 0.5742, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:54:14,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 09:54:14,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4640.44 | bwd_inner_microstep: 4636.11 | bwd_allreduce_microstep: 4.24 | step_microstep: 40.88
-[2025-01-25 09:54:14,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4640.46 | bwd_inner: 4636.11 | bwd_allreduce: 4.28 | step: 40.89
-  8%|▊         | 459/5800 [1:07:44<10:15:42,  6.92s/it]                                                       {'loss': 0.0483, 'grad_norm': 5.797513008117676, 'learning_rate': 3.974726046140651e-05, 'epoch': 3.96}
-  8%|▊         | 459/5800 [1:07:44<10:15:42,  6.92s/it]score1 tensor([[0.5156],
-        [0.3926],
-        [0.5625],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4766, 0.5781, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 09:54:21,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 09:54:21,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.34 | bwd_microstep: 4635.68 | bwd_inner_microstep: 4631.08 | bwd_allreduce_microstep: 4.52 | step_microstep: 40.97
-[2025-01-25 09:54:21,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.31 | bwd: 4635.70 | bwd_inner: 4631.07 | bwd_allreduce: 4.56 | step: 40.98
-  8%|▊         | 460/5800 [1:07:51<10:15:27,  6.92s/it]                                                       {'loss': 0.0703, 'grad_norm': 5.549680709838867, 'learning_rate': 3.9745487516920584e-05, 'epoch': 3.97}
-  8%|▊         | 460/5800 [1:07:51<10:15:27,  6.92s/it]evaluate!
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3828]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3984]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3965]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3867]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2070, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1562, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3770]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3633]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.2930]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3672]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1543, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3848]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2129, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3496]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1934, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3516]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3516]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2148, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3711]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1973, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.5747556565665081
-PLCC_score: 0.5536437101097104
-KRCC_score: 0.39969380943105237
-SRCC_level: 0.5747556565665081
-PLCC_level: 0.5536437101097104
-KRCC_level: 0.39969380943105237
-New best SRCC_score: 0.5747556565665081. Saving model...
-[INFO|trainer.py:3705] 2025-01-25 10:04:46,071 >> Saving model checkpoint to /DATA/env/wjr/newtrain/stage2/mos3
-[INFO|configuration_utils.py:410] 2025-01-25 10:04:46,078 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/config.json
-[INFO|configuration_utils.py:868] 2025-01-25 10:04:46,079 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/generation_config.json
-[INFO|modeling_utils.py:2844] 2025-01-25 10:06:23,912 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /DATA/env/wjr/newtrain/stage2/mos3/model.safetensors.index.json.
-[INFO|tokenization_utils_base.py:2641] 2025-01-25 10:06:23,916 >> tokenizer config file saved in /DATA/env/wjr/newtrain/stage2/mos3/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2650] 2025-01-25 10:06:23,916 >> Special tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/special_tokens_map.json
-[INFO|tokenization_utils_base.py:2701] 2025-01-25 10:06:23,916 >> added tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/added_tokens.json
-01/25/2025 10:06:38 - INFO - __main__ - Saved LoRA weights to /DATA/env/wjr/newtrain/stage2/mos3/lora_weights.pth
-score1 tensor([[0.4883],
-        [0.4531],
-        [0.6094],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4746, 0.6094, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:06:45,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 10:06:45,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2122.26 | bwd_microstep: 4527.47 | bwd_inner_microstep: 4522.79 | bwd_allreduce_microstep: 4.60 | step_microstep: 61.31
-[2025-01-25 10:06:45,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2122.21 | bwd: 4527.50 | bwd_inner: 4522.79 | bwd_allreduce: 4.64 | step: 61.33
-  8%|▊         | 461/5800 [1:20:15<338:02:46, 227.94s/it]                                                         {'loss': 0.042, 'grad_norm': 2.8534791469573975, 'learning_rate': 3.974370841544904e-05, 'epoch': 3.97}
-  8%|▊         | 461/5800 [1:20:15<338:02:46, 227.94s/it]score1 tensor([[0.6133],
-        [0.6172],
-        [0.5117],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6562, 0.5820, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:06:52,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 10:06:52,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.09 | bwd_microstep: 4587.90 | bwd_inner_microstep: 4582.93 | bwd_allreduce_microstep: 4.86 | step_microstep: 48.18
-[2025-01-25 10:06:52,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.05 | bwd: 4587.92 | bwd_inner: 4582.93 | bwd_allreduce: 4.91 | step: 48.19
-  8%|▊         | 462/5800 [1:20:22<239:37:58, 161.61s/it]                                                         {'loss': 0.0439, 'grad_norm': 6.029369354248047, 'learning_rate': 3.974192315754663e-05, 'epoch': 3.98}
-  8%|▊         | 462/5800 [1:20:22<239:37:58, 161.61s/it]score1 tensor([[0.5938],
-        [0.5469],
-        [0.5664],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4492, 0.4863, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:06:59,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 10:06:59,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.51 | bwd_microstep: 4610.70 | bwd_inner_microstep: 4605.77 | bwd_allreduce_microstep: 4.84 | step_microstep: 41.96
-[2025-01-25 10:06:59,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.47 | bwd: 4610.73 | bwd_inner: 4605.77 | bwd_allreduce: 4.88 | step: 41.97
-  8%|▊         | 463/5800 [1:20:29<170:46:01, 115.19s/it]                                                         {'loss': 0.0601, 'grad_norm': 6.117221355438232, 'learning_rate': 3.974013174377002e-05, 'epoch': 3.99}
-  8%|▊         | 463/5800 [1:20:29<170:46:01, 115.19s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:07:04,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 10:07:04,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 568.51 | bwd_microstep: 1212.91 | bwd_inner_microstep: 1208.22 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.26
-[2025-01-25 10:07:04,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 568.46 | bwd: 1212.93 | bwd_inner: 1208.22 | bwd_allreduce: 4.64 | step: 42.27
-  8%|▊         | 464/5800 [1:20:34<121:46:30, 82.16s/it]                                                         {'loss': 0.0781, 'grad_norm': 11.48621940612793, 'learning_rate': 3.9738334174677816e-05, 'epoch': 4.0}
-  8%|▊         | 464/5800 [1:20:34<121:46:30, 82.16s/it][2025-01-25 10:07:08,786] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:07:19,109] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:07:29,355] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:07:39,352] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4902],
-        [0.5898],
-        [0.5273],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5156, 0.5508, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:07:57,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.37
-[2025-01-25 10:07:57,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2129.98 | bwd_microstep: 4584.81 | bwd_inner_microstep: 4579.84 | bwd_allreduce_microstep: 4.85 | step_microstep: 49.15
-[2025-01-25 10:07:57,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2129.95 | bwd: 4584.83 | bwd_inner: 4579.84 | bwd_allreduce: 4.91 | step: 49.16
-  8%|▊         | 465/5800 [1:21:27<108:43:59, 73.37s/it]                                                        {'loss': 0.0435, 'grad_norm': 0.8772261142730713, 'learning_rate': 3.9736530450830525e-05, 'epoch': 4.01}
-  8%|▊         | 465/5800 [1:21:27<108:43:59, 73.37s/it]score1 tensor([[0.6094],
-        [0.7031],
-        [0.5508],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6953, 0.4375, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:04,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 10:08:04,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.78 | bwd_microstep: 4608.14 | bwd_inner_microstep: 4603.31 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.40
-[2025-01-25 10:08:04,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.75 | bwd: 4608.17 | bwd_inner: 4603.31 | bwd_allreduce: 4.79 | step: 42.41
-  8%|▊         | 466/5800 [1:21:34<79:09:09, 53.42s/it]                                                        {'loss': 0.0488, 'grad_norm': 6.0489020347595215, 'learning_rate': 3.9734720572790586e-05, 'epoch': 4.02}
-  8%|▊         | 466/5800 [1:21:34<79:09:09, 53.42s/it]score1 tensor([[0.4609],
-        [0.5898],
-        [0.4590],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5078, 0.3887, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0635, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:10,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:08:10,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.80 | bwd_microstep: 4584.72 | bwd_inner_microstep: 4572.35 | bwd_allreduce_microstep: 12.27 | step_microstep: 46.77
-[2025-01-25 10:08:10,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.76 | bwd: 4584.75 | bwd_inner: 4572.35 | bwd_allreduce: 12.33 | step: 46.79
-  8%|▊         | 467/5800 [1:21:40<58:26:31, 39.45s/it]                                                       {'loss': 0.0635, 'grad_norm': 11.106167793273926, 'learning_rate': 3.973290454112234e-05, 'epoch': 4.03}
-  8%|▊         | 467/5800 [1:21:40<58:26:31, 39.45s/it]score1 tensor([[0.5742],
-        [0.4727],
-        [0.3711],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4785, 0.3789, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:17,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 10:08:17,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.04 | bwd_microstep: 4584.79 | bwd_inner_microstep: 4579.96 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.55
-[2025-01-25 10:08:17,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.00 | bwd: 4584.81 | bwd_inner: 4579.96 | bwd_allreduce: 4.78 | step: 42.56
-  8%|▊         | 468/5800 [1:21:47<43:56:47, 29.67s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.8516520261764526, 'learning_rate': 3.973108235639206e-05, 'epoch': 4.03}
-  8%|▊         | 468/5800 [1:21:47<43:56:47, 29.67s/it]score1 tensor([[0.4922],
-        [0.5547],
-        [0.4551],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6797, 0.4590, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:24,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 10:08:24,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.93 | bwd_microstep: 4593.56 | bwd_inner_microstep: 4588.13 | bwd_allreduce_microstep: 5.33 | step_microstep: 46.92
-[2025-01-25 10:08:24,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.89 | bwd: 4593.58 | bwd_inner: 4588.13 | bwd_allreduce: 5.38 | step: 46.93
-  8%|▊         | 469/5800 [1:21:54<33:48:05, 22.83s/it]                                                       {'loss': 0.0757, 'grad_norm': 10.966355323791504, 'learning_rate': 3.972925401916794e-05, 'epoch': 4.04}
-  8%|▊         | 469/5800 [1:21:54<33:48:05, 22.83s/it]score1 tensor([[0.4668],
-        [0.4043],
-        [0.5469],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5469, 0.5781, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:31,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.34 | optimizer_step: 4.37
-[2025-01-25 10:08:31,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.09 | bwd_microstep: 4609.43 | bwd_inner_microstep: 4604.55 | bwd_allreduce_microstep: 4.80 | step_microstep: 47.42
-[2025-01-25 10:08:31,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.06 | bwd: 4609.46 | bwd_inner: 4604.55 | bwd_allreduce: 4.84 | step: 47.43
-  8%|▊         | 470/5800 [1:22:01<26:43:10, 18.05s/it]                                                       {'loss': 0.1094, 'grad_norm': 10.890995979309082, 'learning_rate': 3.9727419530020086e-05, 'epoch': 4.05}
-  8%|▊         | 470/5800 [1:22:01<26:43:10, 18.05s/it]score1 tensor([[0.3887],
-        [0.4453],
-        [0.5234],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4824, 0.5781, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:38,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:08:38,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.39 | bwd_microstep: 4614.56 | bwd_inner_microstep: 4609.69 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.05
-[2025-01-25 10:08:38,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.34 | bwd: 4614.59 | bwd_inner: 4609.69 | bwd_allreduce: 4.83 | step: 43.06
-  8%|▊         | 471/5800 [1:22:08<21:45:19, 14.70s/it]                                                       {'loss': 0.0596, 'grad_norm': 10.969101905822754, 'learning_rate': 3.972557888952052e-05, 'epoch': 4.06}
-  8%|▊         | 471/5800 [1:22:08<21:45:19, 14.70s/it]score1 tensor([[0.5039],
-        [0.4434],
-        [0.5117],
-        [0.3047]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4453, 0.4727, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:45,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 10:08:45,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.43 | bwd_microstep: 4609.90 | bwd_inner_microstep: 4605.42 | bwd_allreduce_microstep: 4.41 | step_microstep: 43.08
-[2025-01-25 10:08:45,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.39 | bwd: 4609.92 | bwd_inner: 4605.42 | bwd_allreduce: 4.44 | step: 43.09
-  8%|▊         | 472/5800 [1:22:15<18:16:54, 12.35s/it]                                                       {'loss': 0.0391, 'grad_norm': 5.172242164611816, 'learning_rate': 3.9723732098243186e-05, 'epoch': 4.07}
-  8%|▊         | 472/5800 [1:22:15<18:16:54, 12.35s/it]score1 tensor([[0.5625],
-        [0.5781],
-        [0.1729],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5430, 0.1787, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:52,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 10:08:52,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.96 | bwd_microstep: 4623.29 | bwd_inner_microstep: 4618.17 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.37
-[2025-01-25 10:08:52,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.93 | bwd: 4623.32 | bwd_inner: 4618.17 | bwd_allreduce: 5.08 | step: 44.38
-  8%|▊         | 473/5800 [1:22:22<15:51:17, 10.71s/it]                                                       {'loss': 0.0508, 'grad_norm': 4.29999303817749, 'learning_rate': 3.972187915676395e-05, 'epoch': 4.08}
-  8%|▊         | 473/5800 [1:22:22<15:51:17, 10.71s/it]score1 tensor([[0.6406],
-        [0.5742],
-        [0.6367],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3672, 0.6875, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1196, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:08:59,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:08:59,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.02 | bwd_microstep: 4622.46 | bwd_inner_microstep: 4616.80 | bwd_allreduce_microstep: 5.53 | step_microstep: 43.04
-[2025-01-25 10:08:59,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.98 | bwd: 4622.49 | bwd_inner: 4616.79 | bwd_allreduce: 5.60 | step: 43.05
-  8%|▊         | 474/5800 [1:22:29<14:09:19,  9.57s/it]                                                       {'loss': 0.1196, 'grad_norm': 5.632753372192383, 'learning_rate': 3.972002006566059e-05, 'epoch': 4.09}
-  8%|▊         | 474/5800 [1:22:29<14:09:19,  9.57s/it]score1 tensor([[0.6211],
-        [0.6484],
-        [0.6211],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5742, 0.5156, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:05,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 10:09:05,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.93 | bwd_microstep: 4604.25 | bwd_inner_microstep: 4599.24 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.33
-[2025-01-25 10:09:05,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.90 | bwd: 4604.27 | bwd_inner: 4599.24 | bwd_allreduce: 4.96 | step: 43.34
-  8%|▊         | 475/5800 [1:22:35<12:57:51,  8.76s/it]                                                       {'loss': 0.0752, 'grad_norm': 11.647573471069336, 'learning_rate': 3.97181548255128e-05, 'epoch': 4.09}
-  8%|▊         | 475/5800 [1:22:35<12:57:51,  8.76s/it]score1 tensor([[0.5977],
-        [0.5703],
-        [0.6328],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.5586, 0.5312, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1025, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:12,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.37
-[2025-01-25 10:09:12,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.98 | bwd_microstep: 4617.76 | bwd_inner_microstep: 4612.46 | bwd_allreduce_microstep: 5.20 | step_microstep: 44.29
-[2025-01-25 10:09:12,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.95 | bwd: 4617.79 | bwd_inner: 4612.46 | bwd_allreduce: 5.25 | step: 44.30
-  8%|▊         | 476/5800 [1:22:42<12:08:13,  8.21s/it]                                                       {'loss': 0.1025, 'grad_norm': 11.447088241577148, 'learning_rate': 3.9716283436902194e-05, 'epoch': 4.1}
-  8%|▊         | 476/5800 [1:22:42<12:08:13,  8.21s/it]score1 tensor([[0.5508],
-        [0.5664],
-        [0.5664],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4512, 0.4785, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:19,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:09:19,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.43 | bwd_microstep: 4609.87 | bwd_inner_microstep: 4604.17 | bwd_allreduce_microstep: 5.58 | step_microstep: 44.02
-[2025-01-25 10:09:19,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.40 | bwd: 4609.89 | bwd_inner: 4604.17 | bwd_allreduce: 5.64 | step: 44.02
-  8%|▊         | 477/5800 [1:22:49<11:32:48,  7.81s/it]                                                       {'loss': 0.0659, 'grad_norm': 11.259836196899414, 'learning_rate': 3.97144059004123e-05, 'epoch': 4.11}
-  8%|▊         | 477/5800 [1:22:49<11:32:48,  7.81s/it]score1 tensor([[0.5000],
-        [0.4844],
-        [0.5039],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4199, 0.4980, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:26,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 10:09:26,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.31 | bwd_microstep: 4613.71 | bwd_inner_microstep: 4608.13 | bwd_allreduce_microstep: 5.41 | step_microstep: 44.69
-[2025-01-25 10:09:26,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.26 | bwd: 4613.74 | bwd_inner: 4608.13 | bwd_allreduce: 5.51 | step: 44.70
-  8%|▊         | 478/5800 [1:22:56<11:08:16,  7.53s/it]                                                       {'loss': 0.0356, 'grad_norm': 0.3157023787498474, 'learning_rate': 3.971252221662858e-05, 'epoch': 4.12}
-  8%|▊         | 478/5800 [1:22:56<11:08:16,  7.53s/it]score1 tensor([[0.4375],
-        [0.5078],
-        [0.4453],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.6406, 0.3926, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:33,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:09:33,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.86 | bwd_microstep: 4614.79 | bwd_inner_microstep: 4609.63 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.61
-[2025-01-25 10:09:33,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.82 | bwd: 4614.82 | bwd_inner: 4609.63 | bwd_allreduce: 5.10 | step: 42.62
-  8%|▊         | 479/5800 [1:23:03<10:51:10,  7.34s/it]                                                       {'loss': 0.0532, 'grad_norm': 5.186888694763184, 'learning_rate': 3.9710632386138393e-05, 'epoch': 4.13}
-  8%|▊         | 479/5800 [1:23:03<10:51:10,  7.34s/it]score1 tensor([[0.4395],
-        [0.3906],
-        [0.4824],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4180, 0.6367, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:40,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 10:09:40,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.22 | bwd_microstep: 4619.39 | bwd_inner_microstep: 4614.73 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.11
-[2025-01-25 10:09:40,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.18 | bwd: 4619.41 | bwd_inner: 4614.73 | bwd_allreduce: 4.62 | step: 42.11
-  8%|▊         | 480/5800 [1:23:10<10:39:19,  7.21s/it]                                                       {'loss': 0.084, 'grad_norm': 10.396432876586914, 'learning_rate': 3.970873640953101e-05, 'epoch': 4.14}
-  8%|▊         | 480/5800 [1:23:10<10:39:19,  7.21s/it]score1 tensor([[0.3945],
-        [0.4023],
-        [0.3242],
-        [0.3574]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4453, 0.3398, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:47,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 10:09:47,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.97 | bwd_microstep: 4621.00 | bwd_inner_microstep: 4616.08 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.19
-[2025-01-25 10:09:47,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.92 | bwd: 4621.03 | bwd_inner: 4616.08 | bwd_allreduce: 4.88 | step: 42.20
-  8%|▊         | 481/5800 [1:23:17<10:31:09,  7.12s/it]                                                       {'loss': 0.0425, 'grad_norm': 10.097538948059082, 'learning_rate': 3.970683428739764e-05, 'epoch': 4.15}
-  8%|▊         | 481/5800 [1:23:17<10:31:09,  7.12s/it]score1 tensor([[0.3555],
-        [0.3906],
-        [0.3789],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4863, 0.5117, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:09:54,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 10:09:54,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.77 | bwd_microstep: 4625.12 | bwd_inner_microstep: 4620.23 | bwd_allreduce_microstep: 4.78 | step_microstep: 45.34
-[2025-01-25 10:09:54,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.73 | bwd: 4625.15 | bwd_inner: 4620.23 | bwd_allreduce: 4.84 | step: 45.35
-  8%|▊         | 482/5800 [1:23:24<10:25:22,  7.06s/it]                                                       {'loss': 0.127, 'grad_norm': 10.070008277893066, 'learning_rate': 3.9704926020331404e-05, 'epoch': 4.16}
-  8%|▊         | 482/5800 [1:23:24<10:25:22,  7.06s/it]score1 tensor([[0.3711],
-        [0.3691],
-        [0.3516],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.4844, 0.4980, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:01,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 10:10:01,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.54 | bwd_microstep: 4618.13 | bwd_inner_microstep: 4613.37 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.07
-[2025-01-25 10:10:01,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.51 | bwd: 4618.15 | bwd_inner: 4613.37 | bwd_allreduce: 4.71 | step: 42.08
-  8%|▊         | 483/5800 [1:23:31<10:20:55,  7.01s/it]                                                       {'loss': 0.1289, 'grad_norm': 10.079607963562012, 'learning_rate': 3.9703011608927316e-05, 'epoch': 4.16}
-  8%|▊         | 483/5800 [1:23:31<10:20:55,  7.01s/it]score1 tensor([[0.4102],
-        [0.3887],
-        [0.4395],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4512, 0.4863, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:08,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.99 | optimizer_step: 4.37
-[2025-01-25 10:10:08,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.86 | bwd_microstep: 4628.72 | bwd_inner_microstep: 4623.22 | bwd_allreduce_microstep: 5.39 | step_microstep: 55.22
-[2025-01-25 10:10:08,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.82 | bwd: 4628.75 | bwd_inner: 4623.22 | bwd_allreduce: 5.44 | step: 55.23
-  8%|▊         | 484/5800 [1:23:38<10:18:40,  6.98s/it]                                                       {'loss': 0.0605, 'grad_norm': 10.265423774719238, 'learning_rate': 3.970109105378234e-05, 'epoch': 4.17}
-  8%|▊         | 484/5800 [1:23:38<10:18:40,  6.98s/it]score1 tensor([[0.5000],
-        [0.4883],
-        [0.5195],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5391, 0.6055, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:14,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 10:10:14,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.63 | bwd_microstep: 4622.74 | bwd_inner_microstep: 4617.98 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.15
-[2025-01-25 10:10:14,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.60 | bwd: 4622.76 | bwd_inner: 4617.98 | bwd_allreduce: 4.71 | step: 42.15
-  8%|▊         | 485/5800 [1:23:44<10:16:51,  6.96s/it]                                                       {'loss': 0.0601, 'grad_norm': 10.659721374511719, 'learning_rate': 3.969916435549532e-05, 'epoch': 4.18}
-  8%|▊         | 485/5800 [1:23:44<10:16:51,  6.96s/it]score1 tensor([[0.5117],
-        [0.5234],
-        [0.5430],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.4629, 0.6289, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0815, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:21,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 10:10:21,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.82 | bwd_microstep: 4620.86 | bwd_inner_microstep: 4615.90 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.08
-[2025-01-25 10:10:21,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.79 | bwd: 4620.88 | bwd_inner: 4615.90 | bwd_allreduce: 4.92 | step: 44.10
-  8%|▊         | 486/5800 [1:23:51<10:14:58,  6.94s/it]                                                       {'loss': 0.0815, 'grad_norm': 5.212292671203613, 'learning_rate': 3.9697231514667046e-05, 'epoch': 4.19}
-  8%|▊         | 486/5800 [1:23:51<10:14:58,  6.94s/it]score1 tensor([[0.5781],
-        [0.5625],
-        [0.6172],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4531, 0.5195, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0972, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:28,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 10:10:28,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.65 | bwd_microstep: 4628.54 | bwd_inner_microstep: 4623.86 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.37
-[2025-01-25 10:10:28,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.61 | bwd: 4628.56 | bwd_inner: 4623.86 | bwd_allreduce: 4.64 | step: 42.37
-  8%|▊         | 487/5800 [1:23:58<10:14:08,  6.94s/it]                                                       {'loss': 0.0972, 'grad_norm': 11.022942543029785, 'learning_rate': 3.969529253190021e-05, 'epoch': 4.2}
-  8%|▊         | 487/5800 [1:23:58<10:14:08,  6.94s/it]score1 tensor([[0.5898],
-        [0.6836],
-        [0.5469],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5742, 0.3457, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:35,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 10:10:35,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.40 | bwd_microstep: 4620.62 | bwd_inner_microstep: 4615.40 | bwd_allreduce_microstep: 5.12 | step_microstep: 44.02
-[2025-01-25 10:10:35,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.35 | bwd: 4620.65 | bwd_inner: 4615.40 | bwd_allreduce: 5.17 | step: 44.02
-  8%|▊         | 488/5800 [1:24:05<10:13:11,  6.93s/it]                                                       {'loss': 0.1523, 'grad_norm': 11.093964576721191, 'learning_rate': 3.969334740779942e-05, 'epoch': 4.21}
-  8%|▊         | 488/5800 [1:24:05<10:13:11,  6.93s/it]score1 tensor([[0.6523],
-        [0.5977],
-        [0.6016],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4492, 0.4863, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0933, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:42,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 10:10:42,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.69 | bwd_microstep: 4625.29 | bwd_inner_microstep: 4620.32 | bwd_allreduce_microstep: 4.83 | step_microstep: 41.65
-[2025-01-25 10:10:42,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4625.32 | bwd_inner: 4620.32 | bwd_allreduce: 4.88 | step: 41.65
-  8%|▊         | 489/5800 [1:24:12<10:12:36,  6.92s/it]                                                       {'loss': 0.0933, 'grad_norm': 11.066105842590332, 'learning_rate': 3.96913961429712e-05, 'epoch': 4.22}
-  8%|▊         | 489/5800 [1:24:12<10:12:36,  6.92s/it]score1 tensor([[0.5469],
-        [0.5938],
-        [0.5625],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4023, 0.4941, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0962, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:49,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 10:10:49,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.89 | bwd_microstep: 4618.92 | bwd_inner_microstep: 4613.82 | bwd_allreduce_microstep: 5.02 | step_microstep: 42.11
-[2025-01-25 10:10:49,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.86 | bwd: 4618.95 | bwd_inner: 4613.82 | bwd_allreduce: 5.06 | step: 42.11
-  8%|▊         | 490/5800 [1:24:19<10:12:06,  6.92s/it]                                                       {'loss': 0.0962, 'grad_norm': 10.643426895141602, 'learning_rate': 3.9689438738023985e-05, 'epoch': 4.22}
-  8%|▊         | 490/5800 [1:24:19<10:12:06,  6.92s/it]score1 tensor([[0.5039],
-        [0.5156],
-        [0.5156],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.3711, 0.4141, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0928, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:10:56,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 10:10:56,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.60 | bwd_microstep: 4618.87 | bwd_inner_microstep: 4613.60 | bwd_allreduce_microstep: 5.16 | step_microstep: 50.91
-[2025-01-25 10:10:56,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.56 | bwd: 4618.89 | bwd_inner: 4613.59 | bwd_allreduce: 5.22 | step: 50.93
-  8%|▊         | 491/5800 [1:24:26<10:11:46,  6.91s/it]                                                       {'loss': 0.0928, 'grad_norm': 5.018006801605225, 'learning_rate': 3.968747519356814e-05, 'epoch': 4.23}
-  8%|▊         | 491/5800 [1:24:26<10:11:46,  6.91s/it]score1 tensor([[0.5078],
-        [0.4824],
-        [0.4746],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4297, 0.4043, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:03,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:11:03,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.40 | bwd_microstep: 4627.54 | bwd_inner_microstep: 4622.88 | bwd_allreduce_microstep: 4.56 | step_microstep: 48.99
-[2025-01-25 10:11:03,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.37 | bwd: 4627.57 | bwd_inner: 4622.88 | bwd_allreduce: 4.60 | step: 49.00
-  8%|▊         | 492/5800 [1:24:33<10:11:44,  6.92s/it]                                                       {'loss': 0.0532, 'grad_norm': 0.49794214963912964, 'learning_rate': 3.9685505510215905e-05, 'epoch': 4.24}
-  8%|▊         | 492/5800 [1:24:33<10:11:44,  6.92s/it]score1 tensor([[0.4297],
-        [0.4375],
-        [0.4238],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4727, 0.4980, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0869, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:10,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 10:11:10,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.03 | bwd_microstep: 4624.99 | bwd_inner_microstep: 4619.59 | bwd_allreduce_microstep: 5.31 | step_microstep: 43.58
-[2025-01-25 10:11:10,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.98 | bwd: 4625.01 | bwd_inner: 4619.59 | bwd_allreduce: 5.36 | step: 43.58
-  8%|▊         | 493/5800 [1:24:40<10:11:36,  6.91s/it]                                                       {'loss': 0.0869, 'grad_norm': 9.942627906799316, 'learning_rate': 3.968352968858149e-05, 'epoch': 4.25}
-  8%|▊         | 493/5800 [1:24:40<10:11:36,  6.91s/it]score1 tensor([[0.4961],
-        [0.3867],
-        [0.3633],
-        [0.3594]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4512, 0.4043, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0806, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:17,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.37
-[2025-01-25 10:11:17,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.99 | bwd_microstep: 4624.54 | bwd_inner_microstep: 4619.88 | bwd_allreduce_microstep: 4.59 | step_microstep: 38.11
-[2025-01-25 10:11:17,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.94 | bwd: 4624.57 | bwd_inner: 4619.88 | bwd_allreduce: 4.63 | step: 38.11
-  9%|▊         | 494/5800 [1:24:47<10:11:03,  6.91s/it]                                                       {'loss': 0.0806, 'grad_norm': 9.813652038574219, 'learning_rate': 3.968154772928097e-05, 'epoch': 4.26}
-  9%|▊         | 494/5800 [1:24:47<10:11:03,  6.91s/it]score1 tensor([[0.3828],
-        [0.4004],
-        [0.3867],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5781, 0.4160, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:24,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 10:11:24,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.71 | bwd_microstep: 4623.35 | bwd_inner_microstep: 4618.15 | bwd_allreduce_microstep: 5.11 | step_microstep: 43.91
-[2025-01-25 10:11:24,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.68 | bwd: 4623.38 | bwd_inner: 4618.15 | bwd_allreduce: 5.16 | step: 43.91
-  9%|▊         | 495/5800 [1:24:54<10:10:48,  6.91s/it]                                                       {'loss': 0.0776, 'grad_norm': 9.476706504821777, 'learning_rate': 3.967955963293237e-05, 'epoch': 4.27}
-  9%|▊         | 495/5800 [1:24:54<10:10:48,  6.91s/it]score1 tensor([[0.4062],
-        [0.3984],
-        [0.4668],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5078, 0.4961, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:30,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 10:11:30,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.57 | bwd_microstep: 4622.47 | bwd_inner_microstep: 4616.69 | bwd_allreduce_microstep: 5.64 | step_microstep: 45.64
-[2025-01-25 10:11:30,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.53 | bwd: 4622.50 | bwd_inner: 4616.69 | bwd_allreduce: 5.69 | step: 45.64
-  9%|▊         | 496/5800 [1:25:00<10:10:28,  6.91s/it]                                                       {'loss': 0.041, 'grad_norm': 9.683473587036133, 'learning_rate': 3.967756540015561e-05, 'epoch': 4.28}
-  9%|▊         | 496/5800 [1:25:00<10:10:28,  6.91s/it]score1 tensor([[0.4668],
-        [0.5000],
-        [0.4355],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.6094, 0.3750, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:37,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 10:11:37,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4625.20 | bwd_inner_microstep: 4620.52 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.53
-[2025-01-25 10:11:37,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.67 | bwd: 4625.22 | bwd_inner: 4620.52 | bwd_allreduce: 4.64 | step: 42.54
-  9%|▊         | 497/5800 [1:25:07<10:10:29,  6.91s/it]                                                       {'loss': 0.0562, 'grad_norm': 0.5986563563346863, 'learning_rate': 3.9675565031572524e-05, 'epoch': 4.28}
-  9%|▊         | 497/5800 [1:25:07<10:10:29,  6.91s/it]score1 tensor([[0.4863],
-        [0.5898],
-        [0.5664],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5938, 0.7031, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0640, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:44,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 10:11:44,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4630.06 | bwd_inner_microstep: 4624.96 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.71
-[2025-01-25 10:11:44,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4630.09 | bwd_inner: 4624.96 | bwd_allreduce: 5.06 | step: 44.71
-  9%|▊         | 498/5800 [1:25:14<10:10:30,  6.91s/it]                                                       {'loss': 0.064, 'grad_norm': 5.464024543762207, 'learning_rate': 3.967355852780685e-05, 'epoch': 4.29}
-  9%|▊         | 498/5800 [1:25:14<10:10:30,  6.91s/it]score1 tensor([[0.5352],
-        [0.5195],
-        [0.5547],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4570, 0.5273, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0576, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:51,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 10:11:51,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.03 | bwd_microstep: 4619.74 | bwd_inner_microstep: 4615.24 | bwd_allreduce_microstep: 4.42 | step_microstep: 51.29
-[2025-01-25 10:11:51,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.99 | bwd: 4619.76 | bwd_inner: 4615.24 | bwd_allreduce: 4.46 | step: 51.30
-  9%|▊         | 499/5800 [1:25:21<10:14:54,  6.96s/it]                                                       {'loss': 0.0576, 'grad_norm': 10.320480346679688, 'learning_rate': 3.9671545889484264e-05, 'epoch': 4.3}
-  9%|▊         | 499/5800 [1:25:21<10:14:54,  6.96s/it]score1 tensor([[0.5430],
-        [0.6797],
-        [0.5664],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.6250, 0.4551, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0669, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:11:58,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 10:11:58,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4633.63 | bwd_inner_microstep: 4628.30 | bwd_allreduce_microstep: 5.21 | step_microstep: 57.53
-[2025-01-25 10:11:58,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4633.66 | bwd_inner: 4628.30 | bwd_allreduce: 5.27 | step: 57.58
-  9%|▊         | 500/5800 [1:25:28<10:14:27,  6.96s/it]                                                       {'loss': 0.0669, 'grad_norm': 10.663252830505371, 'learning_rate': 3.9669527117232346e-05, 'epoch': 4.31}
-  9%|▊         | 500/5800 [1:25:28<10:14:27,  6.96s/it]score1 tensor([[0.5508],
-        [0.5312],
-        [0.6953],
-        [0.6875]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3730, 0.6562, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:05,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 10:12:05,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.56 | bwd_microstep: 4627.46 | bwd_inner_microstep: 4622.17 | bwd_allreduce_microstep: 5.21 | step_microstep: 46.44
-[2025-01-25 10:12:05,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.46 | bwd: 4627.49 | bwd_inner: 4622.17 | bwd_allreduce: 5.25 | step: 46.45
-  9%|▊         | 501/5800 [1:25:35<10:13:56,  6.95s/it]                                                       {'loss': 0.0845, 'grad_norm': 10.935404777526855, 'learning_rate': 3.9667502211680565e-05, 'epoch': 4.32}
-  9%|▊         | 501/5800 [1:25:35<10:13:56,  6.95s/it]score1 tensor([[0.5430],
-        [0.6914],
-        [0.5859],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.6133, 0.5664, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0786, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:12,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 10:12:12,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.74 | bwd_microstep: 4621.29 | bwd_inner_microstep: 4616.17 | bwd_allreduce_microstep: 5.04 | step_microstep: 50.39
-[2025-01-25 10:12:12,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.68 | bwd: 4621.32 | bwd_inner: 4616.17 | bwd_allreduce: 5.08 | step: 50.39
-  9%|▊         | 502/5800 [1:25:42<10:13:15,  6.95s/it]                                                       {'loss': 0.0786, 'grad_norm': 10.693560600280762, 'learning_rate': 3.966547117346035e-05, 'epoch': 4.33}
-  9%|▊         | 502/5800 [1:25:42<10:13:15,  6.95s/it]score1 tensor([[0.5781],
-        [0.5234],
-        [0.5352],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4648, 0.5273, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:19,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 10:12:19,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.20 | bwd_microstep: 4643.57 | bwd_inner_microstep: 4638.63 | bwd_allreduce_microstep: 4.86 | step_microstep: 53.18
-[2025-01-25 10:12:19,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.16 | bwd: 4643.59 | bwd_inner: 4638.63 | bwd_allreduce: 4.90 | step: 53.19
-  9%|▊         | 503/5800 [1:25:49<10:13:06,  6.94s/it]                                                       {'loss': 0.0264, 'grad_norm': 5.3781304359436035, 'learning_rate': 3.966343400320498e-05, 'epoch': 4.34}
-  9%|▊         | 503/5800 [1:25:49<10:13:06,  6.94s/it]score1 tensor([[0.5156],
-        [0.4531],
-        [0.4785],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4844, 0.4609, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:26,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 10:12:26,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.95 | bwd_microstep: 4635.20 | bwd_inner_microstep: 4629.55 | bwd_allreduce_microstep: 5.52 | step_microstep: 54.97
-[2025-01-25 10:12:26,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.90 | bwd: 4635.23 | bwd_inner: 4629.56 | bwd_allreduce: 5.59 | step: 54.99
-  9%|▊         | 504/5800 [1:25:56<10:12:36,  6.94s/it]                                                       {'loss': 0.0322, 'grad_norm': 5.0356764793396, 'learning_rate': 3.966139070154971e-05, 'epoch': 4.34}
-  9%|▊         | 504/5800 [1:25:56<10:12:36,  6.94s/it]score1 tensor([[0.4453],
-        [0.3789],
-        [0.4141],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4297, 0.6602, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1157, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:33,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 10:12:33,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4628.35 | bwd_inner_microstep: 4623.27 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.19
-[2025-01-25 10:12:33,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.79 | bwd: 4628.38 | bwd_inner: 4623.27 | bwd_allreduce: 5.04 | step: 42.20
-  9%|▊         | 505/5800 [1:26:03<10:11:44,  6.93s/it]                                                       {'loss': 0.1157, 'grad_norm': 9.68897819519043, 'learning_rate': 3.965934126913166e-05, 'epoch': 4.35}
-  9%|▊         | 505/5800 [1:26:03<10:11:44,  6.93s/it]score1 tensor([[0.4199],
-        [0.5430],
-        [0.3828],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6562, 0.6406, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1455, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:40,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 10:12:40,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.45 | bwd_microstep: 4629.23 | bwd_inner_microstep: 4624.07 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.45
-[2025-01-25 10:12:40,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.40 | bwd: 4629.26 | bwd_inner: 4624.07 | bwd_allreduce: 5.13 | step: 43.45
-  9%|▊         | 506/5800 [1:26:10<10:11:23,  6.93s/it]                                                       {'loss': 0.1455, 'grad_norm': 9.861950874328613, 'learning_rate': 3.965728570658988e-05, 'epoch': 4.36}
-  9%|▊         | 506/5800 [1:26:10<10:11:23,  6.93s/it]score1 tensor([[0.4980],
-        [0.4688],
-        [0.4785],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5312, 0.6094, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1196, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:47,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:12:47,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.03 | bwd_microstep: 4627.38 | bwd_inner_microstep: 4622.64 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.22
-[2025-01-25 10:12:47,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4627.40 | bwd_inner: 4622.63 | bwd_allreduce: 4.71 | step: 42.23
-  9%|▊         | 507/5800 [1:26:17<10:10:44,  6.92s/it]                                                       {'loss': 0.1196, 'grad_norm': 10.06836223602295, 'learning_rate': 3.965522401456534e-05, 'epoch': 4.37}
-  9%|▊         | 507/5800 [1:26:17<10:10:44,  6.92s/it]score1 tensor([[0.2773],
-        [0.4004],
-        [0.3613],
-        [0.3281]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.5156, 0.4219, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0894, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:12:54,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 10:12:54,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4627.83 | bwd_inner_microstep: 4623.09 | bwd_allreduce_microstep: 4.63 | step_microstep: 41.53
-[2025-01-25 10:12:54,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4627.85 | bwd_inner: 4623.08 | bwd_allreduce: 4.69 | step: 41.54
-  9%|▉         | 508/5800 [1:26:24<10:10:17,  6.92s/it]                                                       {'loss': 0.0894, 'grad_norm': 9.358710289001465, 'learning_rate': 3.96531561937009e-05, 'epoch': 4.38}
-  9%|▉         | 508/5800 [1:26:24<10:10:17,  6.92s/it]score1 tensor([[0.4180],
-        [0.4180],
-        [0.3945],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4551, 0.4258, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0864, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:01,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 10:13:01,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.53 | bwd_microstep: 4625.13 | bwd_inner_microstep: 4620.07 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.90
-[2025-01-25 10:13:01,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.49 | bwd: 4625.16 | bwd_inner: 4620.07 | bwd_allreduce: 5.01 | step: 43.90
-  9%|▉         | 509/5800 [1:26:31<10:10:13,  6.92s/it]                                                       {'loss': 0.0864, 'grad_norm': 9.924144744873047, 'learning_rate': 3.9651082244641345e-05, 'epoch': 4.39}
-  9%|▉         | 509/5800 [1:26:31<10:10:13,  6.92s/it]score1 tensor([[0.4844],
-        [0.5586],
-        [0.3906],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.5469, 0.4844, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:08,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 10:13:08,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.87 | bwd_microstep: 4628.66 | bwd_inner_microstep: 4623.31 | bwd_allreduce_microstep: 5.27 | step_microstep: 43.06
-[2025-01-25 10:13:08,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.84 | bwd: 4628.68 | bwd_inner: 4623.31 | bwd_allreduce: 5.31 | step: 43.07
-  9%|▉         | 510/5800 [1:26:38<10:09:39,  6.91s/it]                                                       {'loss': 0.0557, 'grad_norm': 1.2613238096237183, 'learning_rate': 3.964900216803338e-05, 'epoch': 4.4}
-  9%|▉         | 510/5800 [1:26:38<10:09:39,  6.91s/it]score1 tensor([[0.5078],
-        [0.5117],
-        [0.5703],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5469, 0.6328, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:14,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 10:13:14,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.51 | bwd_microstep: 4630.84 | bwd_inner_microstep: 4625.73 | bwd_allreduce_microstep: 4.99 | step_microstep: 47.65
-[2025-01-25 10:13:14,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.48 | bwd: 4630.87 | bwd_inner: 4625.73 | bwd_allreduce: 5.05 | step: 47.66
-  9%|▉         | 511/5800 [1:26:44<10:09:49,  6.92s/it]                                                       {'loss': 0.043, 'grad_norm': 5.911367893218994, 'learning_rate': 3.964691596452559e-05, 'epoch': 4.41}
-  9%|▉         | 511/5800 [1:26:44<10:09:49,  6.92s/it]score1 tensor([[0.7070],
-        [0.5703],
-        [0.5430],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5117, 0.4492, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:21,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:13:21,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.65 | bwd_microstep: 4629.24 | bwd_inner_microstep: 4624.18 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.08
-[2025-01-25 10:13:21,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.61 | bwd: 4629.26 | bwd_inner: 4624.18 | bwd_allreduce: 5.00 | step: 44.08
-  9%|▉         | 512/5800 [1:26:51<10:09:41,  6.92s/it]                                                       {'loss': 0.0518, 'grad_norm': 12.419947624206543, 'learning_rate': 3.9644823634768496e-05, 'epoch': 4.41}
-  9%|▉         | 512/5800 [1:26:51<10:09:41,  6.92s/it]score1 tensor([[0.5156],
-        [0.5859],
-        [0.5625],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5078, 0.4688, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:28,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 10:13:28,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4628.14 | bwd_inner_microstep: 4623.34 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.13
-[2025-01-25 10:13:28,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4628.16 | bwd_inner: 4623.34 | bwd_allreduce: 4.76 | step: 43.15
-  9%|▉         | 513/5800 [1:26:58<10:09:40,  6.92s/it]                                                       {'loss': 0.0762, 'grad_norm': 12.144975662231445, 'learning_rate': 3.964272517941453e-05, 'epoch': 4.42}
-  9%|▉         | 513/5800 [1:26:58<10:09:40,  6.92s/it]score1 tensor([[0.6289],
-        [0.6133],
-        [0.5938],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5781, 0.4648, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1060, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:35,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 10:13:35,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.47 | bwd_microstep: 4621.69 | bwd_inner_microstep: 4616.39 | bwd_allreduce_microstep: 5.18 | step_microstep: 45.97
-[2025-01-25 10:13:35,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.41 | bwd: 4621.71 | bwd_inner: 4616.39 | bwd_allreduce: 5.24 | step: 45.97
-  9%|▉         | 514/5800 [1:27:05<10:09:31,  6.92s/it]                                                       {'loss': 0.106, 'grad_norm': 11.581771850585938, 'learning_rate': 3.964062059911802e-05, 'epoch': 4.43}
-  9%|▉         | 514/5800 [1:27:05<10:09:31,  6.92s/it]score1 tensor([[0.5820],
-        [0.5312],
-        [0.5391],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5664, 0.3809, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:42,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:13:42,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.14 | bwd_microstep: 4576.86 | bwd_inner_microstep: 4572.17 | bwd_allreduce_microstep: 4.62 | step_microstep: 39.51
-[2025-01-25 10:13:42,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.11 | bwd: 4576.88 | bwd_inner: 4572.17 | bwd_allreduce: 4.65 | step: 39.52
-  9%|▉         | 515/5800 [1:27:12<10:08:04,  6.90s/it]                                                       {'loss': 0.0649, 'grad_norm': 2.687469244003296, 'learning_rate': 3.963850989453522e-05, 'epoch': 4.44}
-  9%|▉         | 515/5800 [1:27:12<10:08:04,  6.90s/it]score1 tensor([[0.4961],
-        [0.5508],
-        [0.5391],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5508, 0.4707, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:49,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 10:13:49,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.77 | bwd_microstep: 4574.45 | bwd_inner_microstep: 4569.61 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.53
-[2025-01-25 10:13:49,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.73 | bwd: 4574.49 | bwd_inner: 4569.61 | bwd_allreduce: 4.80 | step: 42.53
-  9%|▉         | 516/5800 [1:27:19<10:06:55,  6.89s/it]                                                       {'loss': 0.0273, 'grad_norm': 2.575587272644043, 'learning_rate': 3.963639306632427e-05, 'epoch': 4.45}
-  9%|▉         | 516/5800 [1:27:19<10:06:55,  6.89s/it]score1 tensor([[0.4023],
-        [0.4785],
-        [0.4785],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4473, 0.4277, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:13:56,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 10:13:56,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.88 | bwd_microstep: 4629.24 | bwd_inner_microstep: 4623.98 | bwd_allreduce_microstep: 5.15 | step_microstep: 44.66
-[2025-01-25 10:13:56,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.85 | bwd: 4629.27 | bwd_inner: 4623.98 | bwd_allreduce: 5.20 | step: 44.66
-  9%|▉         | 517/5800 [1:27:26<10:07:37,  6.90s/it]                                                       {'loss': 0.0347, 'grad_norm': 4.7575860023498535, 'learning_rate': 3.9634270115145244e-05, 'epoch': 4.46}
-  9%|▉         | 517/5800 [1:27:26<10:07:37,  6.90s/it]score1 tensor([[0.3711],
-        [0.3926],
-        [0.3887],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4941, 0.4551, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:03,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 10:14:03,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4623.02 | bwd_inner_microstep: 4618.19 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.67
-[2025-01-25 10:14:03,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.60 | bwd: 4623.04 | bwd_inner: 4618.19 | bwd_allreduce: 4.79 | step: 41.67
-  9%|▉         | 518/5800 [1:27:33<10:07:55,  6.91s/it]                                                       {'loss': 0.061, 'grad_norm': 9.162281036376953, 'learning_rate': 3.963214104166011e-05, 'epoch': 4.47}
-  9%|▉         | 518/5800 [1:27:33<10:07:55,  6.91s/it]score1 tensor([[0.4023],
-        [0.4023],
-        [0.3594],
-        [0.3047]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6211, 0.4961, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:10,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:14:10,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.93 | bwd_microstep: 4622.52 | bwd_inner_microstep: 4617.50 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.25
-[2025-01-25 10:14:10,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4622.55 | bwd_inner: 4617.50 | bwd_allreduce: 4.98 | step: 45.26
-  9%|▉         | 519/5800 [1:27:40<10:08:04,  6.91s/it]                                                       {'loss': 0.1445, 'grad_norm': 8.974865913391113, 'learning_rate': 3.963000584653275e-05, 'epoch': 4.47}
-  9%|▉         | 519/5800 [1:27:40<10:08:04,  6.91s/it]score1 tensor([[0.3125],
-        [0.4629],
-        [0.3848],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.6367, 0.5195, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:17,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 10:14:17,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.45 | bwd_microstep: 4627.33 | bwd_inner_microstep: 4622.34 | bwd_allreduce_microstep: 4.89 | step_microstep: 47.56
-[2025-01-25 10:14:17,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.39 | bwd: 4627.35 | bwd_inner: 4622.34 | bwd_allreduce: 4.94 | step: 47.58
-  9%|▉         | 520/5800 [1:27:47<10:08:09,  6.91s/it]                                                       {'loss': 0.168, 'grad_norm': 9.029157638549805, 'learning_rate': 3.962786453042896e-05, 'epoch': 4.48}
-  9%|▉         | 520/5800 [1:27:47<10:08:09,  6.91s/it]score1 tensor([[0.3711],
-        [0.3613],
-        [0.3379],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4082, 0.4648, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0981, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:24,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 10:14:24,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.84 | bwd_microstep: 4625.82 | bwd_inner_microstep: 4620.78 | bwd_allreduce_microstep: 4.93 | step_microstep: 46.05
-[2025-01-25 10:14:24,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.80 | bwd: 4625.84 | bwd_inner: 4620.78 | bwd_allreduce: 4.99 | step: 46.05
-  9%|▉         | 521/5800 [1:27:54<10:08:11,  6.91s/it]                                                       {'loss': 0.0981, 'grad_norm': 8.759740829467773, 'learning_rate': 3.962571709401643e-05, 'epoch': 4.49}
-  9%|▉         | 521/5800 [1:27:54<10:08:11,  6.91s/it]score1 tensor([[0.3457],
-        [0.3613],
-        [0.3828],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.4414, 0.5000, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0815, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:30,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 10:14:30,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.76 | bwd_microstep: 4568.95 | bwd_inner_microstep: 4561.61 | bwd_allreduce_microstep: 7.23 | step_microstep: 45.40
-[2025-01-25 10:14:30,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.72 | bwd: 4568.98 | bwd_inner: 4561.61 | bwd_allreduce: 7.29 | step: 45.41
-  9%|▉         | 522/5800 [1:28:00<10:06:41,  6.90s/it]                                                       {'loss': 0.0815, 'grad_norm': 6.617953777313232, 'learning_rate': 3.9623563537964784e-05, 'epoch': 4.5}
-  9%|▉         | 522/5800 [1:28:00<10:06:41,  6.90s/it]score1 tensor([[0.4883],
-        [0.4941],
-        [0.4199],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5508, 0.4766, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:37,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 9.14 | optimizer_step: 4.36
-[2025-01-25 10:14:37,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4621.81 | bwd_inner_microstep: 4616.87 | bwd_allreduce_microstep: 4.84 | step_microstep: 54.38
-[2025-01-25 10:14:37,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.16 | bwd: 4621.84 | bwd_inner: 4616.86 | bwd_allreduce: 4.90 | step: 54.38
-  9%|▉         | 523/5800 [1:28:07<10:07:02,  6.90s/it]                                                       {'loss': 0.084, 'grad_norm': 9.39023208618164, 'learning_rate': 3.9621403862945526e-05, 'epoch': 4.51}
-  9%|▉         | 523/5800 [1:28:07<10:07:02,  6.90s/it]score1 tensor([[0.4941],
-        [0.4863],
-        [0.5117],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4434, 0.5117, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:44,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 10:14:44,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.94 | bwd_microstep: 4543.55 | bwd_inner_microstep: 4539.72 | bwd_allreduce_microstep: 3.77 | step_microstep: 42.61
-[2025-01-25 10:14:44,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4543.57 | bwd_inner: 4539.72 | bwd_allreduce: 3.80 | step: 42.62
-  9%|▉         | 524/5800 [1:28:14<10:05:02,  6.88s/it]                                                       {'loss': 0.0117, 'grad_norm': 4.617427349090576, 'learning_rate': 3.9619238069632084e-05, 'epoch': 4.52}
-  9%|▉         | 524/5800 [1:28:14<10:05:02,  6.88s/it]score1 tensor([[0.5664],
-        [0.5781],
-        [0.5547],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6445, 0.5664, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:51,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 10:14:51,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.49 | bwd_microstep: 4627.50 | bwd_inner_microstep: 4622.63 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.31
-[2025-01-25 10:14:51,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.45 | bwd: 4627.53 | bwd_inner: 4622.63 | bwd_allreduce: 4.83 | step: 43.32
-  9%|▉         | 525/5800 [1:28:21<10:05:41,  6.89s/it]                                                       {'loss': 0.0469, 'grad_norm': 4.959676742553711, 'learning_rate': 3.961706615869978e-05, 'epoch': 4.53}
-  9%|▉         | 525/5800 [1:28:21<10:05:41,  6.89s/it]score1 tensor([[0.5977],
-        [0.5938],
-        [0.6094],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4375, 0.4941, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1143, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:14:58,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:14:58,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.11 | bwd_microstep: 4618.52 | bwd_inner_microstep: 4613.87 | bwd_allreduce_microstep: 4.53 | step_microstep: 43.27
-[2025-01-25 10:14:58,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.08 | bwd: 4618.54 | bwd_inner: 4613.87 | bwd_allreduce: 4.60 | step: 43.28
-  9%|▉         | 526/5800 [1:28:28<10:06:08,  6.90s/it]                                                       {'loss': 0.1143, 'grad_norm': 10.18409538269043, 'learning_rate': 3.9614888130825865e-05, 'epoch': 4.53}
-  9%|▉         | 526/5800 [1:28:28<10:06:08,  6.90s/it]score1 tensor([[0.6328],
-        [0.6484],
-        [0.5977],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4199, 0.4883, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1836, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:05,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 10:15:05,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.30 | bwd_microstep: 4631.52 | bwd_inner_microstep: 4626.48 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.47
-[2025-01-25 10:15:05,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.27 | bwd: 4631.54 | bwd_inner: 4626.48 | bwd_allreduce: 4.99 | step: 42.48
-  9%|▉         | 527/5800 [1:28:35<10:06:49,  6.90s/it]                                                       {'loss': 0.1836, 'grad_norm': 10.196394920349121, 'learning_rate': 3.9612703986689486e-05, 'epoch': 4.54}
-  9%|▉         | 527/5800 [1:28:35<10:06:49,  6.90s/it]score1 tensor([[0.6484],
-        [0.5664],
-        [0.6094],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.3418, 0.4336, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1748, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:12,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:15:12,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.43 | bwd_microstep: 4624.65 | bwd_inner_microstep: 4620.10 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.22
-[2025-01-25 10:15:12,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.37 | bwd: 4624.68 | bwd_inner: 4620.10 | bwd_allreduce: 4.50 | step: 41.23
-  9%|▉         | 528/5800 [1:28:42<10:06:32,  6.90s/it]                                                       {'loss': 0.1748, 'grad_norm': 9.996057510375977, 'learning_rate': 3.96105137269717e-05, 'epoch': 4.55}
-  9%|▉         | 528/5800 [1:28:42<10:06:32,  6.90s/it]score1 tensor([[0.6328],
-        [0.6094],
-        [0.6211],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5000, 0.5625, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0830, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:19,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 10:15:19,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.81 | bwd_microstep: 4622.19 | bwd_inner_microstep: 4617.48 | bwd_allreduce_microstep: 4.63 | step_microstep: 46.55
-[2025-01-25 10:15:19,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.77 | bwd: 4622.22 | bwd_inner: 4617.48 | bwd_allreduce: 4.67 | step: 46.56
-  9%|▉         | 529/5800 [1:28:49<10:06:28,  6.90s/it]                                                       {'loss': 0.083, 'grad_norm': 5.0260491371154785, 'learning_rate': 3.960831735235545e-05, 'epoch': 4.56}
-  9%|▉         | 529/5800 [1:28:49<10:06:28,  6.90s/it]score1 tensor([[0.6250],
-        [0.5742],
-        [0.6328],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5039, 0.6172, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:26,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 10:15:26,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.79 | bwd_microstep: 4626.14 | bwd_inner_microstep: 4621.37 | bwd_allreduce_microstep: 4.69 | step_microstep: 44.04
-[2025-01-25 10:15:26,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.75 | bwd: 4626.16 | bwd_inner: 4621.37 | bwd_allreduce: 4.73 | step: 44.05
-  9%|▉         | 530/5800 [1:28:56<10:06:48,  6.91s/it]                                                       {'loss': 0.0234, 'grad_norm': 0.46978411078453064, 'learning_rate': 3.960611486352562e-05, 'epoch': 4.57}
-  9%|▉         | 530/5800 [1:28:56<10:06:48,  6.91s/it]score1 tensor([[0.6016],
-        [0.5469],
-        [0.5508],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4414, 0.4180, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:33,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 10:15:33,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4622.38 | bwd_inner_microstep: 4617.81 | bwd_allreduce_microstep: 4.46 | step_microstep: 41.20
-[2025-01-25 10:15:33,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.15 | bwd: 4622.42 | bwd_inner: 4617.81 | bwd_allreduce: 4.51 | step: 41.20
-  9%|▉         | 531/5800 [1:29:03<10:06:41,  6.91s/it]                                                       {'loss': 0.0918, 'grad_norm': 9.650640487670898, 'learning_rate': 3.960390626116898e-05, 'epoch': 4.58}
-  9%|▉         | 531/5800 [1:29:03<10:06:41,  6.91s/it]score1 tensor([[0.5312],
-        [0.4785],
-        [0.5234],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.3340, 0.4512, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0747, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:39,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 10:15:39,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4615.50 | bwd_inner_microstep: 4610.35 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.52
-[2025-01-25 10:15:39,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4615.53 | bwd_inner: 4610.35 | bwd_allreduce: 5.10 | step: 44.53
-  9%|▉         | 532/5800 [1:29:09<10:06:22,  6.91s/it]                                                       {'loss': 0.0747, 'grad_norm': 4.596861362457275, 'learning_rate': 3.960169154597421e-05, 'epoch': 4.59}
-  9%|▉         | 532/5800 [1:29:09<10:06:22,  6.91s/it]score1 tensor([[0.5195],
-        [0.4707],
-        [0.4922],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5586, 0.5039, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:46,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 10:15:46,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.00 | bwd_microstep: 4628.34 | bwd_inner_microstep: 4623.11 | bwd_allreduce_microstep: 5.12 | step_microstep: 44.74
-[2025-01-25 10:15:46,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.95 | bwd: 4628.36 | bwd_inner: 4623.11 | bwd_allreduce: 5.18 | step: 44.76
-  9%|▉         | 533/5800 [1:29:16<10:06:26,  6.91s/it]                                                       {'loss': 0.084, 'grad_norm': 4.70476770401001, 'learning_rate': 3.95994707186319e-05, 'epoch': 4.59}
-  9%|▉         | 533/5800 [1:29:16<10:06:26,  6.91s/it]score1 tensor([[0.4863],
-        [0.4609],
-        [0.4512],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.5000, 0.4277, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0864, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:15:53,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 10:15:53,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.57 | bwd_microstep: 4626.84 | bwd_inner_microstep: 4622.04 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.04
-[2025-01-25 10:15:53,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.53 | bwd: 4626.86 | bwd_inner: 4622.04 | bwd_allreduce: 4.75 | step: 42.04
-  9%|▉         | 534/5800 [1:29:23<10:06:24,  6.91s/it]                                                       {'loss': 0.0864, 'grad_norm': 4.703179359436035, 'learning_rate': 3.9597243779834536e-05, 'epoch': 4.6}
-  9%|▉         | 534/5800 [1:29:23<10:06:24,  6.91s/it]score1 tensor([[0.4316],
-        [0.4707],
-        [0.4688],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6172, 0.6719, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1377, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:00,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 10:16:00,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.28 | bwd_microstep: 4628.18 | bwd_inner_microstep: 4623.17 | bwd_allreduce_microstep: 4.91 | step_microstep: 41.70
-[2025-01-25 10:16:00,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4628.21 | bwd_inner: 4623.17 | bwd_allreduce: 4.95 | step: 41.71
-  9%|▉         | 535/5800 [1:29:30<10:06:13,  6.91s/it]                                                       {'loss': 0.1377, 'grad_norm': 9.301264762878418, 'learning_rate': 3.959501073027652e-05, 'epoch': 4.61}
-  9%|▉         | 535/5800 [1:29:30<10:06:13,  6.91s/it]score1 tensor([[0.4609],
-        [0.4785],
-        [0.4473],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5391, 0.5703, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:07,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:16:07,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.58 | bwd_microstep: 4622.70 | bwd_inner_microstep: 4617.84 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.12
-[2025-01-25 10:16:07,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.52 | bwd: 4622.72 | bwd_inner: 4617.84 | bwd_allreduce: 4.83 | step: 42.13
-  9%|▉         | 536/5800 [1:29:37<10:06:10,  6.91s/it]                                                       {'loss': 0.0654, 'grad_norm': 9.231874465942383, 'learning_rate': 3.959277157065416e-05, 'epoch': 4.62}
-  9%|▉         | 536/5800 [1:29:37<10:06:10,  6.91s/it]score1 tensor([[0.5391],
-        [0.5039],
-        [0.4941],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6055, 0.5625, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:14,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 10:16:14,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4627.16 | bwd_inner_microstep: 4622.11 | bwd_allreduce_microstep: 4.96 | step_microstep: 52.20
-[2025-01-25 10:16:14,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4627.18 | bwd_inner: 4622.11 | bwd_allreduce: 5.00 | step: 52.21
-  9%|▉         | 537/5800 [1:29:44<10:06:22,  6.91s/it]                                                       {'loss': 0.0552, 'grad_norm': 5.629150390625, 'learning_rate': 3.959052630166565e-05, 'epoch': 4.63}
-  9%|▉         | 537/5800 [1:29:44<10:06:22,  6.91s/it]score1 tensor([[0.5352],
-        [0.4453],
-        [0.4492],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4004, 0.3086, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0806, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:21,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 10:16:21,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.53 | bwd_microstep: 4621.68 | bwd_inner_microstep: 4617.04 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.22
-[2025-01-25 10:16:21,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.48 | bwd: 4621.71 | bwd_inner: 4617.04 | bwd_allreduce: 4.60 | step: 42.23
-  9%|▉         | 538/5800 [1:29:51<10:05:59,  6.91s/it]                                                       {'loss': 0.0806, 'grad_norm': 0.7222028374671936, 'learning_rate': 3.958827492401112e-05, 'epoch': 4.64}
-  9%|▉         | 538/5800 [1:29:51<10:05:59,  6.91s/it]score1 tensor([[0.6172],
-        [0.5586],
-        [0.6016],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4863, 0.4590, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:28,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 10:16:28,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.11 | bwd_microstep: 4627.02 | bwd_inner_microstep: 4622.14 | bwd_allreduce_microstep: 4.79 | step_microstep: 46.12
-[2025-01-25 10:16:28,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.07 | bwd: 4627.04 | bwd_inner: 4622.14 | bwd_allreduce: 4.83 | step: 46.13
-  9%|▉         | 539/5800 [1:29:58<10:05:49,  6.91s/it]                                                       {'loss': 0.0703, 'grad_norm': 5.094717502593994, 'learning_rate': 3.9586017438392585e-05, 'epoch': 4.65}
-  9%|▉         | 539/5800 [1:29:58<10:05:49,  6.91s/it]score1 tensor([[0.5586],
-        [0.5781],
-        [0.5859],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4512, 0.5273, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:35,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 10:16:35,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.03 | bwd_microstep: 4632.63 | bwd_inner_microstep: 4627.20 | bwd_allreduce_microstep: 5.33 | step_microstep: 47.53
-[2025-01-25 10:16:35,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.00 | bwd: 4632.66 | bwd_inner: 4627.20 | bwd_allreduce: 5.38 | step: 47.53
-  9%|▉         | 540/5800 [1:30:05<10:05:51,  6.91s/it]                                                       {'loss': 0.0845, 'grad_norm': 10.180756568908691, 'learning_rate': 3.958375384551396e-05, 'epoch': 4.66}
-  9%|▉         | 540/5800 [1:30:05<10:05:51,  6.91s/it]score1 tensor([[0.5508],
-        [0.5781],
-        [0.6016],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4277, 0.5391, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1099, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:42,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 10:16:42,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.20 | bwd_microstep: 4630.80 | bwd_inner_microstep: 4625.77 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.65
-[2025-01-25 10:16:42,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.16 | bwd: 4630.83 | bwd_inner: 4625.77 | bwd_allreduce: 4.98 | step: 43.66
-  9%|▉         | 541/5800 [1:30:12<10:05:59,  6.91s/it]                                                       {'loss': 0.1099, 'grad_norm': 10.155352592468262, 'learning_rate': 3.958148414608109e-05, 'epoch': 4.66}
-  9%|▉         | 541/5800 [1:30:12<10:05:59,  6.91s/it]score1 tensor([[0.6211],
-        [0.5703],
-        [0.6094],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4844, 0.4980, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1187, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:49,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 10:16:49,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.71 | bwd_microstep: 4621.45 | bwd_inner_microstep: 4616.31 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.68
-[2025-01-25 10:16:49,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.68 | bwd: 4621.47 | bwd_inner: 4616.31 | bwd_allreduce: 5.07 | step: 46.69
-  9%|▉         | 542/5800 [1:30:19<10:05:44,  6.91s/it]                                                       {'loss': 0.1187, 'grad_norm': 10.197153091430664, 'learning_rate': 3.9579208340801684e-05, 'epoch': 4.67}
-  9%|▉         | 542/5800 [1:30:19<10:05:44,  6.91s/it]score1 tensor([[0.5195],
-        [0.4355],
-        [0.5859],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4395, 0.6094, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:16:55,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 10:16:55,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4625.40 | bwd_inner_microstep: 4619.80 | bwd_allreduce_microstep: 5.51 | step_microstep: 45.72
-[2025-01-25 10:16:55,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.42 | bwd: 4625.42 | bwd_inner: 4619.80 | bwd_allreduce: 5.55 | step: 45.72
-  9%|▉         | 543/5800 [1:30:25<10:05:32,  6.91s/it]                                                       {'loss': 0.0391, 'grad_norm': 5.138301849365234, 'learning_rate': 3.957692643038539e-05, 'epoch': 4.68}
-  9%|▉         | 543/5800 [1:30:25<10:05:32,  6.91s/it]score1 tensor([[0.5391],
-        [0.5391],
-        [0.5508],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5391, 0.5430, 0.6836], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:02,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 10:17:02,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4548.07 | bwd_inner_microstep: 4543.34 | bwd_allreduce_microstep: 4.65 | step_microstep: 42.42
-[2025-01-25 10:17:02,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4548.10 | bwd_inner: 4543.34 | bwd_allreduce: 4.69 | step: 42.43
-  9%|▉         | 544/5800 [1:30:32<10:03:17,  6.89s/it]                                                       {'loss': 0.0249, 'grad_norm': 4.991322040557861, 'learning_rate': 3.957463841554375e-05, 'epoch': 4.69}
-  9%|▉         | 544/5800 [1:30:32<10:03:17,  6.89s/it]score1 tensor([[0.4902],
-        [0.4121],
-        [0.4688],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.3613, 0.4629, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:09,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 10:17:09,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.84 | bwd_microstep: 4627.09 | bwd_inner_microstep: 4622.30 | bwd_allreduce_microstep: 4.70 | step_microstep: 41.50
-[2025-01-25 10:17:09,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.77 | bwd: 4627.11 | bwd_inner: 4622.30 | bwd_allreduce: 4.75 | step: 41.51
-  9%|▉         | 545/5800 [1:30:39<10:03:50,  6.89s/it]                                                       {'loss': 0.0352, 'grad_norm': 4.717033386230469, 'learning_rate': 3.95723442969902e-05, 'epoch': 4.7}
-  9%|▉         | 545/5800 [1:30:39<10:03:50,  6.89s/it]score1 tensor([[0.3750],
-        [0.3418],
-        [0.5156],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4180, 0.5508, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:16,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.28 | optimizer_step: 4.36
-[2025-01-25 10:17:16,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.11 | bwd_microstep: 4627.79 | bwd_inner_microstep: 4622.89 | bwd_allreduce_microstep: 4.81 | step_microstep: 46.63
-[2025-01-25 10:17:16,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.08 | bwd: 4627.81 | bwd_inner: 4622.89 | bwd_allreduce: 4.85 | step: 46.63
-  9%|▉         | 546/5800 [1:30:46<10:04:18,  6.90s/it]                                                       {'loss': 0.0762, 'grad_norm': 9.05161190032959, 'learning_rate': 3.957004407544009e-05, 'epoch': 4.71}
-  9%|▉         | 546/5800 [1:30:46<10:04:18,  6.90s/it]score1 tensor([[0.3379],
-        [0.4238],
-        [0.3613],
-        [0.3145]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5547, 0.5117, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1147, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:23,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:17:23,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.24 | bwd_microstep: 4619.39 | bwd_inner_microstep: 4614.63 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.94
-[2025-01-25 10:17:23,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.20 | bwd: 4619.42 | bwd_inner: 4614.63 | bwd_allreduce: 4.72 | step: 43.94
-  9%|▉         | 547/5800 [1:30:53<10:04:09,  6.90s/it]                                                       {'loss': 0.1147, 'grad_norm': 8.890227317810059, 'learning_rate': 3.9567737751610666e-05, 'epoch': 4.72}
-  9%|▉         | 547/5800 [1:30:53<10:04:09,  6.90s/it]score1 tensor([[0.3535],
-        [0.3770],
-        [0.3594],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4746, 0.3789, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:30,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:17:30,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.82 | bwd_microstep: 4632.13 | bwd_inner_microstep: 4627.32 | bwd_allreduce_microstep: 4.72 | step_microstep: 41.78
-[2025-01-25 10:17:30,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.78 | bwd: 4632.15 | bwd_inner: 4627.32 | bwd_allreduce: 4.76 | step: 41.79
-  9%|▉         | 548/5800 [1:31:00<10:04:19,  6.90s/it]                                                       {'loss': 0.0703, 'grad_norm': 8.94106674194336, 'learning_rate': 3.9565425326221086e-05, 'epoch': 4.72}
-  9%|▉         | 548/5800 [1:31:00<10:04:19,  6.90s/it]score1 tensor([[0.4062],
-        [0.4453],
-        [0.4180],
-        [0.3496]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5391, 0.4492, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:37,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 10:17:37,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.13 | bwd_microstep: 4621.54 | bwd_inner_microstep: 4617.02 | bwd_allreduce_microstep: 4.44 | step_microstep: 43.76
-[2025-01-25 10:17:37,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.09 | bwd: 4621.56 | bwd_inner: 4617.02 | bwd_allreduce: 4.48 | step: 43.77
-  9%|▉         | 549/5800 [1:31:07<10:04:02,  6.90s/it]                                                       {'loss': 0.0757, 'grad_norm': 9.13925552368164, 'learning_rate': 3.956310679999239e-05, 'epoch': 4.73}
-  9%|▉         | 549/5800 [1:31:07<10:04:02,  6.90s/it]score1 tensor([[0.5117],
-        [0.3848],
-        [0.4961],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.4336, 0.6211, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0913, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:44,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 10:17:44,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.98 | bwd_microstep: 4622.97 | bwd_inner_microstep: 4618.03 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.91
-[2025-01-25 10:17:44,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.95 | bwd: 4623.00 | bwd_inner: 4618.03 | bwd_allreduce: 4.89 | step: 42.92
-  9%|▉         | 550/5800 [1:31:14<10:03:53,  6.90s/it]                                                       {'loss': 0.0913, 'grad_norm': 7.189351558685303, 'learning_rate': 3.956078217364755e-05, 'epoch': 4.74}
-  9%|▉         | 550/5800 [1:31:14<10:03:53,  6.90s/it]score1 tensor([[0.5195],
-        [0.4805],
-        [0.6016],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5156, 0.6133, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:51,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 10:17:51,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.77 | bwd_microstep: 4627.02 | bwd_inner_microstep: 4622.24 | bwd_allreduce_microstep: 4.69 | step_microstep: 47.00
-[2025-01-25 10:17:51,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.73 | bwd: 4627.04 | bwd_inner: 4622.24 | bwd_allreduce: 4.73 | step: 47.01
- 10%|▉         | 551/5800 [1:31:21<10:04:11,  6.91s/it]                                                       {'loss': 0.0298, 'grad_norm': 9.736190795898438, 'learning_rate': 3.955845144791142e-05, 'epoch': 4.75}
- 10%|▉         | 551/5800 [1:31:21<10:04:11,  6.91s/it]score1 tensor([[0.7617],
-        [0.6328],
-        [0.6758],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.5898, 0.4941, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0967, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:17:58,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 10:17:58,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.58 | bwd_microstep: 4623.79 | bwd_inner_microstep: 4619.01 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.71
-[2025-01-25 10:17:58,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.45 | bwd: 4623.82 | bwd_inner: 4619.01 | bwd_allreduce: 4.72 | step: 43.72
- 10%|▉         | 552/5800 [1:31:28<10:04:23,  6.91s/it]                                                       {'loss': 0.0967, 'grad_norm': 10.929574012756348, 'learning_rate': 3.9556114623510755e-05, 'epoch': 4.76}
- 10%|▉         | 552/5800 [1:31:28<10:04:23,  6.91s/it]score1 tensor([[0.6250],
-        [0.5508],
-        [0.7734],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4707, 0.6875, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:04,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 10:18:04,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.70 | bwd_microstep: 4626.95 | bwd_inner_microstep: 4621.79 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.71
-[2025-01-25 10:18:04,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4626.98 | bwd_inner: 4621.79 | bwd_allreduce: 5.09 | step: 45.71
- 10%|▉         | 553/5800 [1:31:34<10:04:24,  6.91s/it]                                                       {'loss': 0.1309, 'grad_norm': 10.491647720336914, 'learning_rate': 3.955377170117422e-05, 'epoch': 4.77}
- 10%|▉         | 553/5800 [1:31:34<10:04:24,  6.91s/it]score1 tensor([[0.6133],
-        [0.6367],
-        [0.6367],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.5352, 0.5664, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0786, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:11,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 10:18:11,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.60 | bwd_microstep: 4626.27 | bwd_inner_microstep: 4621.39 | bwd_allreduce_microstep: 4.79 | step_microstep: 41.60
-[2025-01-25 10:18:11,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.57 | bwd: 4626.30 | bwd_inner: 4621.39 | bwd_allreduce: 4.84 | step: 41.61
- 10%|▉         | 554/5800 [1:31:41<10:04:13,  6.91s/it]                                                       {'loss': 0.0786, 'grad_norm': 10.5689058303833, 'learning_rate': 3.955142268163239e-05, 'epoch': 4.78}
- 10%|▉         | 554/5800 [1:31:41<10:04:13,  6.91s/it]score1 tensor([[0.6719],
-        [0.5977],
-        [0.6055],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4492, 0.4922, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:18,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 10:18:18,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4622.73 | bwd_inner_microstep: 4618.03 | bwd_allreduce_microstep: 4.60 | step_microstep: 44.29
-[2025-01-25 10:18:18,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.81 | bwd: 4622.75 | bwd_inner: 4618.03 | bwd_allreduce: 4.65 | step: 44.29
- 10%|▉         | 555/5800 [1:31:48<10:04:06,  6.91s/it]                                                       {'loss': 0.1523, 'grad_norm': 10.533262252807617, 'learning_rate': 3.954906756561772e-05, 'epoch': 4.78}
- 10%|▉         | 555/5800 [1:31:48<10:04:06,  6.91s/it]score1 tensor([[0.5938],
-        [0.5234],
-        [0.6406],
-        [0.7070]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.2812, 0.3750, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1846, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:25,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.33 | optimizer_step: 4.36
-[2025-01-25 10:18:25,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.59 | bwd_microstep: 4623.21 | bwd_inner_microstep: 4617.22 | bwd_allreduce_microstep: 5.89 | step_microstep: 65.14
-[2025-01-25 10:18:25,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.54 | bwd: 4623.24 | bwd_inner: 4617.22 | bwd_allreduce: 5.95 | step: 65.22
- 10%|▉         | 556/5800 [1:31:55<10:05:07,  6.92s/it]                                                       {'loss': 0.1846, 'grad_norm': 10.362982749938965, 'learning_rate': 3.954670635386457e-05, 'epoch': 4.79}
- 10%|▉         | 556/5800 [1:31:55<10:05:07,  6.92s/it]score1 tensor([[0.5469],
-        [0.5586],
-        [0.5469],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4082, 0.4121, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:32,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.53 | optimizer_step: 4.36
-[2025-01-25 10:18:32,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.21 | bwd_microstep: 4626.36 | bwd_inner_microstep: 4621.57 | bwd_allreduce_microstep: 4.69 | step_microstep: 63.49
-[2025-01-25 10:18:32,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.17 | bwd: 4626.38 | bwd_inner: 4621.57 | bwd_allreduce: 4.74 | step: 63.53
- 10%|▉         | 557/5800 [1:32:02<10:06:20,  6.94s/it]                                                       {'loss': 0.123, 'grad_norm': 9.833823204040527, 'learning_rate': 3.954433904710922e-05, 'epoch': 4.8}
- 10%|▉         | 557/5800 [1:32:02<10:06:20,  6.94s/it]score1 tensor([[0.5234],
-        [0.5586],
-        [0.6484],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4062, 0.6016, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0811, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:39,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 10:18:39,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.01 | bwd_microstep: 4628.17 | bwd_inner_microstep: 4623.16 | bwd_allreduce_microstep: 4.92 | step_microstep: 46.93
-[2025-01-25 10:18:39,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.98 | bwd: 4628.20 | bwd_inner: 4623.16 | bwd_allreduce: 4.97 | step: 46.94
- 10%|▉         | 558/5800 [1:32:09<10:06:26,  6.94s/it]                                                       {'loss': 0.0811, 'grad_norm': 4.950697422027588, 'learning_rate': 3.954196564608983e-05, 'epoch': 4.81}
- 10%|▉         | 558/5800 [1:32:09<10:06:26,  6.94s/it]score1 tensor([[0.4727],
-        [0.5391],
-        [0.5469],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5078, 0.5898, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:46,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 10:18:46,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.32 | bwd_microstep: 4622.24 | bwd_inner_microstep: 4617.05 | bwd_allreduce_microstep: 5.10 | step_microstep: 48.77
-[2025-01-25 10:18:46,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.29 | bwd: 4622.26 | bwd_inner: 4617.05 | bwd_allreduce: 5.15 | step: 48.77
- 10%|▉         | 559/5800 [1:32:16<10:05:47,  6.94s/it]                                                       {'loss': 0.04, 'grad_norm': 0.48044225573539734, 'learning_rate': 3.9539586151546464e-05, 'epoch': 4.82}
- 10%|▉         | 559/5800 [1:32:16<10:05:47,  6.94s/it]score1 tensor([[0.3691],
-        [0.4785],
-        [0.3691],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5508, 0.4297, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0894, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:18:53,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 10:18:53,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.38 | bwd_microstep: 4623.06 | bwd_inner_microstep: 4618.22 | bwd_allreduce_microstep: 4.74 | step_microstep: 41.44
-[2025-01-25 10:18:53,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.34 | bwd: 4623.08 | bwd_inner: 4618.22 | bwd_allreduce: 4.79 | step: 41.45
- 10%|▉         | 560/5800 [1:32:23<10:05:07,  6.93s/it]                                                       {'loss': 0.0894, 'grad_norm': 8.87533950805664, 'learning_rate': 3.9537200564221106e-05, 'epoch': 4.83}
- 10%|▉         | 560/5800 [1:32:23<10:05:07,  6.93s/it]score1 tensor([[0.4355],
-        [0.4414],
-        [0.4219],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4355, 0.5703, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:00,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 10:19:00,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.46 | bwd_microstep: 4623.85 | bwd_inner_microstep: 4618.50 | bwd_allreduce_microstep: 5.27 | step_microstep: 41.02
-[2025-01-25 10:19:00,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.42 | bwd: 4623.87 | bwd_inner: 4618.50 | bwd_allreduce: 5.31 | step: 41.03
- 10%|▉         | 561/5800 [1:32:30<10:04:19,  6.92s/it]                                                       {'loss': 0.1113, 'grad_norm': 4.480847358703613, 'learning_rate': 3.9534808884857605e-05, 'epoch': 4.84}
- 10%|▉         | 561/5800 [1:32:30<10:04:19,  6.92s/it]score1 tensor([[0.3906],
-        [0.4043],
-        [0.4004],
-        [0.3008]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5195, 0.5625, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1060, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:07,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:19:07,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.85 | bwd_microstep: 4622.76 | bwd_inner_microstep: 4617.72 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.54
-[2025-01-25 10:19:07,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.81 | bwd: 4622.79 | bwd_inner: 4617.72 | bwd_allreduce: 5.00 | step: 46.55
- 10%|▉         | 562/5800 [1:32:37<10:04:05,  6.92s/it]                                                       {'loss': 0.106, 'grad_norm': 8.483345031738281, 'learning_rate': 3.953241111420174e-05, 'epoch': 4.84}
- 10%|▉         | 562/5800 [1:32:37<10:04:05,  6.92s/it]score1 tensor([[0.3730],
-        [0.5547],
-        [0.5195],
-        [0.3496]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6445, 0.6445, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1108, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:14,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 10:19:14,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.49 | bwd_microstep: 4638.13 | bwd_inner_microstep: 4633.52 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.99
-[2025-01-25 10:19:14,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4638.15 | bwd_inner: 4633.52 | bwd_allreduce: 4.58 | step: 41.99
- 10%|▉         | 563/5800 [1:32:44<10:04:03,  6.92s/it]                                                       {'loss': 0.1108, 'grad_norm': 9.127629280090332, 'learning_rate': 3.953000725300117e-05, 'epoch': 4.85}
- 10%|▉         | 563/5800 [1:32:44<10:04:03,  6.92s/it]score1 tensor([[0.5117],
-        [0.4395],
-        [0.3906],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4941, 0.4395, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:21,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 10:19:21,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.21 | bwd_microstep: 4632.26 | bwd_inner_microstep: 4627.21 | bwd_allreduce_microstep: 4.94 | step_microstep: 40.98
-[2025-01-25 10:19:21,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4632.28 | bwd_inner: 4627.21 | bwd_allreduce: 5.00 | step: 40.99
- 10%|▉         | 564/5800 [1:32:51<10:03:42,  6.92s/it]                                                       {'loss': 0.0693, 'grad_norm': 9.008822441101074, 'learning_rate': 3.952759730200546e-05, 'epoch': 4.86}
- 10%|▉         | 564/5800 [1:32:51<10:03:42,  6.92s/it]score1 tensor([[0.4629],
-        [0.5117],
-        [0.4219],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4980, 0.5391, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:28,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 10:19:28,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.47 | bwd_microstep: 4627.91 | bwd_inner_microstep: 4623.35 | bwd_allreduce_microstep: 4.48 | step_microstep: 41.97
-[2025-01-25 10:19:28,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.44 | bwd: 4627.93 | bwd_inner: 4623.35 | bwd_allreduce: 4.52 | step: 41.97
- 10%|▉         | 565/5800 [1:32:58<10:03:22,  6.92s/it]                                                       {'loss': 0.0444, 'grad_norm': 5.099398612976074, 'learning_rate': 3.9525181261966074e-05, 'epoch': 4.87}
- 10%|▉         | 565/5800 [1:32:58<10:03:22,  6.92s/it]score1 tensor([[0.4668],
-        [0.5703],
-        [0.5078],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5625, 0.4883, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:34,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.36
-[2025-01-25 10:19:34,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4619.58 | bwd_inner_microstep: 4615.22 | bwd_allreduce_microstep: 4.29 | step_microstep: 41.87
-[2025-01-25 10:19:34,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.34 | bwd: 4619.61 | bwd_inner: 4615.22 | bwd_allreduce: 4.33 | step: 41.87
- 10%|▉         | 566/5800 [1:33:04<10:02:59,  6.91s/it]                                                       {'loss': 0.0259, 'grad_norm': 4.83725643157959, 'learning_rate': 3.952275913363639e-05, 'epoch': 4.88}
- 10%|▉         | 566/5800 [1:33:04<10:02:59,  6.91s/it]score1 tensor([[0.6328],
-        [0.4551],
-        [0.5820],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.4258, 0.6797, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:41,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.17 | optimizer_step: 4.61
-[2025-01-25 10:19:41,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4624.59 | bwd_inner_microstep: 4620.08 | bwd_allreduce_microstep: 4.41 | step_microstep: 43.71
-[2025-01-25 10:19:41,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.60 | bwd: 4624.62 | bwd_inner: 4620.08 | bwd_allreduce: 4.46 | step: 43.72
- 10%|▉         | 567/5800 [1:33:11<10:02:41,  6.91s/it]                                                       {'loss': 0.0454, 'grad_norm': 5.438660144805908, 'learning_rate': 3.9520330917771654e-05, 'epoch': 4.89}
- 10%|▉         | 567/5800 [1:33:11<10:02:41,  6.91s/it]score1 tensor([[0.5820],
-        [0.5469],
-        [0.6562],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4570, 0.6367, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:48,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 10:19:48,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.70 | bwd_microstep: 4622.55 | bwd_inner_microstep: 4617.60 | bwd_allreduce_microstep: 4.85 | step_microstep: 41.96
-[2025-01-25 10:19:48,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.67 | bwd: 4622.58 | bwd_inner: 4617.60 | bwd_allreduce: 4.90 | step: 41.96
- 10%|▉         | 568/5800 [1:33:18<10:02:30,  6.91s/it]                                                       {'loss': 0.0508, 'grad_norm': 10.142536163330078, 'learning_rate': 3.9517896615129034e-05, 'epoch': 4.9}
- 10%|▉         | 568/5800 [1:33:18<10:02:30,  6.91s/it]score1 tensor([[0.4609],
-        [0.5742],
-        [0.5312],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.4980, 0.5117, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:19:55,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.36
-[2025-01-25 10:19:55,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.96 | bwd_microstep: 4618.86 | bwd_inner_microstep: 4614.24 | bwd_allreduce_microstep: 4.52 | step_microstep: 41.55
-[2025-01-25 10:19:55,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4618.88 | bwd_inner: 4614.24 | bwd_allreduce: 4.57 | step: 41.55
- 10%|▉         | 569/5800 [1:33:25<10:02:16,  6.91s/it]                                                       {'loss': 0.0571, 'grad_norm': 9.677462577819824, 'learning_rate': 3.951545622646758e-05, 'epoch': 4.91}
- 10%|▉         | 569/5800 [1:33:25<10:02:16,  6.91s/it]score1 tensor([[0.4766],
-        [0.5547],
-        [0.5156],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.6133, 0.4727, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:02,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.36
-[2025-01-25 10:20:02,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4627.66 | bwd_inner_microstep: 4619.52 | bwd_allreduce_microstep: 8.05 | step_microstep: 42.91
-[2025-01-25 10:20:02,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4627.68 | bwd_inner: 4619.52 | bwd_allreduce: 8.08 | step: 42.92
- 10%|▉         | 570/5800 [1:33:32<10:02:15,  6.91s/it]                                                       {'loss': 0.0527, 'grad_norm': 0.6204631924629211, 'learning_rate': 3.951300975254825e-05, 'epoch': 4.91}
- 10%|▉         | 570/5800 [1:33:32<10:02:15,  6.91s/it]score1 tensor([[0.4824],
-        [0.4395],
-        [0.5234],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4961, 0.4688, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:09,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 10:20:09,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.76 | bwd_microstep: 4628.90 | bwd_inner_microstep: 4624.80 | bwd_allreduce_microstep: 4.03 | step_microstep: 41.17
-[2025-01-25 10:20:09,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.72 | bwd: 4628.92 | bwd_inner: 4624.80 | bwd_allreduce: 4.06 | step: 41.19
- 10%|▉         | 571/5800 [1:33:39<10:02:08,  6.91s/it]                                                       {'loss': 0.0605, 'grad_norm': 0.6533364653587341, 'learning_rate': 3.9510557194133904e-05, 'epoch': 4.92}
- 10%|▉         | 571/5800 [1:33:39<10:02:08,  6.91s/it]score1 tensor([[0.6016],
-        [0.5000],
-        [0.4375],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4453, 0.3105, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:16,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.24 | optimizer_step: 4.37
-[2025-01-25 10:20:16,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4623.21 | bwd_inner_microstep: 4618.06 | bwd_allreduce_microstep: 5.08 | step_microstep: 39.87
-[2025-01-25 10:20:16,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.92 | bwd: 4623.22 | bwd_inner: 4618.06 | bwd_allreduce: 5.11 | step: 39.87
- 10%|▉         | 572/5800 [1:33:46<10:01:21,  6.90s/it]                                                       {'loss': 0.0557, 'grad_norm': 0.807702362537384, 'learning_rate': 3.9508098551989284e-05, 'epoch': 4.93}
- 10%|▉         | 572/5800 [1:33:46<10:01:21,  6.90s/it]score1 tensor([[0.5586],
-        [0.4844],
-        [0.3828],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5469, 0.3984, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:23,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 10:20:23,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.41 | bwd_microstep: 4631.25 | bwd_inner_microstep: 4625.52 | bwd_allreduce_microstep: 4.81 | step_microstep: 47.87
-[2025-01-25 10:20:23,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.40 | bwd: 4631.27 | bwd_inner: 4625.52 | bwd_allreduce: 5.65 | step: 47.89
- 10%|▉         | 573/5800 [1:33:53<10:01:18,  6.90s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.638636589050293, 'learning_rate': 3.9505633826881044e-05, 'epoch': 4.94}
- 10%|▉         | 573/5800 [1:33:53<10:01:18,  6.90s/it]score1 tensor([[0.4688],
-        [0.5234],
-        [0.3594],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5625, 0.3945, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:30,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 10:20:30,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.58 | bwd_microstep: 4629.12 | bwd_inner_microstep: 4622.38 | bwd_allreduce_microstep: 6.67 | step_microstep: 41.99
-[2025-01-25 10:20:30,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.55 | bwd: 4629.15 | bwd_inner: 4622.38 | bwd_allreduce: 6.70 | step: 42.00
- 10%|▉         | 574/5800 [1:34:00<10:01:24,  6.90s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.6447532773017883, 'learning_rate': 3.950316301957772e-05, 'epoch': 4.95}
- 10%|▉         | 574/5800 [1:34:00<10:01:24,  6.90s/it]score1 tensor([[0.6641],
-        [0.4238],
-        [0.5156],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3730, 0.5430, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:37,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:20:37,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4622.12 | bwd_inner_microstep: 4617.35 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.56
-[2025-01-25 10:20:37,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4622.14 | bwd_inner: 4617.35 | bwd_allreduce: 4.72 | step: 41.57
- 10%|▉         | 575/5800 [1:34:07<10:01:06,  6.90s/it]                                                       {'loss': 0.0308, 'grad_norm': 4.987320423126221, 'learning_rate': 3.950068613084976e-05, 'epoch': 4.96}
- 10%|▉         | 575/5800 [1:34:07<10:01:06,  6.90s/it]score1 tensor([[0.5195],
-        [0.5703],
-        [0.4062],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5000, 0.4492, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:44,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 10:20:44,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.95 | bwd_microstep: 4624.08 | bwd_inner_microstep: 4619.29 | bwd_allreduce_microstep: 4.72 | step_microstep: 41.05
-[2025-01-25 10:20:44,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.90 | bwd: 4624.10 | bwd_inner: 4619.29 | bwd_allreduce: 4.75 | step: 41.05
- 10%|▉         | 576/5800 [1:34:13<10:01:07,  6.90s/it]                                                       {'loss': 0.0625, 'grad_norm': 5.358130931854248, 'learning_rate': 3.94982031614695e-05, 'epoch': 4.97}
- 10%|▉         | 576/5800 [1:34:14<10:01:07,  6.90s/it]score1 tensor([[0.4707],
-        [0.4531],
-        [0.4531],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5430, 0.5039, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:50,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:20:50,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.76 | bwd_microstep: 4625.93 | bwd_inner_microstep: 4621.02 | bwd_allreduce_microstep: 4.80 | step_microstep: 41.98
-[2025-01-25 10:20:50,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.72 | bwd: 4625.95 | bwd_inner: 4621.02 | bwd_allreduce: 4.86 | step: 41.99
- 10%|▉         | 577/5800 [1:34:20<10:01:08,  6.91s/it]                                                       {'loss': 0.062, 'grad_norm': 9.017085075378418, 'learning_rate': 3.9495714112211164e-05, 'epoch': 4.97}
- 10%|▉         | 577/5800 [1:34:20<10:01:08,  6.91s/it]score1 tensor([[0.5117],
-        [0.4727],
-        [0.4414],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4648, 0.4004, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:20:57,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:20:57,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.71 | bwd_microstep: 4632.93 | bwd_inner_microstep: 4628.15 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.99
-[2025-01-25 10:20:57,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4632.96 | bwd_inner: 4628.15 | bwd_allreduce: 4.74 | step: 42.01
- 10%|▉         | 578/5800 [1:34:27<10:01:12,  6.91s/it]                                                       {'loss': 0.0308, 'grad_norm': 0.7832223773002625, 'learning_rate': 3.94932189838509e-05, 'epoch': 4.98}
- 10%|▉         | 578/5800 [1:34:27<10:01:12,  6.91s/it]score1 tensor([[0.4395],
-        [0.4648],
-        [0.3984],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4355, 0.3984, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:21:04,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 10:21:04,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.38 | bwd_microstep: 4584.43 | bwd_inner_microstep: 4579.48 | bwd_allreduce_microstep: 4.86 | step_microstep: 54.13
-[2025-01-25 10:21:04,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.34 | bwd: 4584.45 | bwd_inner: 4579.48 | bwd_allreduce: 4.90 | step: 54.14
- 10%|▉         | 579/5800 [1:34:34<10:00:20,  6.90s/it]                                                       {'loss': 0.0161, 'grad_norm': 6.65670108795166, 'learning_rate': 3.9490717777166706e-05, 'epoch': 4.99}
- 10%|▉         | 579/5800 [1:34:34<10:00:20,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1914, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:21:08,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 10:21:08,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 571.87 | bwd_microstep: 1219.77 | bwd_inner_microstep: 1215.06 | bwd_allreduce_microstep: 4.62 | step_microstep: 41.34
-[2025-01-25 10:21:08,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.83 | bwd: 1219.81 | bwd_inner: 1215.07 | bwd_allreduce: 4.66 | step: 41.35
- 10%|█         | 580/5800 [1:34:38<8:49:32,  6.09s/it]                                                       {'loss': 0.1914, 'grad_norm': 8.95147705078125, 'learning_rate': 3.948821049293853e-05, 'epoch': 5.0}
- 10%|█         | 580/5800 [1:34:38<8:49:32,  6.09s/it][2025-01-25 10:21:13,316] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:21:22,915] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:21:33,205] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:21:43,232] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4199],
-        [0.3828],
-        [0.5195],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4453, 0.4590, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0806, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:00,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 10:22:00,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.70 | bwd_microstep: 4593.89 | bwd_inner_microstep: 4589.19 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.18
-[2025-01-25 10:22:00,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.65 | bwd: 4593.92 | bwd_inner: 4589.19 | bwd_allreduce: 4.66 | step: 43.18
- 10%|█         | 581/5800 [1:35:30<28:26:17, 19.62s/it]                                                       {'loss': 0.0806, 'grad_norm': 4.1996541023254395, 'learning_rate': 3.948569713194816e-05, 'epoch': 5.01}
- 10%|█         | 581/5800 [1:35:30<28:26:17, 19.62s/it]score1 tensor([[0.5508],
-        [0.5391],
-        [0.3867],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.7070, 0.3691, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:06,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 10:22:06,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.42 | bwd_microstep: 4582.27 | bwd_inner_microstep: 4577.53 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.17
-[2025-01-25 10:22:06,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.39 | bwd: 4582.30 | bwd_inner: 4577.53 | bwd_allreduce: 4.70 | step: 41.17
- 10%|█         | 582/5800 [1:35:36<22:52:22, 15.78s/it]                                                       {'loss': 0.0693, 'grad_norm': 5.009578704833984, 'learning_rate': 3.9483177694979324e-05, 'epoch': 5.02}
- 10%|█         | 582/5800 [1:35:36<22:52:22, 15.78s/it]score1 tensor([[0.6367],
-        [0.4180],
-        [0.5156],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4082, 0.4961, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:13,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 10:22:13,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.39 | bwd_microstep: 4599.43 | bwd_inner_microstep: 4594.12 | bwd_allreduce_microstep: 5.19 | step_microstep: 45.54
-[2025-01-25 10:22:13,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.35 | bwd: 4599.45 | bwd_inner: 4594.13 | bwd_allreduce: 5.25 | step: 45.54
- 10%|█         | 583/5800 [1:35:43<18:59:16, 13.10s/it]                                                       {'loss': 0.0303, 'grad_norm': 9.381196975708008, 'learning_rate': 3.948065218281761e-05, 'epoch': 5.03}
- 10%|█         | 583/5800 [1:35:43<18:59:16, 13.10s/it]score1 tensor([[0.5312],
-        [0.4473],
-        [0.4062],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4395, 0.2812, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:20,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 10:22:20,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.87 | bwd_microstep: 4605.14 | bwd_inner_microstep: 4600.60 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.28
-[2025-01-25 10:22:20,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.83 | bwd: 4605.16 | bwd_inner: 4600.60 | bwd_allreduce: 4.49 | step: 42.29
- 10%|█         | 584/5800 [1:35:50<16:16:24, 11.23s/it]                                                       {'loss': 0.0586, 'grad_norm': 0.729198157787323, 'learning_rate': 3.9478120596250546e-05, 'epoch': 5.03}
- 10%|█         | 584/5800 [1:35:50<16:16:24, 11.23s/it]score1 tensor([[0.4707],
-        [0.4766],
-        [0.6445],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5273, 0.7031, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:27,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 10:22:27,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.51 | bwd_microstep: 4608.79 | bwd_inner_microstep: 4604.11 | bwd_allreduce_microstep: 4.61 | step_microstep: 41.76
-[2025-01-25 10:22:27,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.46 | bwd: 4608.81 | bwd_inner: 4604.11 | bwd_allreduce: 4.64 | step: 41.77
- 10%|█         | 585/5800 [1:35:57<14:22:35,  9.92s/it]                                                       {'loss': 0.0771, 'grad_norm': 9.587626457214355, 'learning_rate': 3.947558293606749e-05, 'epoch': 5.04}
- 10%|█         | 585/5800 [1:35:57<14:22:35,  9.92s/it]score1 tensor([[0.4883],
-        [0.4922],
-        [0.4414],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4277, 0.4980, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:34,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 10:22:34,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.37 | bwd_microstep: 4613.71 | bwd_inner_microstep: 4608.41 | bwd_allreduce_microstep: 5.16 | step_microstep: 44.12
-[2025-01-25 10:22:34,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.33 | bwd: 4613.74 | bwd_inner: 4608.41 | bwd_allreduce: 5.21 | step: 44.12
- 10%|█         | 586/5800 [1:36:04<13:02:58,  9.01s/it]                                                       {'loss': 0.0435, 'grad_norm': 0.6378743052482605, 'learning_rate': 3.9473039203059743e-05, 'epoch': 5.05}
- 10%|█         | 586/5800 [1:36:04<13:02:58,  9.01s/it]score1 tensor([[0.5312],
-        [0.4727],
-        [0.5625],
-        [0.7500]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.4141, 0.5352, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:41,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 10:22:41,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.21 | bwd_microstep: 4620.35 | bwd_inner_microstep: 4615.42 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.07
-[2025-01-25 10:22:41,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.18 | bwd: 4620.38 | bwd_inner: 4615.42 | bwd_allreduce: 4.86 | step: 43.08
- 10%|█         | 587/5800 [1:36:11<12:07:40,  8.38s/it]                                                       {'loss': 0.0776, 'grad_norm': 10.040495872497559, 'learning_rate': 3.9470489398020495e-05, 'epoch': 5.06}
- 10%|█         | 587/5800 [1:36:11<12:07:40,  8.38s/it]score1 tensor([[0.5273],
-        [0.7383],
-        [0.5234],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5625, 0.4453, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1021, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:48,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 10:22:48,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.63 | bwd_microstep: 4612.01 | bwd_inner_microstep: 4607.36 | bwd_allreduce_microstep: 4.55 | step_microstep: 42.34
-[2025-01-25 10:22:48,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.60 | bwd: 4612.05 | bwd_inner: 4607.36 | bwd_allreduce: 4.60 | step: 42.35
- 10%|█         | 588/5800 [1:36:18<11:28:27,  7.93s/it]                                                       {'loss': 0.1021, 'grad_norm': 9.741040229797363, 'learning_rate': 3.946793352174481e-05, 'epoch': 5.07}
- 10%|█         | 588/5800 [1:36:18<11:28:27,  7.93s/it]score1 tensor([[0.4844],
-        [0.5469],
-        [0.5078],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4980, 0.4297, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:22:55,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 10:22:55,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.70 | bwd_microstep: 4603.89 | bwd_inner_microstep: 4599.31 | bwd_allreduce_microstep: 4.50 | step_microstep: 41.72
-[2025-01-25 10:22:55,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.67 | bwd: 4603.91 | bwd_inner: 4599.31 | bwd_allreduce: 4.53 | step: 41.73
- 10%|█         | 589/5800 [1:36:25<11:00:54,  7.61s/it]                                                       {'loss': 0.0601, 'grad_norm': 9.489486694335938, 'learning_rate': 3.946537157502965e-05, 'epoch': 5.08}
- 10%|█         | 589/5800 [1:36:25<11:00:54,  7.61s/it]score1 tensor([[0.5547],
-        [0.5859],
-        [0.4648],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4531, 0.4160, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:01,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 10:23:01,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.35 | bwd_microstep: 4615.89 | bwd_inner_microstep: 4611.35 | bwd_allreduce_microstep: 4.47 | step_microstep: 41.12
-[2025-01-25 10:23:01,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.27 | bwd: 4615.91 | bwd_inner: 4611.35 | bwd_allreduce: 4.50 | step: 41.13
- 10%|█         | 590/5800 [1:36:31<10:41:51,  7.39s/it]                                                       {'loss': 0.0586, 'grad_norm': 5.209217071533203, 'learning_rate': 3.946280355867388e-05, 'epoch': 5.09}
- 10%|█         | 590/5800 [1:36:31<10:41:51,  7.39s/it]score1 tensor([[0.4355],
-        [0.6445],
-        [0.4199],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.6133, 0.5195, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:08,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 10:23:08,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.15 | bwd_microstep: 4613.65 | bwd_inner_microstep: 4608.90 | bwd_allreduce_microstep: 4.66 | step_microstep: 44.16
-[2025-01-25 10:23:08,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.11 | bwd: 4613.67 | bwd_inner: 4608.90 | bwd_allreduce: 4.70 | step: 44.17
- 10%|█         | 591/5800 [1:36:38<10:28:26,  7.24s/it]                                                       {'loss': 0.041, 'grad_norm': 5.413472652435303, 'learning_rate': 3.9460229473478256e-05, 'epoch': 5.09}
- 10%|█         | 591/5800 [1:36:38<10:28:26,  7.24s/it]score1 tensor([[0.3477],
-        [0.4004],
-        [0.4434],
-        [0.3203]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4297, 0.5820, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:15,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 10:23:15,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.36 | bwd_microstep: 4612.16 | bwd_inner_microstep: 4607.61 | bwd_allreduce_microstep: 4.46 | step_microstep: 41.52
-[2025-01-25 10:23:15,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.33 | bwd: 4612.21 | bwd_inner: 4607.61 | bwd_allreduce: 4.51 | step: 41.53
- 10%|█         | 592/5800 [1:36:45<10:19:01,  7.13s/it]                                                       {'loss': 0.062, 'grad_norm': 8.526514053344727, 'learning_rate': 3.945764932024541e-05, 'epoch': 5.1}
- 10%|█         | 592/5800 [1:36:45<10:19:01,  7.13s/it]score1 tensor([[0.2988],
-        [0.4023],
-        [0.3848],
-        [0.2832]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4961, 0.5898, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1245, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:22,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 10:23:22,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.27 | bwd_microstep: 4622.71 | bwd_inner_microstep: 4617.88 | bwd_allreduce_microstep: 4.73 | step_microstep: 42.90
-[2025-01-25 10:23:22,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.23 | bwd: 4622.74 | bwd_inner: 4617.88 | bwd_allreduce: 4.79 | step: 42.91
- 10%|█         | 593/5800 [1:36:52<10:12:46,  7.06s/it]                                                       {'loss': 0.1245, 'grad_norm': 8.257475852966309, 'learning_rate': 3.945506309977989e-05, 'epoch': 5.11}
- 10%|█         | 593/5800 [1:36:52<10:12:46,  7.06s/it]score1 tensor([[0.4160],
-        [0.4492],
-        [0.3457],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5273, 0.5469, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0889, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:29,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 10:23:29,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.28 | bwd_microstep: 4623.62 | bwd_inner_microstep: 4618.78 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.91
-[2025-01-25 10:23:29,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.24 | bwd: 4623.65 | bwd_inner: 4618.78 | bwd_allreduce: 4.80 | step: 42.92
- 10%|█         | 594/5800 [1:36:59<10:08:22,  7.01s/it]                                                       {'loss': 0.0889, 'grad_norm': 8.609065055847168, 'learning_rate': 3.945247081288813e-05, 'epoch': 5.12}
- 10%|█         | 594/5800 [1:36:59<10:08:22,  7.01s/it]score1 tensor([[0.3281],
-        [0.4590],
-        [0.4258],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.6211, 0.5469, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1177, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:36,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 10:23:36,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4626.70 | bwd_inner_microstep: 4621.78 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.13
-[2025-01-25 10:23:36,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.97 | bwd: 4626.73 | bwd_inner: 4621.78 | bwd_allreduce: 4.88 | step: 44.15
- 10%|█         | 595/5800 [1:37:06<10:05:34,  6.98s/it]                                                       {'loss': 0.1177, 'grad_norm': 8.522393226623535, 'learning_rate': 3.9449872460378426e-05, 'epoch': 5.13}
- 10%|█         | 595/5800 [1:37:06<10:05:34,  6.98s/it]score1 tensor([[0.4121],
-        [0.4414],
-        [0.4648],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6484, 0.5703, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:43,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 10:23:43,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.28 | bwd_microstep: 4614.11 | bwd_inner_microstep: 4609.00 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.27
-[2025-01-25 10:23:43,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.23 | bwd: 4614.14 | bwd_inner: 4609.00 | bwd_allreduce: 5.07 | step: 43.28
- 10%|█         | 596/5800 [1:37:13<10:03:03,  6.95s/it]                                                       {'loss': 0.1309, 'grad_norm': 9.025249481201172, 'learning_rate': 3.944726804306101e-05, 'epoch': 5.14}
- 10%|█         | 596/5800 [1:37:13<10:03:03,  6.95s/it]score1 tensor([[0.5039],
-        [0.3828],
-        [0.3457],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4395, 0.3418, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:50,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 10:23:50,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.20 | bwd_microstep: 4615.82 | bwd_inner_microstep: 4611.07 | bwd_allreduce_microstep: 4.67 | step_microstep: 41.82
-[2025-01-25 10:23:50,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.17 | bwd: 4615.84 | bwd_inner: 4611.06 | bwd_allreduce: 4.71 | step: 41.82
- 10%|█         | 597/5800 [1:37:20<10:01:13,  6.93s/it]                                                       {'loss': 0.0244, 'grad_norm': 0.9366965293884277, 'learning_rate': 3.9444657561747976e-05, 'epoch': 5.15}
- 10%|█         | 597/5800 [1:37:20<10:01:13,  6.93s/it]score1 tensor([[0.4375],
-        [0.4551],
-        [0.4531],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4492, 0.4512, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:23:57,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 10:23:57,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.17 | bwd_microstep: 4627.41 | bwd_inner_microstep: 4622.60 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.10
-[2025-01-25 10:23:57,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.14 | bwd: 4627.44 | bwd_inner: 4622.60 | bwd_allreduce: 4.76 | step: 42.11
- 10%|█         | 598/5800 [1:37:27<10:00:13,  6.92s/it]                                                       {'loss': 0.0142, 'grad_norm': 8.993098258972168, 'learning_rate': 3.944204101725333e-05, 'epoch': 5.16}
- 10%|█         | 598/5800 [1:37:27<10:00:13,  6.92s/it]score1 tensor([[0.7070],
-        [0.6367],
-        [0.6484],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6445, 0.6211, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0747, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:03,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:24:03,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4617.15 | bwd_inner_microstep: 4612.00 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.53
-[2025-01-25 10:24:03,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.35 | bwd: 4617.17 | bwd_inner: 4612.00 | bwd_allreduce: 5.09 | step: 43.54
- 10%|█         | 599/5800 [1:37:33<9:59:12,  6.91s/it]                                                       {'loss': 0.0747, 'grad_norm': 5.249048709869385, 'learning_rate': 3.9439418410392935e-05, 'epoch': 5.16}
- 10%|█         | 599/5800 [1:37:33<9:59:12,  6.91s/it]score1 tensor([[0.5078],
-        [0.6445],
-        [0.6602],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5547, 0.6328, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:10,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.37
-[2025-01-25 10:24:10,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.33 | bwd_microstep: 4623.15 | bwd_inner_microstep: 4618.32 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.80
-[2025-01-25 10:24:10,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.29 | bwd: 4623.17 | bwd_inner: 4618.32 | bwd_allreduce: 4.77 | step: 42.80
- 10%|█         | 600/5800 [1:37:40<9:59:01,  6.91s/it]                                                      {'loss': 0.0454, 'grad_norm': 10.534418106079102, 'learning_rate': 3.943678974198458e-05, 'epoch': 5.17}
- 10%|█         | 600/5800 [1:37:40<9:59:01,  6.91s/it]score1 tensor([[0.6289],
-        [0.6211],
-        [0.4395],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6016, 0.5156, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:17,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 10:24:17,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.76 | bwd_microstep: 4614.57 | bwd_inner_microstep: 4610.06 | bwd_allreduce_microstep: 4.43 | step_microstep: 40.92
-[2025-01-25 10:24:17,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.72 | bwd: 4614.59 | bwd_inner: 4610.06 | bwd_allreduce: 4.46 | step: 40.92
- 10%|█         | 601/5800 [1:37:47<9:58:28,  6.91s/it]                                                      {'loss': 0.061, 'grad_norm': 1.9021075963974, 'learning_rate': 3.9434155012847924e-05, 'epoch': 5.18}
- 10%|█         | 601/5800 [1:37:47<9:58:28,  6.91s/it]score1 tensor([[0.4355],
-        [0.4375],
-        [0.4746],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4473, 0.4941, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:24,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 10:24:24,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.23 | bwd_microstep: 4621.19 | bwd_inner_microstep: 4615.95 | bwd_allreduce_microstep: 5.14 | step_microstep: 42.73
-[2025-01-25 10:24:24,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.20 | bwd: 4621.21 | bwd_inner: 4615.95 | bwd_allreduce: 5.19 | step: 42.74
- 10%|█         | 602/5800 [1:37:54<9:58:07,  6.90s/it]                                                      {'loss': 0.0312, 'grad_norm': 9.103344917297363, 'learning_rate': 3.943151422380453e-05, 'epoch': 5.19}
- 10%|█         | 602/5800 [1:37:54<9:58:07,  6.90s/it]score1 tensor([[0.4941],
-        [0.4473],
-        [0.5000],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4180, 0.5703, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:31,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 10:24:31,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.55 | bwd_microstep: 4619.29 | bwd_inner_microstep: 4614.80 | bwd_allreduce_microstep: 4.42 | step_microstep: 41.51
-[2025-01-25 10:24:31,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.51 | bwd: 4619.31 | bwd_inner: 4614.80 | bwd_allreduce: 4.45 | step: 41.52
- 10%|█         | 603/5800 [1:38:01<9:57:46,  6.90s/it]                                                      {'loss': 0.0557, 'grad_norm': 0.6154565215110779, 'learning_rate': 3.942886737567783e-05, 'epoch': 5.2}
- 10%|█         | 603/5800 [1:38:01<9:57:46,  6.90s/it]score1 tensor([[0.5430],
-        [0.5039],
-        [0.4316],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5625, 0.3750, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:38,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.37
-[2025-01-25 10:24:38,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.18 | bwd_microstep: 4624.65 | bwd_inner_microstep: 4619.65 | bwd_allreduce_microstep: 4.91 | step_microstep: 41.81
-[2025-01-25 10:24:38,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.16 | bwd: 4624.68 | bwd_inner: 4619.65 | bwd_allreduce: 4.95 | step: 41.81
- 10%|█         | 604/5800 [1:38:08<9:57:40,  6.90s/it]                                                      {'loss': 0.0674, 'grad_norm': 4.761830806732178, 'learning_rate': 3.942621446929316e-05, 'epoch': 5.21}
- 10%|█         | 604/5800 [1:38:08<9:57:40,  6.90s/it]score1 tensor([[0.6445],
-        [0.4375],
-        [0.5039],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.5039, 0.4609, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:45,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 10:24:45,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.95 | bwd_microstep: 4622.31 | bwd_inner_microstep: 4617.96 | bwd_allreduce_microstep: 4.27 | step_microstep: 42.22
-[2025-01-25 10:24:45,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.92 | bwd: 4622.33 | bwd_inner: 4617.96 | bwd_allreduce: 4.31 | step: 42.22
- 10%|█         | 605/5800 [1:38:15<9:57:29,  6.90s/it]                                                      {'loss': 0.0352, 'grad_norm': 4.9189558029174805, 'learning_rate': 3.942355550547775e-05, 'epoch': 5.22}
- 10%|█         | 605/5800 [1:38:15<9:57:29,  6.90s/it]score1 tensor([[0.4863],
-        [0.5039],
-        [0.4922],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5391, 0.4180, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:52,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 10:24:52,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.21 | bwd_microstep: 4628.51 | bwd_inner_microstep: 4623.82 | bwd_allreduce_microstep: 4.61 | step_microstep: 41.15
-[2025-01-25 10:24:52,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4628.53 | bwd_inner: 4623.82 | bwd_allreduce: 4.65 | step: 41.16
- 10%|█         | 606/5800 [1:38:22<9:57:30,  6.90s/it]                                                      {'loss': 0.0352, 'grad_norm': 4.582035541534424, 'learning_rate': 3.94208904850607e-05, 'epoch': 5.22}
- 10%|█         | 606/5800 [1:38:22<9:57:30,  6.90s/it]score1 tensor([[0.4570],
-        [0.6797],
-        [0.4844],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.6133, 0.4141, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:24:59,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 10:24:59,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.75 | bwd_microstep: 4638.15 | bwd_inner_microstep: 4633.66 | bwd_allreduce_microstep: 4.41 | step_microstep: 42.33
-[2025-01-25 10:24:59,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.72 | bwd: 4638.19 | bwd_inner: 4633.66 | bwd_allreduce: 4.45 | step: 42.34
- 10%|█         | 607/5800 [1:38:29<9:57:39,  6.91s/it]                                                      {'loss': 0.0625, 'grad_norm': 9.564851760864258, 'learning_rate': 3.9418219408873016e-05, 'epoch': 5.23}
- 10%|█         | 607/5800 [1:38:29<9:57:39,  6.91s/it]score1 tensor([[0.5312],
-        [0.5469],
-        [0.4902],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5469, 0.4551, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:06,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 10:25:06,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.89 | bwd_microstep: 4585.89 | bwd_inner_microstep: 4580.90 | bwd_allreduce_microstep: 4.92 | step_microstep: 41.48
-[2025-01-25 10:25:06,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.85 | bwd: 4585.91 | bwd_inner: 4580.89 | bwd_allreduce: 4.95 | step: 41.48
- 10%|█         | 608/5800 [1:38:36<9:56:24,  6.89s/it]                                                      {'loss': 0.0425, 'grad_norm': 2.325637102127075, 'learning_rate': 3.941554227774758e-05, 'epoch': 5.24}
- 10%|█         | 608/5800 [1:38:36<9:56:24,  6.89s/it]score1 tensor([[0.5312],
-        [0.5859],
-        [0.4629],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6094, 0.4043, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:12,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 10:25:12,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.75 | bwd_microstep: 4649.30 | bwd_inner_microstep: 4644.56 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.17
-[2025-01-25 10:25:12,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.72 | bwd: 4649.32 | bwd_inner: 4644.56 | bwd_allreduce: 4.70 | step: 42.18
- 10%|█         | 609/5800 [1:38:42<9:57:10,  6.90s/it]                                                      {'loss': 0.0352, 'grad_norm': 5.11940336227417, 'learning_rate': 3.9412859092519184e-05, 'epoch': 5.25}
- 10%|█         | 609/5800 [1:38:42<9:57:10,  6.90s/it]score1 tensor([[0.4805],
-        [0.4336],
-        [0.4023],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4473, 0.4004, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:19,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 10:25:19,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.23 | bwd_microstep: 4649.71 | bwd_inner_microstep: 4644.98 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.82
-[2025-01-25 10:25:19,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.19 | bwd: 4649.73 | bwd_inner: 4644.98 | bwd_allreduce: 4.68 | step: 42.81
- 11%|█         | 610/5800 [1:38:49<9:57:50,  6.91s/it]                                                      {'loss': 0.0103, 'grad_norm': 4.696634292602539, 'learning_rate': 3.941016985402448e-05, 'epoch': 5.26}
- 11%|█         | 610/5800 [1:38:49<9:57:50,  6.91s/it]score1 tensor([[0.5469],
-        [0.5742],
-        [0.5156],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6523, 0.4355, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:26,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 10:25:26,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.24 | bwd_microstep: 4591.22 | bwd_inner_microstep: 4585.92 | bwd_allreduce_microstep: 5.21 | step_microstep: 44.56
-[2025-01-25 10:25:26,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.19 | bwd: 4591.25 | bwd_inner: 4585.92 | bwd_allreduce: 5.26 | step: 44.58
- 11%|█         | 611/5800 [1:38:56<9:56:42,  6.90s/it]                                                      {'loss': 0.043, 'grad_norm': 2.1751208305358887, 'learning_rate': 3.9407474563102015e-05, 'epoch': 5.27}
- 11%|█         | 611/5800 [1:38:56<9:56:42,  6.90s/it]score1 tensor([[0.5273],
-        [0.4863],
-        [0.4492],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.5000, 0.4805, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0811, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:33,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 10:25:33,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.48 | bwd_microstep: 4650.83 | bwd_inner_microstep: 4645.88 | bwd_allreduce_microstep: 4.76 | step_microstep: 45.89
-[2025-01-25 10:25:33,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.44 | bwd: 4650.85 | bwd_inner: 4645.88 | bwd_allreduce: 4.88 | step: 45.90
- 11%|█         | 612/5800 [1:39:03<9:57:36,  6.91s/it]                                                      {'loss': 0.0811, 'grad_norm': 4.4580817222595215, 'learning_rate': 3.940477322059223e-05, 'epoch': 5.28}
- 11%|█         | 612/5800 [1:39:03<9:57:36,  6.91s/it]score1 tensor([[0.4805],
-        [0.4434],
-        [0.5078],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.3789, 0.5156, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:40,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:25:40,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4642.23 | bwd_inner_microstep: 4637.30 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.41
-[2025-01-25 10:25:40,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.31 | bwd: 4642.25 | bwd_inner: 4637.30 | bwd_allreduce: 4.88 | step: 42.41
- 11%|█         | 613/5800 [1:39:10<9:57:57,  6.92s/it]                                                      {'loss': 0.0347, 'grad_norm': 4.576569557189941, 'learning_rate': 3.940206582733746e-05, 'epoch': 5.28}
- 11%|█         | 613/5800 [1:39:10<9:57:57,  6.92s/it]score1 tensor([[0.4922],
-        [0.4590],
-        [0.5195],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5078, 0.4863, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:47,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 10:25:47,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.32 | bwd_microstep: 4651.08 | bwd_inner_microstep: 4646.05 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.01
-[2025-01-25 10:25:47,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.29 | bwd: 4651.11 | bwd_inner: 4646.05 | bwd_allreduce: 4.98 | step: 43.02
- 11%|█         | 614/5800 [1:39:17<9:58:21,  6.92s/it]                                                      {'loss': 0.0303, 'grad_norm': 4.912810802459717, 'learning_rate': 3.93993523841819e-05, 'epoch': 5.29}
- 11%|█         | 614/5800 [1:39:17<9:58:21,  6.92s/it]score1 tensor([[0.5039],
-        [0.4902],
-        [0.5000],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3945, 0.5430, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:25:54,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 10:25:54,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.69 | bwd_microstep: 4605.07 | bwd_inner_microstep: 4600.15 | bwd_allreduce_microstep: 4.83 | step_microstep: 41.35
-[2025-01-25 10:25:54,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.65 | bwd: 4605.10 | bwd_inner: 4600.15 | bwd_allreduce: 4.88 | step: 41.36
- 11%|█         | 615/5800 [1:39:24<9:57:25,  6.91s/it]                                                      {'loss': 0.0503, 'grad_norm': 2.435187578201294, 'learning_rate': 3.9396632891971666e-05, 'epoch': 5.3}
- 11%|█         | 615/5800 [1:39:24<9:57:25,  6.91s/it]score1 tensor([[0.5898],
-        [0.5547],
-        [0.4824],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4785, 0.4160, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:01,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 10:26:01,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.65 | bwd_microstep: 4647.93 | bwd_inner_microstep: 4642.92 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.50
-[2025-01-25 10:26:01,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.62 | bwd: 4647.95 | bwd_inner: 4642.92 | bwd_allreduce: 4.95 | step: 41.51
- 11%|█         | 616/5800 [1:39:31<9:57:49,  6.92s/it]                                                      {'loss': 0.0688, 'grad_norm': 9.910919189453125, 'learning_rate': 3.939390735155473e-05, 'epoch': 5.31}
- 11%|█         | 616/5800 [1:39:31<9:57:49,  6.92s/it]score1 tensor([[0.5078],
-        [0.5469],
-        [0.4668],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5352, 0.4512, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:08,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 10:26:08,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.32 | bwd_microstep: 4646.13 | bwd_inner_microstep: 4641.09 | bwd_allreduce_microstep: 4.91 | step_microstep: 42.57
-[2025-01-25 10:26:08,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.28 | bwd: 4646.15 | bwd_inner: 4641.09 | bwd_allreduce: 4.98 | step: 42.58
- 11%|█         | 617/5800 [1:39:38<9:58:08,  6.92s/it]                                                      {'loss': 0.0195, 'grad_norm': 0.6507911086082458, 'learning_rate': 3.9391175763780974e-05, 'epoch': 5.32}
- 11%|█         | 617/5800 [1:39:38<9:58:08,  6.92s/it]score1 tensor([[0.5312],
-        [0.6758],
-        [0.5430],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.5625, 0.4473, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0854, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:15,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 10:26:15,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.21 | bwd_microstep: 4646.78 | bwd_inner_microstep: 4642.17 | bwd_allreduce_microstep: 4.53 | step_microstep: 41.89
-[2025-01-25 10:26:15,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.18 | bwd: 4646.80 | bwd_inner: 4642.17 | bwd_allreduce: 4.57 | step: 41.90
- 11%|█         | 618/5800 [1:39:45<9:58:18,  6.93s/it]                                                      {'loss': 0.0854, 'grad_norm': 9.939507484436035, 'learning_rate': 3.9388438129502143e-05, 'epoch': 5.33}
- 11%|█         | 618/5800 [1:39:45<9:58:18,  6.93s/it]score1 tensor([[0.5117],
-        [0.6367],
-        [0.4004],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5625, 0.3105, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:22,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 10:26:22,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.92 | bwd_microstep: 4643.33 | bwd_inner_microstep: 4638.23 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.80
-[2025-01-25 10:26:22,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.89 | bwd: 4643.36 | bwd_inner: 4638.23 | bwd_allreduce: 5.04 | step: 43.81
- 11%|█         | 619/5800 [1:39:52<9:58:07,  6.93s/it]                                                      {'loss': 0.0542, 'grad_norm': 0.9713221788406372, 'learning_rate': 3.9385694449571886e-05, 'epoch': 5.34}
- 11%|█         | 619/5800 [1:39:52<9:58:07,  6.93s/it]score1 tensor([[0.5508],
-        [0.5625],
-        [0.5430],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5039, 0.4863, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:29,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 10:26:29,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.93 | bwd_microstep: 4644.25 | bwd_inner_microstep: 4639.17 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.07
-[2025-01-25 10:26:29,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4644.27 | bwd_inner: 4639.17 | bwd_allreduce: 5.03 | step: 43.10
- 11%|█         | 620/5800 [1:39:59<9:58:16,  6.93s/it]                                                      {'loss': 0.0688, 'grad_norm': 10.023364067077637, 'learning_rate': 3.938294472484573e-05, 'epoch': 5.34}
- 11%|█         | 620/5800 [1:39:59<9:58:16,  6.93s/it]score1 tensor([[0.5352],
-        [0.4688],
-        [0.4941],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4414, 0.4922, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:36,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 10:26:36,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.02 | bwd_microstep: 4647.34 | bwd_inner_microstep: 4642.43 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.77
-[2025-01-25 10:26:36,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.97 | bwd: 4647.36 | bwd_inner: 4642.43 | bwd_allreduce: 4.86 | step: 42.78
- 11%|█         | 621/5800 [1:40:06<9:58:17,  6.93s/it]                                                      {'loss': 0.022, 'grad_norm': 4.701114654541016, 'learning_rate': 3.9380188956181084e-05, 'epoch': 5.35}
- 11%|█         | 621/5800 [1:40:06<9:58:17,  6.93s/it]score1 tensor([[0.5117],
-        [0.4707],
-        [0.6016],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4707, 0.6875, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:42,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.36
-[2025-01-25 10:26:42,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.30 | bwd_microstep: 4614.19 | bwd_inner_microstep: 4609.04 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.46
-[2025-01-25 10:26:42,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.24 | bwd: 4614.21 | bwd_inner: 4609.04 | bwd_allreduce: 5.10 | step: 43.46
- 11%|█         | 622/5800 [1:40:12<9:57:22,  6.92s/it]                                                      {'loss': 0.0518, 'grad_norm': 7.4154887199401855, 'learning_rate': 3.937742714443725e-05, 'epoch': 5.36}
- 11%|█         | 622/5800 [1:40:12<9:57:22,  6.92s/it]score1 tensor([[0.3965],
-        [0.3379],
-        [0.4531],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.4473, 0.5195, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:49,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 10:26:49,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.80 | bwd_microstep: 4648.04 | bwd_inner_microstep: 4640.95 | bwd_allreduce_microstep: 6.99 | step_microstep: 42.25
-[2025-01-25 10:26:49,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.76 | bwd: 4648.06 | bwd_inner: 4640.95 | bwd_allreduce: 7.04 | step: 42.25
- 11%|█         | 623/5800 [1:40:19<9:57:36,  6.93s/it]                                                      {'loss': 0.0698, 'grad_norm': 8.9642333984375, 'learning_rate': 3.93746592904754e-05, 'epoch': 5.37}
- 11%|█         | 623/5800 [1:40:19<9:57:36,  6.93s/it]score1 tensor([[0.4727],
-        [0.3750],
-        [0.4102],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4336, 0.4883, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:26:56,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 10:26:56,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.13 | bwd_microstep: 4642.98 | bwd_inner_microstep: 4638.13 | bwd_allreduce_microstep: 4.78 | step_microstep: 40.89
-[2025-01-25 10:26:56,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.09 | bwd: 4643.01 | bwd_inner: 4638.13 | bwd_allreduce: 4.81 | step: 40.90
- 11%|█         | 624/5800 [1:40:26<9:57:37,  6.93s/it]                                                      {'loss': 0.0654, 'grad_norm': 9.117073059082031, 'learning_rate': 3.9371885395158604e-05, 'epoch': 5.38}
- 11%|█         | 624/5800 [1:40:26<9:57:37,  6.93s/it]score1 tensor([[0.4824],
-        [0.4082],
-        [0.4590],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4453, 0.4844, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:03,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.35 | optimizer_step: 4.36
-[2025-01-25 10:27:03,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.52 | bwd_microstep: 4649.44 | bwd_inner_microstep: 4644.50 | bwd_allreduce_microstep: 4.83 | step_microstep: 46.56
-[2025-01-25 10:27:03,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.49 | bwd: 4649.46 | bwd_inner: 4644.50 | bwd_allreduce: 4.88 | step: 46.57
- 11%|█         | 625/5800 [1:40:33<9:57:44,  6.93s/it]                                                      {'loss': 0.0371, 'grad_norm': 4.462512969970703, 'learning_rate': 3.936910545935182e-05, 'epoch': 5.39}
- 11%|█         | 625/5800 [1:40:33<9:57:44,  6.93s/it]score1 tensor([[0.4082],
-        [0.4707],
-        [0.5430],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4922, 0.5742, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:10,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 10:27:10,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.26 | bwd_microstep: 4645.77 | bwd_inner_microstep: 4637.46 | bwd_allreduce_microstep: 8.20 | step_microstep: 44.46
-[2025-01-25 10:27:10,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4645.80 | bwd_inner: 4637.46 | bwd_allreduce: 8.26 | step: 44.47
- 11%|█         | 626/5800 [1:40:40<9:57:37,  6.93s/it]                                                      {'loss': 0.0356, 'grad_norm': 4.721743106842041, 'learning_rate': 3.936631948392186e-05, 'epoch': 5.4}
- 11%|█         | 626/5800 [1:40:40<9:57:37,  6.93s/it]score1 tensor([[0.5781],
-        [0.5234],
-        [0.5430],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4844, 0.5469, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:17,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.37
-[2025-01-25 10:27:17,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4640.61 | bwd_inner_microstep: 4634.92 | bwd_allreduce_microstep: 5.59 | step_microstep: 55.72
-[2025-01-25 10:27:17,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.66 | bwd: 4640.64 | bwd_inner: 4634.92 | bwd_allreduce: 5.63 | step: 55.74
- 11%|█         | 627/5800 [1:40:47<9:57:44,  6.93s/it]                                                      {'loss': 0.0488, 'grad_norm': 5.298247814178467, 'learning_rate': 3.936352746973745e-05, 'epoch': 5.41}
- 11%|█         | 627/5800 [1:40:47<9:57:44,  6.93s/it]score1 tensor([[0.6328],
-        [0.5938],
-        [0.5664],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4844, 0.4180, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:24,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 10:27:24,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.31 | bwd_microstep: 4649.91 | bwd_inner_microstep: 4645.05 | bwd_allreduce_microstep: 4.76 | step_microstep: 52.15
-[2025-01-25 10:27:24,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.28 | bwd: 4649.94 | bwd_inner: 4645.05 | bwd_allreduce: 4.81 | step: 52.19
- 11%|█         | 628/5800 [1:40:54<9:58:53,  6.95s/it]                                                      {'loss': 0.0918, 'grad_norm': 10.249286651611328, 'learning_rate': 3.936072941766919e-05, 'epoch': 5.41}
- 11%|█         | 628/5800 [1:40:54<9:58:53,  6.95s/it]score1 tensor([[0.5469],
-        [0.6875],
-        [0.6133],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.6367, 0.4551, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:31,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 10:27:31,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.14 | bwd_microstep: 4650.77 | bwd_inner_microstep: 4643.08 | bwd_allreduce_microstep: 7.46 | step_microstep: 62.23
-[2025-01-25 10:27:31,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.11 | bwd: 4650.84 | bwd_inner: 4643.08 | bwd_allreduce: 7.58 | step: 62.23
- 11%|█         | 629/5800 [1:41:01<9:59:03,  6.95s/it]                                                      {'loss': 0.1289, 'grad_norm': 10.451498985290527, 'learning_rate': 3.9357925328589566e-05, 'epoch': 5.42}
- 11%|█         | 629/5800 [1:41:01<9:59:03,  6.95s/it]score1 tensor([[0.6680],
-        [0.6875],
-        [0.6602],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5977, 0.5547, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:38,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 10:27:38,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.74 | bwd_microstep: 4648.02 | bwd_inner_microstep: 4643.15 | bwd_allreduce_microstep: 4.79 | step_microstep: 41.03
-[2025-01-25 10:27:38,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.66 | bwd: 4648.04 | bwd_inner: 4643.15 | bwd_allreduce: 4.83 | step: 41.03
- 11%|█         | 630/5800 [1:41:08<9:58:43,  6.95s/it]                                                      {'loss': 0.1309, 'grad_norm': 10.8652925491333, 'learning_rate': 3.935511520337293e-05, 'epoch': 5.43}
- 11%|█         | 630/5800 [1:41:08<9:58:43,  6.95s/it]score1 tensor([[0.6445],
-        [0.6406],
-        [0.7344],
-        [0.6914]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4844, 0.5938, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1377, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:45,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 10:27:45,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4651.02 | bwd_inner_microstep: 4645.90 | bwd_allreduce_microstep: 5.03 | step_microstep: 41.65
-[2025-01-25 10:27:45,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.42 | bwd: 4651.04 | bwd_inner: 4645.90 | bwd_allreduce: 5.07 | step: 41.66
- 11%|█         | 631/5800 [1:41:15<9:58:12,  6.94s/it]                                                      {'loss': 0.1377, 'grad_norm': 10.856188774108887, 'learning_rate': 3.9352299042895534e-05, 'epoch': 5.44}
- 11%|█         | 631/5800 [1:41:15<9:58:12,  6.94s/it]score1 tensor([[0.6797],
-        [0.6992],
-        [0.4863],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6289, 0.3652, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1279, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:52,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 10:27:52,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.09 | bwd_microstep: 4638.78 | bwd_inner_microstep: 4633.64 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.17
-[2025-01-25 10:27:52,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.06 | bwd: 4638.81 | bwd_inner: 4633.64 | bwd_allreduce: 5.10 | step: 42.18
- 11%|█         | 632/5800 [1:41:22<9:57:42,  6.94s/it]                                                      {'loss': 0.1279, 'grad_norm': 10.509485244750977, 'learning_rate': 3.93494768480355e-05, 'epoch': 5.45}
- 11%|█         | 632/5800 [1:41:22<9:57:42,  6.94s/it]score1 tensor([[0.5859],
-        [0.6680],
-        [0.6133],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.6953, 0.5391, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0854, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:27:59,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 10:27:59,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.15 | bwd_microstep: 4645.25 | bwd_inner_microstep: 4640.42 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.03
-[2025-01-25 10:27:59,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.11 | bwd: 4645.27 | bwd_inner: 4640.42 | bwd_allreduce: 4.79 | step: 41.04
- 11%|█         | 633/5800 [1:41:29<9:57:13,  6.94s/it]                                                      {'loss': 0.0854, 'grad_norm': 4.788797855377197, 'learning_rate': 3.934664861967285e-05, 'epoch': 5.46}
- 11%|█         | 633/5800 [1:41:29<9:57:13,  6.94s/it]score1 tensor([[0.5469],
-        [0.5898],
-        [0.5703],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5156, 0.3789, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1040, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:06,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 10:28:06,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.02 | bwd_microstep: 4646.34 | bwd_inner_microstep: 4641.42 | bwd_allreduce_microstep: 4.82 | step_microstep: 41.83
-[2025-01-25 10:28:06,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.99 | bwd: 4646.37 | bwd_inner: 4641.42 | bwd_allreduce: 4.87 | step: 41.84
- 11%|█         | 634/5800 [1:41:36<9:56:52,  6.93s/it]                                                      {'loss': 0.104, 'grad_norm': 10.002917289733887, 'learning_rate': 3.934381435868946e-05, 'epoch': 5.47}
- 11%|█         | 634/5800 [1:41:36<9:56:52,  6.93s/it]score1 tensor([[0.5117],
-        [0.4727],
-        [0.5234],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4316, 0.5664, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:13,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 10:28:13,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.36 | bwd_microstep: 4633.89 | bwd_inner_microstep: 4629.31 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.36
-[2025-01-25 10:28:13,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.33 | bwd: 4633.91 | bwd_inner: 4629.31 | bwd_allreduce: 4.53 | step: 42.37
- 11%|█         | 635/5800 [1:41:43<9:56:08,  6.93s/it]                                                      {'loss': 0.063, 'grad_norm': 5.198956489562988, 'learning_rate': 3.934097406596912e-05, 'epoch': 5.47}
- 11%|█         | 635/5800 [1:41:43<9:56:08,  6.93s/it]score1 tensor([[0.4629],
-        [0.4824],
-        [0.4648],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5781, 0.4824, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:20,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 10:28:20,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.13 | bwd_microstep: 4637.90 | bwd_inner_microstep: 4633.38 | bwd_allreduce_microstep: 4.43 | step_microstep: 41.71
-[2025-01-25 10:28:20,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4637.93 | bwd_inner: 4633.38 | bwd_allreduce: 4.47 | step: 41.72
- 11%|█         | 636/5800 [1:41:50<9:55:55,  6.92s/it]                                                      {'loss': 0.0508, 'grad_norm': 4.764087200164795, 'learning_rate': 3.933812774239746e-05, 'epoch': 5.48}
- 11%|█         | 636/5800 [1:41:50<9:55:55,  6.92s/it]score1 tensor([[0.3418],
-        [0.4980],
-        [0.4590],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.6055, 0.5273, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:26,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 10:28:26,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4648.25 | bwd_inner_microstep: 4643.71 | bwd_allreduce_microstep: 4.46 | step_microstep: 42.33
-[2025-01-25 10:28:26,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.16 | bwd: 4648.28 | bwd_inner: 4643.72 | bwd_allreduce: 4.50 | step: 42.34
- 11%|█         | 637/5800 [1:41:56<9:55:49,  6.92s/it]                                                      {'loss': 0.0542, 'grad_norm': 9.182003021240234, 'learning_rate': 3.9335275388862026e-05, 'epoch': 5.49}
- 11%|█         | 637/5800 [1:41:56<9:55:49,  6.92s/it]score1 tensor([[0.4199],
-        [0.3926],
-        [0.4902],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.3613, 0.5508, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:33,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:28:33,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.61 | bwd_microstep: 4645.70 | bwd_inner_microstep: 4640.92 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.78
-[2025-01-25 10:28:33,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.58 | bwd: 4645.73 | bwd_inner: 4640.92 | bwd_allreduce: 4.73 | step: 41.79
- 11%|█         | 638/5800 [1:42:03<9:55:42,  6.92s/it]                                                      {'loss': 0.0376, 'grad_norm': 0.7003293633460999, 'learning_rate': 3.933241700625223e-05, 'epoch': 5.5}
- 11%|█         | 638/5800 [1:42:03<9:55:42,  6.92s/it]score1 tensor([[0.4609],
-        [0.4922],
-        [0.4824],
-        [0.3516]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4863, 0.6289, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0786, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:40,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 10:28:40,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.66 | bwd_microstep: 4640.08 | bwd_inner_microstep: 4635.45 | bwd_allreduce_microstep: 4.52 | step_microstep: 42.57
-[2025-01-25 10:28:40,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4640.11 | bwd_inner: 4635.44 | bwd_allreduce: 4.58 | step: 42.58
- 11%|█         | 639/5800 [1:42:10<9:55:26,  6.92s/it]                                                      {'loss': 0.0786, 'grad_norm': 1.407281517982483, 'learning_rate': 3.932955259545936e-05, 'epoch': 5.51}
- 11%|█         | 639/5800 [1:42:10<9:55:26,  6.92s/it]score1 tensor([[0.4707],
-        [0.5430],
-        [0.4121],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6445, 0.4941, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:47,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 10:28:47,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.15 | bwd_microstep: 4639.01 | bwd_inner_microstep: 4634.24 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.28
-[2025-01-25 10:28:47,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4639.04 | bwd_inner: 4634.24 | bwd_allreduce: 4.72 | step: 41.28
- 11%|█         | 640/5800 [1:42:17<9:55:24,  6.92s/it]                                                      {'loss': 0.0581, 'grad_norm': 9.634444236755371, 'learning_rate': 3.93266821573766e-05, 'epoch': 5.52}
- 11%|█         | 640/5800 [1:42:17<9:55:24,  6.92s/it]score1 tensor([[0.4551],
-        [0.4980],
-        [0.4609],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5781, 0.3750, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:28:54,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 10:28:54,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4639.57 | bwd_inner_microstep: 4635.05 | bwd_allreduce_microstep: 4.41 | step_microstep: 45.22
-[2025-01-25 10:28:54,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4639.60 | bwd_inner: 4635.06 | bwd_allreduce: 4.46 | step: 45.23
- 11%|█         | 641/5800 [1:42:24<9:55:13,  6.92s/it]                                                      {'loss': 0.0615, 'grad_norm': 0.5230026245117188, 'learning_rate': 3.9323805692899e-05, 'epoch': 5.53}
- 11%|█         | 641/5800 [1:42:24<9:55:13,  6.92s/it]score1 tensor([[0.5352],
-        [0.4863],
-        [0.4316],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4922, 0.4062, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:01,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.93 | optimizer_step: 4.36
-[2025-01-25 10:29:01,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.94 | bwd_microstep: 4638.35 | bwd_inner_microstep: 4633.60 | bwd_allreduce_microstep: 4.64 | step_microstep: 47.94
-[2025-01-25 10:29:01,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4638.38 | bwd_inner: 4633.60 | bwd_allreduce: 4.69 | step: 47.95
- 11%|█         | 642/5800 [1:42:31<9:55:13,  6.92s/it]                                                      {'loss': 0.0352, 'grad_norm': 0.8693122267723083, 'learning_rate': 3.932092320292348e-05, 'epoch': 5.53}
- 11%|█         | 642/5800 [1:42:31<9:55:13,  6.92s/it]score1 tensor([[0.6016],
-        [0.5586],
-        [0.5625],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5625, 0.5703, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:08,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 10:29:08,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.62 | bwd_microstep: 4642.48 | bwd_inner_microstep: 4637.86 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.97
-[2025-01-25 10:29:08,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.58 | bwd: 4642.50 | bwd_inner: 4637.86 | bwd_allreduce: 4.57 | step: 42.00
- 11%|█         | 643/5800 [1:42:38<9:55:00,  6.92s/it]                                                      {'loss': 0.0181, 'grad_norm': 5.4541730880737305, 'learning_rate': 3.9318034688348865e-05, 'epoch': 5.54}
- 11%|█         | 643/5800 [1:42:38<9:55:00,  6.92s/it]score1 tensor([[0.5547],
-        [0.6250],
-        [0.4883],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6094, 0.4512, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:15,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 10:29:15,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.37 | bwd_microstep: 4634.55 | bwd_inner_microstep: 4629.85 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.38
-[2025-01-25 10:29:15,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.34 | bwd: 4634.57 | bwd_inner: 4629.85 | bwd_allreduce: 4.66 | step: 42.39
- 11%|█         | 644/5800 [1:42:45<9:54:33,  6.92s/it]                                                      {'loss': 0.0278, 'grad_norm': 5.349878787994385, 'learning_rate': 3.931514015007583e-05, 'epoch': 5.55}
- 11%|█         | 644/5800 [1:42:45<9:54:33,  6.92s/it]score1 tensor([[0.5469],
-        [0.5430],
-        [0.6484],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5234, 0.6406, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:22,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 10:29:22,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.95 | bwd_microstep: 4642.52 | bwd_inner_microstep: 4638.10 | bwd_allreduce_microstep: 4.34 | step_microstep: 42.19
-[2025-01-25 10:29:22,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.91 | bwd: 4642.54 | bwd_inner: 4638.10 | bwd_allreduce: 4.38 | step: 42.20
- 11%|█         | 645/5800 [1:42:52<9:54:25,  6.92s/it]                                                      {'loss': 0.0254, 'grad_norm': 10.504392623901367, 'learning_rate': 3.931223958900695e-05, 'epoch': 5.56}
- 11%|█         | 645/5800 [1:42:52<9:54:25,  6.92s/it]score1 tensor([[0.6250],
-        [0.5195],
-        [0.6406],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.3398, 0.6250, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:29,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 10:29:29,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.75 | bwd_microstep: 4644.59 | bwd_inner_microstep: 4639.80 | bwd_allreduce_microstep: 4.69 | step_microstep: 45.87
-[2025-01-25 10:29:29,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.72 | bwd: 4644.61 | bwd_inner: 4639.80 | bwd_allreduce: 4.73 | step: 45.88
- 11%|█         | 646/5800 [1:42:59<9:54:25,  6.92s/it]                                                      {'loss': 0.0684, 'grad_norm': 5.096596717834473, 'learning_rate': 3.9309333006046674e-05, 'epoch': 5.57}
- 11%|█         | 646/5800 [1:42:59<9:54:25,  6.92s/it]score1 tensor([[0.6055],
-        [0.5898],
-        [0.4883],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4375, 0.4473, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0640, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:36,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 10:29:36,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4650.63 | bwd_inner_microstep: 4646.09 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.53
-[2025-01-25 10:29:36,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.21 | bwd: 4650.65 | bwd_inner: 4646.09 | bwd_allreduce: 4.49 | step: 42.53
- 11%|█         | 647/5800 [1:43:06<9:54:27,  6.92s/it]                                                      {'loss': 0.064, 'grad_norm': 5.040713310241699, 'learning_rate': 3.930642040210132e-05, 'epoch': 5.58}
- 11%|█         | 647/5800 [1:43:06<9:54:27,  6.92s/it]score1 tensor([[0.3438],
-        [0.4785],
-        [0.5000],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4023, 0.5547, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:43,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 10:29:43,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.63 | bwd_microstep: 4640.59 | bwd_inner_microstep: 4636.16 | bwd_allreduce_microstep: 4.35 | step_microstep: 41.80
-[2025-01-25 10:29:43,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.60 | bwd: 4640.61 | bwd_inner: 4636.16 | bwd_allreduce: 4.38 | step: 41.80
- 11%|█         | 648/5800 [1:43:13<9:54:15,  6.92s/it]                                                      {'loss': 0.0474, 'grad_norm': 1.0869550704956055, 'learning_rate': 3.930350177807909e-05, 'epoch': 5.59}
- 11%|█         | 648/5800 [1:43:13<9:54:15,  6.92s/it]score1 tensor([[0.5273],
-        [0.4570],
-        [0.4238],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5508, 0.5977, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0947, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:50,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:29:50,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.50 | bwd_microstep: 4635.40 | bwd_inner_microstep: 4630.78 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.55
-[2025-01-25 10:29:50,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.45 | bwd: 4635.43 | bwd_inner: 4630.78 | bwd_allreduce: 4.58 | step: 41.56
- 11%|█         | 649/5800 [1:43:20<9:53:57,  6.92s/it]                                                      {'loss': 0.0947, 'grad_norm': 9.52290153503418, 'learning_rate': 3.9300577134890056e-05, 'epoch': 5.59}
- 11%|█         | 649/5800 [1:43:20<9:53:57,  6.92s/it]score1 tensor([[0.5078],
-        [0.5469],
-        [0.5391],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6797, 0.6445, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0908, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:29:56,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 10:29:56,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.41 | bwd_microstep: 4634.80 | bwd_inner_microstep: 4629.77 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.37
-[2025-01-25 10:29:56,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.37 | bwd: 4634.83 | bwd_inner: 4629.77 | bwd_allreduce: 4.98 | step: 42.38
- 11%|█         | 650/5800 [1:43:26<9:53:42,  6.92s/it]                                                      {'loss': 0.0908, 'grad_norm': 5.083731651306152, 'learning_rate': 3.929764647344618e-05, 'epoch': 5.6}
- 11%|█         | 650/5800 [1:43:26<9:53:42,  6.92s/it]score1 tensor([[0.4336],
-        [0.4395],
-        [0.5664],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5781, 0.6445, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0908, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:03,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 10:30:03,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.60 | bwd_microstep: 4644.26 | bwd_inner_microstep: 4639.87 | bwd_allreduce_microstep: 4.30 | step_microstep: 45.69
-[2025-01-25 10:30:03,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.55 | bwd: 4644.29 | bwd_inner: 4639.87 | bwd_allreduce: 4.34 | step: 45.69
- 11%|█         | 651/5800 [1:43:33<9:53:44,  6.92s/it]                                                      {'loss': 0.0908, 'grad_norm': 9.652783393859863, 'learning_rate': 3.929470979466129e-05, 'epoch': 5.61}
- 11%|█         | 651/5800 [1:43:33<9:53:44,  6.92s/it]score1 tensor([[0.5000],
-        [0.5820],
-        [0.3926],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6797, 0.4707, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:10,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 10:30:10,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4585.72 | bwd_inner_microstep: 4581.02 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.65
-[2025-01-25 10:30:10,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4585.74 | bwd_inner: 4581.02 | bwd_allreduce: 4.66 | step: 42.68
- 11%|█         | 652/5800 [1:43:40<9:52:08,  6.90s/it]                                                      {'loss': 0.0615, 'grad_norm': 7.15111780166626, 'learning_rate': 3.92917670994511e-05, 'epoch': 5.62}
- 11%|█         | 652/5800 [1:43:40<9:52:08,  6.90s/it]score1 tensor([[0.4961],
-        [0.4688],
-        [0.5352],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4570, 0.4727, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:17,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:30:17,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4648.97 | bwd_inner_microstep: 4644.05 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.05
-[2025-01-25 10:30:17,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4648.99 | bwd_inner: 4644.05 | bwd_allreduce: 4.87 | step: 42.06
- 11%|█▏        | 653/5800 [1:43:47<9:52:56,  6.91s/it]                                                      {'loss': 0.062, 'grad_norm': 1.2391791343688965, 'learning_rate': 3.928881838873318e-05, 'epoch': 5.63}
- 11%|█▏        | 653/5800 [1:43:47<9:52:56,  6.91s/it]score1 tensor([[0.6211],
-        [0.6250],
-        [0.5039],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5430, 0.4727, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:24,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 10:30:24,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.47 | bwd_microstep: 4643.00 | bwd_inner_microstep: 4636.57 | bwd_allreduce_microstep: 6.36 | step_microstep: 41.01
-[2025-01-25 10:30:24,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4643.03 | bwd_inner: 4636.57 | bwd_allreduce: 6.40 | step: 41.02
- 11%|█▏        | 654/5800 [1:43:54<9:52:54,  6.91s/it]                                                      {'loss': 0.0459, 'grad_norm': 5.661542892456055, 'learning_rate': 3.9285863663427e-05, 'epoch': 5.64}
- 11%|█▏        | 654/5800 [1:43:54<9:52:54,  6.91s/it]score1 tensor([[0.5273],
-        [0.5586],
-        [0.6367],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5352, 0.6016, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:31,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 10:30:31,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.22 | bwd_microstep: 4643.80 | bwd_inner_microstep: 4639.19 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.37
-[2025-01-25 10:30:31,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.19 | bwd: 4643.82 | bwd_inner: 4639.19 | bwd_allreduce: 4.56 | step: 41.38
- 11%|█▏        | 655/5800 [1:44:01<9:52:52,  6.91s/it]                                                      {'loss': 0.0532, 'grad_norm': 5.423430442810059, 'learning_rate': 3.928290292445388e-05, 'epoch': 5.65}
- 11%|█▏        | 655/5800 [1:44:01<9:52:52,  6.91s/it]score1 tensor([[0.5547],
-        [0.6172],
-        [0.4902],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6133, 0.4277, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:38,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 10:30:38,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.78 | bwd_microstep: 4632.58 | bwd_inner_microstep: 4627.72 | bwd_allreduce_microstep: 4.78 | step_microstep: 42.88
-[2025-01-25 10:30:38,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.74 | bwd: 4632.60 | bwd_inner: 4627.72 | bwd_allreduce: 4.82 | step: 42.89
- 11%|█▏        | 656/5800 [1:44:08<9:52:38,  6.91s/it]                                                      {'loss': 0.0532, 'grad_norm': 10.017351150512695, 'learning_rate': 3.927993617273705e-05, 'epoch': 5.66}
- 11%|█▏        | 656/5800 [1:44:08<9:52:38,  6.91s/it]score1 tensor([[0.4805],
-        [0.4883],
-        [0.4961],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4551, 0.5391, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:45,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 10:30:45,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4644.25 | bwd_inner_microstep: 4639.71 | bwd_allreduce_microstep: 4.47 | step_microstep: 41.76
-[2025-01-25 10:30:45,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.65 | bwd: 4644.27 | bwd_inner: 4639.71 | bwd_allreduce: 4.51 | step: 41.78
- 11%|█▏        | 657/5800 [1:44:15<9:52:45,  6.92s/it]                                                      {'loss': 0.062, 'grad_norm': 0.6583248376846313, 'learning_rate': 3.9276963409201585e-05, 'epoch': 5.66}
- 11%|█▏        | 657/5800 [1:44:15<9:52:45,  6.92s/it]score1 tensor([[0.4922],
-        [0.4473],
-        [0.4824],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4805, 0.4961, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:52,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 10:30:52,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.46 | bwd_microstep: 4636.94 | bwd_inner_microstep: 4631.79 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.08
-[2025-01-25 10:30:52,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.44 | bwd: 4636.96 | bwd_inner: 4631.79 | bwd_allreduce: 5.10 | step: 42.10
- 11%|█▏        | 658/5800 [1:44:22<9:52:38,  6.92s/it]                                                      {'loss': 0.0435, 'grad_norm': 4.755984783172607, 'learning_rate': 3.9273984634774446e-05, 'epoch': 5.67}
- 11%|█▏        | 658/5800 [1:44:22<9:52:38,  6.92s/it]score1 tensor([[0.4707],
-        [0.4531],
-        [0.4844],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3789, 0.5117, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:30:59,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 10:30:59,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4641.13 | bwd_inner_microstep: 4636.19 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.68
-[2025-01-25 10:30:59,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.14 | bwd: 4641.15 | bwd_inner: 4636.19 | bwd_allreduce: 4.88 | step: 43.68
- 11%|█▏        | 659/5800 [1:44:29<9:52:37,  6.92s/it]                                                      {'loss': 0.0322, 'grad_norm': 4.997477054595947, 'learning_rate': 3.927099985038446e-05, 'epoch': 5.68}
- 11%|█▏        | 659/5800 [1:44:29<9:52:37,  6.92s/it]score1 tensor([[0.5195],
-        [0.4883],
-        [0.4805],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4590, 0.3926, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:06,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 10:31:06,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.22 | bwd_microstep: 4632.30 | bwd_inner_microstep: 4627.54 | bwd_allreduce_microstep: 4.57 | step_microstep: 45.92
-[2025-01-25 10:31:06,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.18 | bwd: 4632.32 | bwd_inner: 4627.54 | bwd_allreduce: 4.68 | step: 45.92
- 11%|█▏        | 660/5800 [1:44:36<9:52:45,  6.92s/it]                                                      {'loss': 0.0688, 'grad_norm': 4.946372032165527, 'learning_rate': 3.926800905696235e-05, 'epoch': 5.69}
- 11%|█▏        | 660/5800 [1:44:36<9:52:45,  6.92s/it]score1 tensor([[0.5273],
-        [0.5820],
-        [0.5820],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4785, 0.6094, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:13,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 10:31:13,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.43 | bwd_microstep: 4632.89 | bwd_inner_microstep: 4628.03 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.79
-[2025-01-25 10:31:13,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.39 | bwd: 4632.92 | bwd_inner: 4628.03 | bwd_allreduce: 4.80 | step: 41.80
- 11%|█▏        | 661/5800 [1:44:42<9:52:35,  6.92s/it]                                                      {'loss': 0.0527, 'grad_norm': 1.0102322101593018, 'learning_rate': 3.926501225544067e-05, 'epoch': 5.7}
- 11%|█▏        | 661/5800 [1:44:42<9:52:35,  6.92s/it]score1 tensor([[0.4688],
-        [0.6484],
-        [0.5664],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.6562, 0.5508, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:19,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 10:31:19,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.95 | bwd_microstep: 4635.85 | bwd_inner_microstep: 4630.92 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.55
-[2025-01-25 10:31:19,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4635.88 | bwd_inner: 4630.92 | bwd_allreduce: 4.88 | step: 42.56
- 11%|█▏        | 662/5800 [1:44:49<9:52:28,  6.92s/it]                                                      {'loss': 0.062, 'grad_norm': 4.90052604675293, 'learning_rate': 3.9262009446753903e-05, 'epoch': 5.71}
- 11%|█▏        | 662/5800 [1:44:49<9:52:28,  6.92s/it]score1 tensor([[0.5195],
-        [0.4570],
-        [0.5039],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3555, 0.3945, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:26,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 10:31:26,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.78 | bwd_microstep: 4636.71 | bwd_inner_microstep: 4632.05 | bwd_allreduce_microstep: 4.57 | step_microstep: 42.27
-[2025-01-25 10:31:26,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.74 | bwd: 4636.73 | bwd_inner: 4632.05 | bwd_allreduce: 4.61 | step: 42.28
- 11%|█▏        | 663/5800 [1:44:56<9:52:23,  6.92s/it]                                                      {'loss': 0.0581, 'grad_norm': 9.784029960632324, 'learning_rate': 3.9259000631838364e-05, 'epoch': 5.72}
- 11%|█▏        | 663/5800 [1:44:56<9:52:23,  6.92s/it]score1 tensor([[0.4648],
-        [0.4785],
-        [0.4707],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4941, 0.4004, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:33,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 10:31:33,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.81 | bwd_microstep: 4640.90 | bwd_inner_microstep: 4635.88 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.85
-[2025-01-25 10:31:33,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.77 | bwd: 4640.93 | bwd_inner: 4635.88 | bwd_allreduce: 4.97 | step: 43.85
- 11%|█▏        | 664/5800 [1:45:03<9:52:10,  6.92s/it]                                                      {'loss': 0.0469, 'grad_norm': 5.03342866897583, 'learning_rate': 3.925598581163226e-05, 'epoch': 5.72}
- 11%|█▏        | 664/5800 [1:45:03<9:52:10,  6.92s/it]score1 tensor([[0.4883],
-        [0.4512],
-        [0.4766],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4648, 0.5273, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:40,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 10:31:40,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4639.83 | bwd_inner_microstep: 4635.21 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.42
-[2025-01-25 10:31:40,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.95 | bwd: 4639.86 | bwd_inner: 4635.21 | bwd_allreduce: 4.59 | step: 41.42
- 11%|█▏        | 665/5800 [1:45:10<9:52:10,  6.92s/it]                                                      {'loss': 0.0425, 'grad_norm': 9.693482398986816, 'learning_rate': 3.9252964987075656e-05, 'epoch': 5.73}
- 11%|█▏        | 665/5800 [1:45:10<9:52:10,  6.92s/it]score1 tensor([[0.4785],
-        [0.4609],
-        [0.4590],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4551, 0.4766, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:47,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 10:31:47,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.53 | bwd_microstep: 4643.79 | bwd_inner_microstep: 4639.06 | bwd_allreduce_microstep: 4.66 | step_microstep: 44.87
-[2025-01-25 10:31:47,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.49 | bwd: 4643.82 | bwd_inner: 4639.06 | bwd_allreduce: 4.70 | step: 44.88
- 11%|█▏        | 666/5800 [1:45:17<9:52:07,  6.92s/it]                                                      {'loss': 0.0562, 'grad_norm': 0.45314210653305054, 'learning_rate': 3.92499381591105e-05, 'epoch': 5.74}
- 11%|█▏        | 666/5800 [1:45:17<9:52:07,  6.92s/it]score1 tensor([[0.5234],
-        [0.4902],
-        [0.4629],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4941, 0.3906, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:31:54,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:31:54,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.72 | bwd_microstep: 4637.98 | bwd_inner_microstep: 4632.84 | bwd_allreduce_microstep: 5.03 | step_microstep: 43.84
-[2025-01-25 10:31:54,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.69 | bwd: 4638.00 | bwd_inner: 4632.84 | bwd_allreduce: 5.09 | step: 43.85
- 12%|█▏        | 667/5800 [1:45:24<9:51:53,  6.92s/it]                                                      {'loss': 0.0337, 'grad_norm': 5.328794002532959, 'learning_rate': 3.924690532868061e-05, 'epoch': 5.75}
- 12%|█▏        | 667/5800 [1:45:24<9:51:53,  6.92s/it]score1 tensor([[0.4902],
-        [0.4375],
-        [0.4824],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.3438, 0.5625, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:01,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:32:01,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4635.00 | bwd_inner_microstep: 4630.14 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.97
-[2025-01-25 10:32:01,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.91 | bwd: 4635.03 | bwd_inner: 4630.14 | bwd_allreduce: 4.82 | step: 42.98
- 12%|█▏        | 668/5800 [1:45:31<9:51:43,  6.92s/it]                                                      {'loss': 0.0596, 'grad_norm': 0.45070409774780273, 'learning_rate': 3.924386649673167e-05, 'epoch': 5.76}
- 12%|█▏        | 668/5800 [1:45:31<9:51:43,  6.92s/it]score1 tensor([[0.4961],
-        [0.4688],
-        [0.4844],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4629, 0.6055, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:08,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:32:08,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.26 | bwd_microstep: 4636.13 | bwd_inner_microstep: 4631.24 | bwd_allreduce_microstep: 4.79 | step_microstep: 48.87
-[2025-01-25 10:32:08,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.21 | bwd: 4636.15 | bwd_inner: 4631.24 | bwd_allreduce: 4.84 | step: 48.88
- 12%|█▏        | 669/5800 [1:45:38<9:51:43,  6.92s/it]                                                      {'loss': 0.0483, 'grad_norm': 4.960744857788086, 'learning_rate': 3.9240821664211256e-05, 'epoch': 5.77}
- 12%|█▏        | 669/5800 [1:45:38<9:51:43,  6.92s/it]score1 tensor([[0.5664],
-        [0.4629],
-        [0.4199],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.3340, 0.3086, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0806, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:15,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:32:15,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.60 | bwd_microstep: 4633.17 | bwd_inner_microstep: 4628.36 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.04
-[2025-01-25 10:32:15,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4633.19 | bwd_inner: 4628.36 | bwd_allreduce: 4.76 | step: 42.05
- 12%|█▏        | 670/5800 [1:45:45<9:51:41,  6.92s/it]                                                      {'loss': 0.0806, 'grad_norm': 0.9294989109039307, 'learning_rate': 3.9237770832068786e-05, 'epoch': 5.78}
- 12%|█▏        | 670/5800 [1:45:45<9:51:41,  6.92s/it]score1 tensor([[0.4980],
-        [0.4199],
-        [0.5000],
-        [0.7266]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.3867, 0.4473, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:22,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.47 | optimizer_step: 4.36
-[2025-01-25 10:32:22,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.20 | bwd_microstep: 4640.07 | bwd_inner_microstep: 4635.47 | bwd_allreduce_microstep: 4.52 | step_microstep: 54.27
-[2025-01-25 10:32:22,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.16 | bwd: 4640.09 | bwd_inner: 4635.47 | bwd_allreduce: 4.56 | step: 54.28
- 12%|█▏        | 671/5800 [1:45:52<9:51:52,  6.92s/it]                                                      {'loss': 0.0586, 'grad_norm': 10.375765800476074, 'learning_rate': 3.9234714001255564e-05, 'epoch': 5.78}
- 12%|█▏        | 671/5800 [1:45:52<9:51:52,  6.92s/it]score1 tensor([[0.6133],
-        [0.4902],
-        [0.4961],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5156, 0.5000, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:29,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.82 | optimizer_step: 4.36
-[2025-01-25 10:32:29,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.35 | bwd_microstep: 4640.75 | bwd_inner_microstep: 4635.75 | bwd_allreduce_microstep: 4.78 | step_microstep: 78.51
-[2025-01-25 10:32:29,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.32 | bwd: 4640.80 | bwd_inner: 4635.75 | bwd_allreduce: 4.90 | step: 78.55
- 12%|█▏        | 672/5800 [1:45:59<9:53:44,  6.95s/it]                                                      {'loss': 0.0244, 'grad_norm': 0.9753303527832031, 'learning_rate': 3.923165117272477e-05, 'epoch': 5.79}
- 12%|█▏        | 672/5800 [1:45:59<9:53:44,  6.95s/it]score1 tensor([[0.4551],
-        [0.5352],
-        [0.4727],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6172, 0.4902, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:36,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.57 | optimizer_step: 4.36
-[2025-01-25 10:32:36,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.81 | bwd_microstep: 4639.85 | bwd_inner_microstep: 4635.22 | bwd_allreduce_microstep: 4.54 | step_microstep: 50.53
-[2025-01-25 10:32:36,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.78 | bwd: 4639.88 | bwd_inner: 4635.22 | bwd_allreduce: 4.59 | step: 50.56
- 12%|█▏        | 673/5800 [1:46:06<9:53:38,  6.95s/it]                                                      {'loss': 0.0386, 'grad_norm': 9.909187316894531, 'learning_rate': 3.9228582347431444e-05, 'epoch': 5.8}
- 12%|█▏        | 673/5800 [1:46:06<9:53:38,  6.95s/it]score1 tensor([[0.5195],
-        [0.4922],
-        [0.4766],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5469, 0.5352, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:43,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 10:32:43,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.68 | bwd_microstep: 4633.49 | bwd_inner_microstep: 4628.80 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.04
-[2025-01-25 10:32:43,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.63 | bwd: 4633.51 | bwd_inner: 4628.80 | bwd_allreduce: 4.63 | step: 41.05
- 12%|█▏        | 674/5800 [1:46:13<9:52:35,  6.94s/it]                                                      {'loss': 0.0703, 'grad_norm': 0.7495076656341553, 'learning_rate': 3.922550752633249e-05, 'epoch': 5.81}
- 12%|█▏        | 674/5800 [1:46:13<9:52:35,  6.94s/it]score1 tensor([[0.5859],
-        [0.5703],
-        [0.4902],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4863, 0.4473, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:50,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.42 | optimizer_step: 4.45
-[2025-01-25 10:32:50,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.43 | bwd_microstep: 4639.51 | bwd_inner_microstep: 4630.40 | bwd_allreduce_microstep: 8.89 | step_microstep: 76.44
-[2025-01-25 10:32:50,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.39 | bwd: 4639.61 | bwd_inner: 4630.40 | bwd_allreduce: 9.00 | step: 76.48
- 12%|█▏        | 675/5800 [1:46:20<9:53:15,  6.95s/it]                                                      {'loss': 0.0605, 'grad_norm': 10.603419303894043, 'learning_rate': 3.92224267103867e-05, 'epoch': 5.82}
- 12%|█▏        | 675/5800 [1:46:20<9:53:15,  6.95s/it]score1 tensor([[0.5625],
-        [0.4297],
-        [0.4727],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4043, 0.4492, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:32:56,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 10:32:56,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.34 | bwd_microstep: 4639.26 | bwd_inner_microstep: 4634.34 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.22
-[2025-01-25 10:32:56,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.31 | bwd: 4639.29 | bwd_inner: 4634.34 | bwd_allreduce: 4.88 | step: 42.23
- 12%|█▏        | 676/5800 [1:46:26<9:52:25,  6.94s/it]                                                      {'loss': 0.022, 'grad_norm': 4.948378562927246, 'learning_rate': 3.921933990055472e-05, 'epoch': 5.83}
- 12%|█▏        | 676/5800 [1:46:26<9:52:25,  6.94s/it]score1 tensor([[0.5156],
-        [0.4883],
-        [0.4785],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5664, 0.4492, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:03,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 10:33:03,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.24 | bwd_microstep: 4639.20 | bwd_inner_microstep: 4633.70 | bwd_allreduce_microstep: 5.41 | step_microstep: 43.03
-[2025-01-25 10:33:03,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.20 | bwd: 4639.22 | bwd_inner: 4633.70 | bwd_allreduce: 5.45 | step: 43.03
- 12%|█▏        | 677/5800 [1:46:33<9:51:50,  6.93s/it]                                                      {'loss': 0.0356, 'grad_norm': 0.6351600289344788, 'learning_rate': 3.921624709779908e-05, 'epoch': 5.84}
- 12%|█▏        | 677/5800 [1:46:33<9:51:50,  6.93s/it]score1 tensor([[0.5273],
-        [0.4473],
-        [0.5195],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4219, 0.5391, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:10,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:33:10,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.84 | bwd_microstep: 4644.60 | bwd_inner_microstep: 4639.82 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.76
-[2025-01-25 10:33:10,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.78 | bwd: 4644.62 | bwd_inner: 4639.82 | bwd_allreduce: 4.74 | step: 43.77
- 12%|█▏        | 678/5800 [1:46:40<9:51:24,  6.93s/it]                                                      {'loss': 0.0303, 'grad_norm': 5.349234104156494, 'learning_rate': 3.921314830308416e-05, 'epoch': 5.84}
- 12%|█▏        | 678/5800 [1:46:40<9:51:24,  6.93s/it]score1 tensor([[0.4453],
-        [0.4238],
-        [0.4707],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.3984, 0.4668, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:17,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 10:33:17,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.52 | bwd_microstep: 4638.65 | bwd_inner_microstep: 4634.04 | bwd_allreduce_microstep: 4.52 | step_microstep: 42.14
-[2025-01-25 10:33:17,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.48 | bwd: 4638.67 | bwd_inner: 4634.04 | bwd_allreduce: 4.57 | step: 42.15
- 12%|█▏        | 679/5800 [1:46:47<9:50:59,  6.92s/it]                                                      {'loss': 0.0303, 'grad_norm': 0.46612057089805603, 'learning_rate': 3.9210043517376225e-05, 'epoch': 5.85}
- 12%|█▏        | 679/5800 [1:46:47<9:50:59,  6.92s/it]score1 tensor([[0.5469],
-        [0.4414],
-        [0.4766],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.3457, 0.5078, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:24,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:33:24,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.18 | bwd_microstep: 4636.91 | bwd_inner_microstep: 4632.15 | bwd_allreduce_microstep: 4.65 | step_microstep: 43.54
-[2025-01-25 10:33:24,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.14 | bwd: 4636.93 | bwd_inner: 4632.15 | bwd_allreduce: 4.71 | step: 43.54
- 12%|█▏        | 680/5800 [1:46:54<9:50:37,  6.92s/it]                                                      {'loss': 0.0645, 'grad_norm': 5.2487616539001465, 'learning_rate': 3.92069327416434e-05, 'epoch': 5.86}
- 12%|█▏        | 680/5800 [1:46:54<9:50:37,  6.92s/it]score1 tensor([[0.4766],
-        [0.4082],
-        [0.6445],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.4043, 0.1787, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1738, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:31,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 10:33:31,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.38 | bwd_microstep: 4646.69 | bwd_inner_microstep: 4641.93 | bwd_allreduce_microstep: 4.68 | step_microstep: 46.77
-[2025-01-25 10:33:31,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.34 | bwd: 4646.71 | bwd_inner: 4641.93 | bwd_allreduce: 4.72 | step: 46.78
- 12%|█▏        | 681/5800 [1:47:01<9:50:41,  6.92s/it]                                                      {'loss': 0.1738, 'grad_norm': 9.235342025756836, 'learning_rate': 3.920381597685567e-05, 'epoch': 5.87}
- 12%|█▏        | 681/5800 [1:47:01<9:50:41,  6.92s/it]score1 tensor([[0.4668],
-        [0.3887],
-        [0.4922],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4023, 0.6406, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:38,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 10:33:38,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.65 | bwd_microstep: 4645.90 | bwd_inner_microstep: 4640.92 | bwd_allreduce_microstep: 4.86 | step_microstep: 41.87
-[2025-01-25 10:33:38,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4645.93 | bwd_inner: 4640.92 | bwd_allreduce: 4.92 | step: 41.88
- 12%|█▏        | 682/5800 [1:47:08<9:50:30,  6.92s/it]                                                      {'loss': 0.0703, 'grad_norm': 4.789985179901123, 'learning_rate': 3.920069322398491e-05, 'epoch': 5.88}
- 12%|█▏        | 682/5800 [1:47:08<9:50:30,  6.92s/it]score1 tensor([[0.5078],
-        [0.5312],
-        [0.5117],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5508, 0.4434, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:45,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 10:33:45,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.78 | bwd_microstep: 4637.70 | bwd_inner_microstep: 4632.85 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.95
-[2025-01-25 10:33:45,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.75 | bwd: 4637.72 | bwd_inner: 4632.85 | bwd_allreduce: 4.80 | step: 41.96
- 12%|█▏        | 683/5800 [1:47:15<9:50:20,  6.92s/it]                                                      {'loss': 0.0278, 'grad_norm': 5.26478910446167, 'learning_rate': 3.919756448400484e-05, 'epoch': 5.89}
- 12%|█▏        | 683/5800 [1:47:15<9:50:20,  6.92s/it]score1 tensor([[0.5430],
-        [0.4082],
-        [0.5625],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3652, 0.5820, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:52,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 10:33:52,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.56 | bwd_microstep: 4638.22 | bwd_inner_microstep: 4633.17 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.34
-[2025-01-25 10:33:52,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.53 | bwd: 4638.24 | bwd_inner: 4633.17 | bwd_allreduce: 5.01 | step: 43.36
- 12%|█▏        | 684/5800 [1:47:22<9:49:59,  6.92s/it]                                                      {'loss': 0.0293, 'grad_norm': 1.0931532382965088, 'learning_rate': 3.919442975789106e-05, 'epoch': 5.9}
- 12%|█▏        | 684/5800 [1:47:22<9:49:59,  6.92s/it]score1 tensor([[0.5664],
-        [0.5508],
-        [0.4980],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4629, 0.4297, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:33:59,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 10:33:59,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4643.36 | bwd_inner_microstep: 4633.91 | bwd_allreduce_microstep: 9.33 | step_microstep: 46.11
-[2025-01-25 10:33:59,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.42 | bwd: 4643.39 | bwd_inner: 4633.91 | bwd_allreduce: 9.40 | step: 46.12
- 12%|█▏        | 685/5800 [1:47:29<9:49:59,  6.92s/it]                                                      {'loss': 0.063, 'grad_norm': 10.1300048828125, 'learning_rate': 3.919128904662102e-05, 'epoch': 5.91}
- 12%|█▏        | 685/5800 [1:47:29<9:49:59,  6.92s/it]score1 tensor([[0.5938],
-        [0.6250],
-        [0.5078],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5039, 0.4238, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0835, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:34:06,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 10:34:06,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.67 | bwd_microstep: 4642.11 | bwd_inner_microstep: 4637.30 | bwd_allreduce_microstep: 4.72 | step_microstep: 40.47
-[2025-01-25 10:34:06,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.62 | bwd: 4642.14 | bwd_inner: 4637.30 | bwd_allreduce: 4.77 | step: 40.47
- 12%|█▏        | 686/5800 [1:47:36<9:49:48,  6.92s/it]                                                      {'loss': 0.0835, 'grad_norm': 5.721269607543945, 'learning_rate': 3.918814235117406e-05, 'epoch': 5.91}
- 12%|█▏        | 686/5800 [1:47:36<9:49:48,  6.92s/it]score1 tensor([[0.4863],
-        [0.5352],
-        [0.5859],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5430, 0.4844, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:34:13,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 10:34:13,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.99 | bwd_microstep: 4643.79 | bwd_inner_microstep: 4638.98 | bwd_allreduce_microstep: 4.72 | step_microstep: 46.38
-[2025-01-25 10:34:13,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.96 | bwd: 4643.81 | bwd_inner: 4638.98 | bwd_allreduce: 4.76 | step: 46.39
- 12%|█▏        | 687/5800 [1:47:43<9:49:53,  6.92s/it]                                                      {'loss': 0.0649, 'grad_norm': 4.687314987182617, 'learning_rate': 3.918498967253138e-05, 'epoch': 5.92}
- 12%|█▏        | 687/5800 [1:47:43<9:49:53,  6.92s/it]score1 tensor([[0.4648],
-        [0.5469],
-        [0.4863],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5469, 0.4824, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:34:19,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 10:34:19,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.21 | bwd_microstep: 4579.39 | bwd_inner_microstep: 4574.69 | bwd_allreduce_microstep: 4.62 | step_microstep: 41.32
-[2025-01-25 10:34:19,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.16 | bwd: 4579.41 | bwd_inner: 4574.69 | bwd_allreduce: 4.65 | step: 41.33
- 12%|█▏        | 688/5800 [1:47:49<9:48:15,  6.90s/it]                                                      {'loss': 0.0571, 'grad_norm': 47.5813102722168, 'learning_rate': 3.918183101167603e-05, 'epoch': 5.93}
- 12%|█▏        | 688/5800 [1:47:49<9:48:15,  6.90s/it]score1 tensor([[0.5352],
-        [0.5781],
-        [0.4238],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5469, 0.3672, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:34:26,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 10:34:26,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.46 | bwd_microstep: 4646.96 | bwd_inner_microstep: 4642.30 | bwd_allreduce_microstep: 4.55 | step_microstep: 46.62
-[2025-01-25 10:34:26,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.41 | bwd: 4646.98 | bwd_inner: 4642.30 | bwd_allreduce: 4.60 | step: 46.62
- 12%|█▏        | 689/5800 [1:47:56<9:48:56,  6.91s/it]                                                      {'loss': 0.0757, 'grad_norm': 5.1464619636535645, 'learning_rate': 3.9178666369592925e-05, 'epoch': 5.94}
- 12%|█▏        | 689/5800 [1:47:56<9:48:56,  6.91s/it]score1 tensor([[0.6680],
-        [0.6641],
-        [0.5156],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5430, 0.3477, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:34:33,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 10:34:33,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.33 | bwd_microstep: 4642.51 | bwd_inner_microstep: 4637.64 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.07
-[2025-01-25 10:34:33,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.29 | bwd: 4642.54 | bwd_inner: 4637.64 | bwd_allreduce: 4.82 | step: 42.07
- 12%|█▏        | 690/5800 [1:48:03<9:48:59,  6.92s/it]                                                      {'loss': 0.0977, 'grad_norm': 1.1638013124465942, 'learning_rate': 3.9175495747268876e-05, 'epoch': 5.95}
- 12%|█▏        | 690/5800 [1:48:03<9:48:59,  6.92s/it]evaluate!
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.2471]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3281, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2188, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1875, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6211]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3281]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2305, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6367]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6172]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2383, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6680]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6992]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3535]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6523]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6641]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3652]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2520, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3984]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.7227]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3789]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.7148]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6133]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1875, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1797, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3613]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3672]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.4933904318278946
-PLCC_score: 0.48245100772090144
-KRCC_score: 0.35172049641008896
-SRCC_level: 0.4933904318278946
-PLCC_level: 0.48245100772090144
-KRCC_level: 0.35172049641008896
-score1 tensor([[0.4785],
-        [0.4629],
-        [0.3887],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4785, 0.4668, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:44:52,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 10:44:52,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.61 | bwd_microstep: 4604.50 | bwd_inner_microstep: 4599.33 | bwd_allreduce_microstep: 5.06 | step_microstep: 47.02
-[2025-01-25 10:44:52,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.50 | bwd: 4604.53 | bwd_inner: 4599.33 | bwd_allreduce: 5.12 | step: 47.03
- 12%|█▏        | 691/5800 [1:58:22<270:27:01, 190.57s/it]                                                         {'loss': 0.0435, 'grad_norm': 0.9500223398208618, 'learning_rate': 3.917231914569253e-05, 'epoch': 5.96}
- 12%|█▏        | 691/5800 [1:58:22<270:27:01, 190.57s/it]score1 tensor([[0.4316],
-        [0.4492],
-        [0.6250],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4688, 0.5078, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0923, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:44:59,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 10:44:59,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.34 | bwd_microstep: 4593.15 | bwd_inner_microstep: 4587.88 | bwd_allreduce_microstep: 5.17 | step_microstep: 46.63
-[2025-01-25 10:44:59,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.30 | bwd: 4593.18 | bwd_inner: 4587.88 | bwd_allreduce: 5.22 | step: 46.64
- 12%|█▏        | 692/5800 [1:58:29<192:12:46, 135.47s/it]                                                         {'loss': 0.0923, 'grad_norm': 1.213226079940796, 'learning_rate': 3.916913656585441e-05, 'epoch': 5.97}
- 12%|█▏        | 692/5800 [1:58:29<192:12:46, 135.47s/it]score1 tensor([[0.5352],
-        [0.4805],
-        [0.4453],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4688, 0.4219, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:45:06,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:45:06,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.06 | bwd_microstep: 4591.62 | bwd_inner_microstep: 4585.93 | bwd_allreduce_microstep: 5.59 | step_microstep: 47.54
-[2025-01-25 10:45:06,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.01 | bwd: 4591.64 | bwd_inner: 4585.93 | bwd_allreduce: 5.65 | step: 47.55
- 12%|█▏        | 693/5800 [1:58:36<137:26:17, 96.88s/it]                                                         {'loss': 0.0415, 'grad_norm': 0.6374688148498535, 'learning_rate': 3.9165948008746884e-05, 'epoch': 5.97}
- 12%|█▏        | 693/5800 [1:58:36<137:26:17, 96.88s/it]score1 tensor([[0.5039],
-        [0.5703],
-        [0.4102],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.5664, 0.4980, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0728, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:45:13,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 10:45:13,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.46 | bwd_microstep: 4609.07 | bwd_inner_microstep: 4603.88 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.89
-[2025-01-25 10:45:13,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.43 | bwd: 4609.09 | bwd_inner: 4603.88 | bwd_allreduce: 5.15 | step: 45.89
- 12%|█▏        | 694/5800 [1:58:43<99:06:34, 69.88s/it]                                                        {'loss': 0.0728, 'grad_norm': 1.2874464988708496, 'learning_rate': 3.9162753475364216e-05, 'epoch': 5.98}
- 12%|█▏        | 694/5800 [1:58:43<99:06:34, 69.88s/it]score1 tensor([[0.4395],
-        [0.4941],
-        [0.4473],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5156, 0.4648, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:45:20,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 10:45:20,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.18 | bwd_microstep: 4609.71 | bwd_inner_microstep: 4604.56 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.92
-[2025-01-25 10:45:20,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.13 | bwd: 4609.74 | bwd_inner: 4604.57 | bwd_allreduce: 5.09 | step: 44.92
- 12%|█▏        | 695/5800 [1:58:50<72:17:20, 50.98s/it]                                                       {'loss': 0.0293, 'grad_norm': 4.85292387008667, 'learning_rate': 3.915955296670251e-05, 'epoch': 5.99}
- 12%|█▏        | 695/5800 [1:58:50<72:17:20, 50.98s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:45:24,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 10:45:24,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 571.33 | bwd_microstep: 1219.57 | bwd_inner_microstep: 1214.81 | bwd_allreduce_microstep: 4.66 | step_microstep: 44.66
-[2025-01-25 10:45:24,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.28 | bwd: 1219.60 | bwd_inner: 1214.81 | bwd_allreduce: 4.71 | step: 44.67
- 12%|█▏        | 696/5800 [1:58:54<52:23:16, 36.95s/it]                                                       {'loss': 0.0879, 'grad_norm': 9.831768989562988, 'learning_rate': 3.915634648375974e-05, 'epoch': 6.0}
- 12%|█▏        | 696/5800 [1:58:54<52:23:16, 36.95s/it][2025-01-25 10:45:29,501] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:45:39,776] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:45:50,216] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:46:00,635] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4746],
-        [0.4883],
-        [0.4434],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4551, 0.5195, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:46:17,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 10:46:17,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.83 | bwd_microstep: 4608.62 | bwd_inner_microstep: 4599.36 | bwd_allreduce_microstep: 9.00 | step_microstep: 58.07
-[2025-01-25 10:46:17,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.79 | bwd: 4608.69 | bwd_inner: 4599.36 | bwd_allreduce: 9.14 | step: 58.13
- 12%|█▏        | 697/5800 [1:59:47<59:03:35, 41.66s/it]                                                       {'loss': 0.0498, 'grad_norm': 4.627029895782471, 'learning_rate': 3.915313402753575e-05, 'epoch': 6.01}
- 12%|█▏        | 697/5800 [1:59:47<59:03:35, 41.66s/it]score1 tensor([[0.6562],
-        [0.4863],
-        [0.5078],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.5664, 0.4941, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:46:24,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 10:46:24,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.09 | bwd_microstep: 4583.14 | bwd_inner_microstep: 4578.30 | bwd_allreduce_microstep: 4.75 | step_microstep: 56.57
-[2025-01-25 10:46:24,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.94 | bwd: 4583.16 | bwd_inner: 4578.30 | bwd_allreduce: 4.79 | step: 56.61
- 12%|█▏        | 698/5800 [1:59:54<44:16:25, 31.24s/it]                                                       {'loss': 0.0498, 'grad_norm': 1.003285527229309, 'learning_rate': 3.9149915599032234e-05, 'epoch': 6.02}
- 12%|█▏        | 698/5800 [1:59:54<44:16:25, 31.24s/it]score1 tensor([[0.4590],
-        [0.6836],
-        [0.5312],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.5938, 0.5664, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:46:31,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 10:46:31,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.45 | bwd_microstep: 4589.17 | bwd_inner_microstep: 4583.94 | bwd_allreduce_microstep: 5.11 | step_microstep: 43.19
-[2025-01-25 10:46:31,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.40 | bwd: 4589.20 | bwd_inner: 4583.94 | bwd_allreduce: 5.18 | step: 43.19
- 12%|█▏        | 699/5800 [2:00:01<33:53:55, 23.92s/it]                                                       {'loss': 0.0596, 'grad_norm': 0.6931033730506897, 'learning_rate': 3.9146691199252755e-05, 'epoch': 6.03}
- 12%|█▏        | 699/5800 [2:00:01<33:53:55, 23.92s/it]score1 tensor([[0.5234],
-        [0.6211],
-        [0.5234],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4590, 0.6445, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0913, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:46:37,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.41 | optimizer_step: 4.36
-[2025-01-25 10:46:37,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.66 | bwd_microstep: 4606.51 | bwd_inner_microstep: 4601.13 | bwd_allreduce_microstep: 5.26 | step_microstep: 45.68
-[2025-01-25 10:46:37,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.61 | bwd: 4606.54 | bwd_inner: 4601.13 | bwd_allreduce: 5.33 | step: 45.70
- 12%|█▏        | 700/5800 [2:00:07<26:38:38, 18.81s/it]                                                       {'loss': 0.0913, 'grad_norm': 0.7537891268730164, 'learning_rate': 3.914346082920274e-05, 'epoch': 6.03}
- 12%|█▏        | 700/5800 [2:00:07<26:38:38, 18.81s/it]score1 tensor([[0.4746],
-        [0.4922],
-        [0.5039],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4727, 0.4355, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:46:44,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:46:44,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.75 | bwd_microstep: 4605.66 | bwd_inner_microstep: 4600.86 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.75
-[2025-01-25 10:46:44,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.70 | bwd: 4605.68 | bwd_inner: 4600.86 | bwd_allreduce: 4.77 | step: 42.76
- 12%|█▏        | 701/5800 [2:00:14<21:34:10, 15.23s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.41080012917518616, 'learning_rate': 3.914022448988946e-05, 'epoch': 6.04}
- 12%|█▏        | 701/5800 [2:00:14<21:34:10, 15.23s/it]score1 tensor([[0.5430],
-        [0.3594],
-        [0.5781],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.1787, 0.6328, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:46:51,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 10:46:51,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.72 | bwd_microstep: 4615.65 | bwd_inner_microstep: 4610.49 | bwd_allreduce_microstep: 5.05 | step_microstep: 46.67
-[2025-01-25 10:46:51,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.68 | bwd: 4615.68 | bwd_inner: 4610.49 | bwd_allreduce: 5.11 | step: 46.67
- 12%|█▏        | 702/5800 [2:00:21<18:01:21, 12.73s/it]                                                       {'loss': 0.0957, 'grad_norm': 1.0673611164093018, 'learning_rate': 3.913698218232208e-05, 'epoch': 6.05}
- 12%|█▏        | 702/5800 [2:00:21<18:01:21, 12.73s/it]score1 tensor([[0.5547],
-        [0.5195],
-        [0.4766],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5898, 0.4551, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:46:58,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 10:46:58,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.54 | bwd_microstep: 4611.53 | bwd_inner_microstep: 4605.84 | bwd_allreduce_microstep: 5.60 | step_microstep: 47.85
-[2025-01-25 10:46:58,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.51 | bwd: 4611.56 | bwd_inner: 4605.84 | bwd_allreduce: 5.64 | step: 47.86
- 12%|█▏        | 703/5800 [2:00:28<15:32:20, 10.98s/it]                                                       {'loss': 0.0415, 'grad_norm': 5.115108013153076, 'learning_rate': 3.913373390751159e-05, 'epoch': 6.06}
- 12%|█▏        | 703/5800 [2:00:28<15:32:20, 10.98s/it]score1 tensor([[0.5000],
-        [0.4785],
-        [0.5234],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4004, 0.3750, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:05,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 10:47:05,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.69 | bwd_microstep: 4610.00 | bwd_inner_microstep: 4604.34 | bwd_allreduce_microstep: 5.55 | step_microstep: 45.38
-[2025-01-25 10:47:05,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.65 | bwd: 4610.02 | bwd_inner: 4604.33 | bwd_allreduce: 5.61 | step: 45.39
- 12%|█▏        | 704/5800 [2:00:35<13:48:18,  9.75s/it]                                                       {'loss': 0.0732, 'grad_norm': 4.825770854949951, 'learning_rate': 3.9130479666470876e-05, 'epoch': 6.07}
- 12%|█▏        | 704/5800 [2:00:35<13:48:18,  9.75s/it]score1 tensor([[0.5781],
-        [0.4785],
-        [0.6289],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.3652, 0.6875, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:12,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 10:47:12,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.87 | bwd_microstep: 4611.81 | bwd_inner_microstep: 4604.28 | bwd_allreduce_microstep: 7.30 | step_microstep: 58.99
-[2025-01-25 10:47:12,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.84 | bwd: 4611.87 | bwd_inner: 4604.28 | bwd_allreduce: 7.43 | step: 58.98
- 12%|█▏        | 705/5800 [2:00:42<12:35:33,  8.90s/it]                                                       {'loss': 0.0679, 'grad_norm': 4.6679558753967285, 'learning_rate': 3.912721946021466e-05, 'epoch': 6.08}
- 12%|█▏        | 705/5800 [2:00:42<12:35:33,  8.90s/it]score1 tensor([[0.5430],
-        [0.5273],
-        [0.5469],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4668, 0.4375, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:19,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 10:47:19,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.08 | bwd_microstep: 4613.22 | bwd_inner_microstep: 4605.60 | bwd_allreduce_microstep: 7.44 | step_microstep: 66.44
-[2025-01-25 10:47:19,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.04 | bwd: 4613.27 | bwd_inner: 4605.60 | bwd_allreduce: 7.54 | step: 66.43
- 12%|█▏        | 706/5800 [2:00:49<11:44:48,  8.30s/it]                                                       {'loss': 0.0654, 'grad_norm': 9.869869232177734, 'learning_rate': 3.9123953289759534e-05, 'epoch': 6.09}
- 12%|█▏        | 706/5800 [2:00:49<11:44:48,  8.30s/it]score1 tensor([[0.4961],
-        [0.5508],
-        [0.4531],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4941, 0.4160, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:26,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.89 | optimizer_step: 4.37
-[2025-01-25 10:47:26,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.39 | bwd_microstep: 4617.94 | bwd_inner_microstep: 4612.85 | bwd_allreduce_microstep: 4.98 | step_microstep: 72.38
-[2025-01-25 10:47:26,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.34 | bwd: 4617.98 | bwd_inner: 4612.85 | bwd_allreduce: 5.04 | step: 72.43
- 12%|█▏        | 707/5800 [2:00:56<11:10:12,  7.90s/it]                                                       {'loss': 0.0474, 'grad_norm': 9.609827995300293, 'learning_rate': 3.9120681156123935e-05, 'epoch': 6.09}
- 12%|█▏        | 707/5800 [2:00:56<11:10:12,  7.90s/it]score1 tensor([[0.4492],
-        [0.3066],
-        [0.4844],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3613, 0.4941, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:33,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 10:47:33,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.86 | bwd_microstep: 4618.44 | bwd_inner_microstep: 4613.72 | bwd_allreduce_microstep: 4.64 | step_microstep: 43.87
-[2025-01-25 10:47:33,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.82 | bwd: 4618.46 | bwd_inner: 4613.72 | bwd_allreduce: 4.68 | step: 43.92
- 12%|█▏        | 708/5800 [2:01:03<10:45:05,  7.60s/it]                                                       {'loss': 0.0435, 'grad_norm': 9.316235542297363, 'learning_rate': 3.911740306032818e-05, 'epoch': 6.1}
- 12%|█▏        | 708/5800 [2:01:03<10:45:05,  7.60s/it]score1 tensor([[0.4570],
-        [0.3887],
-        [0.4375],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3457, 0.5312, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:40,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 10:47:40,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.85 | bwd_microstep: 4610.06 | bwd_inner_microstep: 4605.09 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.69
-[2025-01-25 10:47:40,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.81 | bwd: 4610.09 | bwd_inner: 4605.09 | bwd_allreduce: 4.93 | step: 44.70
- 12%|█▏        | 709/5800 [2:01:09<10:26:36,  7.38s/it]                                                       {'loss': 0.063, 'grad_norm': 4.732348442077637, 'learning_rate': 3.911411900339444e-05, 'epoch': 6.11}
- 12%|█▏        | 709/5800 [2:01:09<10:26:36,  7.38s/it]score1 tensor([[0.3828],
-        [0.4180],
-        [0.3809],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4648, 0.4316, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:46,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 10:47:46,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.82 | bwd_microstep: 4620.85 | bwd_inner_microstep: 4615.62 | bwd_allreduce_microstep: 5.14 | step_microstep: 46.08
-[2025-01-25 10:47:46,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.78 | bwd: 4620.88 | bwd_inner: 4615.62 | bwd_allreduce: 5.19 | step: 46.09
- 12%|█▏        | 710/5800 [2:01:16<10:13:57,  7.24s/it]                                                       {'loss': 0.0649, 'grad_norm': 9.093503952026367, 'learning_rate': 3.9110828986346735e-05, 'epoch': 6.12}
- 12%|█▏        | 710/5800 [2:01:16<10:13:57,  7.24s/it]score1 tensor([[0.4570],
-        [0.4531],
-        [0.4941],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5781, 0.5625, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:47:53,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 10:47:53,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.59 | bwd_microstep: 4612.90 | bwd_inner_microstep: 4607.87 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.13
-[2025-01-25 10:47:53,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.55 | bwd: 4612.92 | bwd_inner: 4607.87 | bwd_allreduce: 4.98 | step: 43.13
- 12%|█▏        | 711/5800 [2:01:23<10:04:48,  7.13s/it]                                                       {'loss': 0.0688, 'grad_norm': 4.728277206420898, 'learning_rate': 3.910753301021094e-05, 'epoch': 6.13}
- 12%|█▏        | 711/5800 [2:01:23<10:04:48,  7.13s/it]score1 tensor([[0.6133],
-        [0.5312],
-        [0.5508],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5312, 0.5938, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:00,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 10:48:00,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.04 | bwd_microstep: 4564.09 | bwd_inner_microstep: 4559.05 | bwd_allreduce_microstep: 4.94 | step_microstep: 41.90
-[2025-01-25 10:48:00,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.00 | bwd: 4564.12 | bwd_inner: 4559.05 | bwd_allreduce: 4.99 | step: 41.92
- 12%|█▏        | 712/5800 [2:01:30<9:57:13,  7.04s/it]                                                       {'loss': 0.0166, 'grad_norm': 8.106843948364258, 'learning_rate': 3.910423107601481e-05, 'epoch': 6.14}
- 12%|█▏        | 712/5800 [2:01:30<9:57:13,  7.04s/it]score1 tensor([[0.4961],
-        [0.7656],
-        [0.5508],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.6484, 0.4844, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0825, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:07,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 10:48:07,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.35 | bwd_microstep: 4622.17 | bwd_inner_microstep: 4616.25 | bwd_allreduce_microstep: 5.78 | step_microstep: 46.80
-[2025-01-25 10:48:07,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.32 | bwd: 4622.19 | bwd_inner: 4616.25 | bwd_allreduce: 5.84 | step: 46.81
- 12%|█▏        | 713/5800 [2:01:37<9:53:24,  7.00s/it]                                                      {'loss': 0.0825, 'grad_norm': 10.360647201538086, 'learning_rate': 3.9100923184787955e-05, 'epoch': 6.15}
- 12%|█▏        | 713/5800 [2:01:37<9:53:24,  7.00s/it]score1 tensor([[0.5195],
-        [0.5234],
-        [0.6641],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4180, 0.6016, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:14,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 10:48:14,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.93 | bwd_microstep: 4614.99 | bwd_inner_microstep: 4608.50 | bwd_allreduce_microstep: 6.40 | step_microstep: 42.55
-[2025-01-25 10:48:14,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.89 | bwd: 4615.02 | bwd_inner: 4608.50 | bwd_allreduce: 6.44 | step: 42.56
- 12%|█▏        | 714/5800 [2:01:44<9:50:50,  6.97s/it]                                                      {'loss': 0.0591, 'grad_norm': 5.161149024963379, 'learning_rate': 3.9097609337561814e-05, 'epoch': 6.16}
- 12%|█▏        | 714/5800 [2:01:44<9:50:50,  6.97s/it]score1 tensor([[0.4824],
-        [0.5273],
-        [0.5547],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4414, 0.5547, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:21,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.04 | optimizer_step: 4.36
-[2025-01-25 10:48:21,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.02 | bwd_microstep: 4585.36 | bwd_inner_microstep: 4580.38 | bwd_allreduce_microstep: 4.90 | step_microstep: 59.33
-[2025-01-25 10:48:21,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.98 | bwd: 4585.39 | bwd_inner: 4580.38 | bwd_allreduce: 4.94 | step: 59.35
- 12%|█▏        | 715/5800 [2:01:51<9:48:26,  6.94s/it]                                                      {'loss': 0.0493, 'grad_norm': 7.326261520385742, 'learning_rate': 3.9094289535369715e-05, 'epoch': 6.16}
- 12%|█▏        | 715/5800 [2:01:51<9:48:26,  6.94s/it]score1 tensor([[0.7266],
-        [0.5625],
-        [0.5117],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4785, 0.5000, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:28,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.12 | optimizer_step: 4.36
-[2025-01-25 10:48:28,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.53 | bwd_microstep: 4627.00 | bwd_inner_microstep: 4620.37 | bwd_allreduce_microstep: 6.52 | step_microstep: 70.19
-[2025-01-25 10:48:28,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.49 | bwd: 4627.02 | bwd_inner: 4620.37 | bwd_allreduce: 6.58 | step: 70.25
- 12%|█▏        | 716/5800 [2:01:58<9:48:40,  6.95s/it]                                                      {'loss': 0.0464, 'grad_norm': 10.410003662109375, 'learning_rate': 3.909096377924682e-05, 'epoch': 6.17}
- 12%|█▏        | 716/5800 [2:01:58<9:48:40,  6.95s/it]score1 tensor([[0.6016],
-        [0.4961],
-        [0.4570],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4766, 0.4609, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:35,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 10:48:35,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.03 | bwd_microstep: 4619.22 | bwd_inner_microstep: 4614.59 | bwd_allreduce_microstep: 4.54 | step_microstep: 44.20
-[2025-01-25 10:48:35,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.99 | bwd: 4619.24 | bwd_inner: 4614.59 | bwd_allreduce: 4.59 | step: 44.21
- 12%|█▏        | 717/5800 [2:02:05<9:47:50,  6.94s/it]                                                      {'loss': 0.0537, 'grad_norm': 0.794426679611206, 'learning_rate': 3.908763207023016e-05, 'epoch': 6.18}
- 12%|█▏        | 717/5800 [2:02:05<9:47:50,  6.94s/it]score1 tensor([[0.4434],
-        [0.4375],
-        [0.5156],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5039, 0.5391, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:42,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 10:48:42,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.29 | bwd_microstep: 4569.86 | bwd_inner_microstep: 4564.91 | bwd_allreduce_microstep: 4.85 | step_microstep: 45.01
-[2025-01-25 10:48:42,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.26 | bwd: 4569.88 | bwd_inner: 4564.91 | bwd_allreduce: 4.90 | step: 45.02
- 12%|█▏        | 718/5800 [2:02:12<9:45:25,  6.91s/it]                                                      {'loss': 0.0483, 'grad_norm': 7.084537029266357, 'learning_rate': 3.908429440935862e-05, 'epoch': 6.19}
- 12%|█▏        | 718/5800 [2:02:12<9:45:25,  6.91s/it]score1 tensor([[0.5039],
-        [0.4648],
-        [0.5586],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5508, 0.5664, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:48,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 10:48:48,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.90 | bwd_microstep: 4619.27 | bwd_inner_microstep: 4614.07 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.47
-[2025-01-25 10:48:48,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.86 | bwd: 4619.29 | bwd_inner: 4614.07 | bwd_allreduce: 5.15 | step: 45.48
- 12%|█▏        | 719/5800 [2:02:18<9:45:00,  6.91s/it]                                                      {'loss': 0.042, 'grad_norm': 10.077855110168457, 'learning_rate': 3.9080950797672947e-05, 'epoch': 6.2}
- 12%|█▏        | 719/5800 [2:02:18<9:45:00,  6.91s/it]score1 tensor([[0.4746],
-        [0.5273],
-        [0.6328],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.5664, 0.6641, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:48:55,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 10:48:55,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.05 | bwd_microstep: 4630.30 | bwd_inner_microstep: 4625.16 | bwd_allreduce_microstep: 5.06 | step_microstep: 48.52
-[2025-01-25 10:48:55,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.99 | bwd: 4630.33 | bwd_inner: 4625.16 | bwd_allreduce: 5.10 | step: 48.53
- 12%|█▏        | 720/5800 [2:02:25<9:44:59,  6.91s/it]                                                      {'loss': 0.0405, 'grad_norm': 0.8681028485298157, 'learning_rate': 3.9077601236215726e-05, 'epoch': 6.21}
- 12%|█▏        | 720/5800 [2:02:25<9:44:59,  6.91s/it]score1 tensor([[0.3945],
-        [0.5391],
-        [0.5273],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.5312, 0.5000, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:02,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 10:49:02,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.48 | bwd_microstep: 4579.94 | bwd_inner_microstep: 4575.04 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.14
-[2025-01-25 10:49:02,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4579.96 | bwd_inner: 4575.04 | bwd_allreduce: 4.86 | step: 44.14
- 12%|█▏        | 721/5800 [2:02:32<9:43:26,  6.89s/it]                                                      {'loss': 0.022, 'grad_norm': 7.166590213775635, 'learning_rate': 3.907424572603142e-05, 'epoch': 6.22}
- 12%|█▏        | 721/5800 [2:02:32<9:43:26,  6.89s/it]score1 tensor([[0.3691],
-        [0.3594],
-        [0.4727],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.3809, 0.4844, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:09,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 10:49:09,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.45 | bwd_microstep: 4622.84 | bwd_inner_microstep: 4617.67 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.76
-[2025-01-25 10:49:09,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.41 | bwd: 4622.86 | bwd_inner: 4617.67 | bwd_allreduce: 5.13 | step: 45.77
- 12%|█▏        | 722/5800 [2:02:39<9:43:39,  6.90s/it]                                                      {'loss': 0.0337, 'grad_norm': 9.194222450256348, 'learning_rate': 3.907088426816632e-05, 'epoch': 6.22}
- 12%|█▏        | 722/5800 [2:02:39<9:43:39,  6.90s/it]score1 tensor([[0.5078],
-        [0.4629],
-        [0.4863],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4629, 0.5625, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:16,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 10:49:16,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.97 | bwd_microstep: 4579.74 | bwd_inner_microstep: 4574.46 | bwd_allreduce_microstep: 5.19 | step_microstep: 48.12
-[2025-01-25 10:49:16,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4579.76 | bwd_inner: 4574.46 | bwd_allreduce: 5.24 | step: 48.12
- 12%|█▏        | 723/5800 [2:02:46<9:42:53,  6.89s/it]                                                      {'loss': 0.0288, 'grad_norm': 2.3516438007354736, 'learning_rate': 3.906751686366861e-05, 'epoch': 6.23}
- 12%|█▏        | 723/5800 [2:02:46<9:42:53,  6.89s/it]score1 tensor([[0.6484],
-        [0.5469],
-        [0.4609],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6055, 0.4043, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:23,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:49:23,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.55 | bwd_microstep: 4652.65 | bwd_inner_microstep: 4647.66 | bwd_allreduce_microstep: 4.90 | step_microstep: 42.42
-[2025-01-25 10:49:23,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.52 | bwd: 4652.67 | bwd_inner: 4647.66 | bwd_allreduce: 4.94 | step: 42.43
- 12%|█▏        | 724/5800 [2:02:53<9:44:09,  6.90s/it]                                                      {'loss': 0.0454, 'grad_norm': 5.1808695793151855, 'learning_rate': 3.9064143513588285e-05, 'epoch': 6.24}
- 12%|█▏        | 724/5800 [2:02:53<9:44:09,  6.90s/it]score1 tensor([[0.4668],
-        [0.5508],
-        [0.5508],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5273, 0.4980, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:30,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 10:49:30,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.95 | bwd_microstep: 4651.68 | bwd_inner_microstep: 4642.77 | bwd_allreduce_microstep: 8.71 | step_microstep: 82.08
-[2025-01-25 10:49:30,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.91 | bwd: 4651.73 | bwd_inner: 4642.77 | bwd_allreduce: 8.82 | step: 82.09
- 12%|█▎        | 725/5800 [2:03:00<9:46:54,  6.94s/it]                                                      {'loss': 0.0332, 'grad_norm': 5.019212245941162, 'learning_rate': 3.906076421897722e-05, 'epoch': 6.25}
- 12%|█▎        | 725/5800 [2:03:00<9:46:54,  6.94s/it]score1 tensor([[0.5625],
-        [0.3750],
-        [0.4707],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4922, 0.4453, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:37,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 10:49:37,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.35 | bwd_microstep: 4639.42 | bwd_inner_microstep: 4633.08 | bwd_allreduce_microstep: 6.18 | step_microstep: 59.89
-[2025-01-25 10:49:37,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.30 | bwd: 4639.44 | bwd_inner: 4633.08 | bwd_allreduce: 6.28 | step: 59.91
- 13%|█▎        | 726/5800 [2:03:07<9:47:37,  6.95s/it]                                                      {'loss': 0.0732, 'grad_norm': 0.8839139938354492, 'learning_rate': 3.905737898088914e-05, 'epoch': 6.26}
- 13%|█▎        | 726/5800 [2:03:07<9:47:37,  6.95s/it]score1 tensor([[0.4316],
-        [0.4766],
-        [0.5156],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5000, 0.5820, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:44,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 10:49:44,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.26 | bwd_microstep: 4643.08 | bwd_inner_microstep: 4634.30 | bwd_allreduce_microstep: 8.47 | step_microstep: 90.60
-[2025-01-25 10:49:44,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4643.14 | bwd_inner: 4634.30 | bwd_allreduce: 8.63 | step: 90.65
- 13%|█▎        | 727/5800 [2:03:14<9:49:04,  6.97s/it]                                                      {'loss': 0.043, 'grad_norm': 9.463682174682617, 'learning_rate': 3.905398780037962e-05, 'epoch': 6.27}
- 13%|█▎        | 727/5800 [2:03:14<9:49:04,  6.97s/it]score1 tensor([[0.5000],
-        [0.4629],
-        [0.4375],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5820, 0.4336, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:51,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 10:49:51,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.47 | bwd_microstep: 4641.04 | bwd_inner_microstep: 4635.88 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.51
-[2025-01-25 10:49:51,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.43 | bwd: 4641.06 | bwd_inner: 4635.88 | bwd_allreduce: 5.10 | step: 44.53
- 13%|█▎        | 728/5800 [2:03:21<9:48:00,  6.96s/it]                                                      {'loss': 0.0674, 'grad_norm': 4.832339286804199, 'learning_rate': 3.905059067850609e-05, 'epoch': 6.28}
- 13%|█▎        | 728/5800 [2:03:21<9:48:00,  6.96s/it]score1 tensor([[0.4590],
-        [0.4180],
-        [0.4883],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4570, 0.5273, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:49:58,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 10:49:58,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.77 | bwd_microstep: 4648.58 | bwd_inner_microstep: 4643.23 | bwd_allreduce_microstep: 5.23 | step_microstep: 43.56
-[2025-01-25 10:49:58,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4648.61 | bwd_inner: 4643.23 | bwd_allreduce: 5.29 | step: 43.57
- 13%|█▎        | 729/5800 [2:03:28<9:47:18,  6.95s/it]                                                      {'loss': 0.0464, 'grad_norm': 0.5934406518936157, 'learning_rate': 3.904718761632782e-05, 'epoch': 6.28}
- 13%|█▎        | 729/5800 [2:03:28<9:47:18,  6.95s/it]score1 tensor([[0.4805],
-        [0.5195],
-        [0.4043],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4941, 0.3398, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:05,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 10:50:05,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.01 | bwd_microstep: 4637.12 | bwd_inner_microstep: 4631.91 | bwd_allreduce_microstep: 5.12 | step_microstep: 50.90
-[2025-01-25 10:50:05,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.98 | bwd: 4637.14 | bwd_inner: 4631.91 | bwd_allreduce: 5.17 | step: 50.92
- 13%|█▎        | 730/5800 [2:03:35<9:46:40,  6.94s/it]                                                      {'loss': 0.0664, 'grad_norm': 5.138295650482178, 'learning_rate': 3.904377861490597e-05, 'epoch': 6.29}
- 13%|█▎        | 730/5800 [2:03:35<9:46:40,  6.94s/it]score1 tensor([[0.4648],
-        [0.4102],
-        [0.3906],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4648, 0.5195, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:12,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 10:50:12,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.31 | bwd_microstep: 4645.44 | bwd_inner_microstep: 4640.59 | bwd_allreduce_microstep: 4.77 | step_microstep: 46.27
-[2025-01-25 10:50:12,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.26 | bwd: 4645.46 | bwd_inner: 4640.59 | bwd_allreduce: 4.81 | step: 46.28
- 13%|█▎        | 731/5800 [2:03:42<9:46:17,  6.94s/it]                                                      {'loss': 0.0688, 'grad_norm': 4.440950870513916, 'learning_rate': 3.9040363675303494e-05, 'epoch': 6.3}
- 13%|█���        | 731/5800 [2:03:42<9:46:17,  6.94s/it]score1 tensor([[0.5508],
-        [0.4766],
-        [0.5312],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5195, 0.5508, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:19,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 10:50:19,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4643.89 | bwd_inner_microstep: 4638.46 | bwd_allreduce_microstep: 5.31 | step_microstep: 44.63
-[2025-01-25 10:50:19,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.45 | bwd: 4643.91 | bwd_inner: 4638.46 | bwd_allreduce: 5.36 | step: 44.65
- 13%|█▎        | 732/5800 [2:03:49<9:45:48,  6.94s/it]                                                      {'loss': 0.0361, 'grad_norm': 10.004069328308105, 'learning_rate': 3.903694279858525e-05, 'epoch': 6.31}
- 13%|█▎        | 732/5800 [2:03:49<9:45:48,  6.94s/it]score1 tensor([[0.5703],
-        [0.5859],
-        [0.4863],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5508, 0.6406, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:26,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 10:50:26,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.49 | bwd_microstep: 4646.52 | bwd_inner_microstep: 4641.19 | bwd_allreduce_microstep: 5.19 | step_microstep: 47.93
-[2025-01-25 10:50:26,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.46 | bwd: 4646.55 | bwd_inner: 4641.19 | bwd_allreduce: 5.25 | step: 47.93
- 13%|█▎        | 733/5800 [2:03:55<9:46:11,  6.94s/it]                                                      {'loss': 0.0752, 'grad_norm': 1.1420373916625977, 'learning_rate': 3.9033515985817926e-05, 'epoch': 6.32}
- 13%|█▎        | 733/5800 [2:03:56<9:46:11,  6.94s/it]score1 tensor([[0.4980],
-        [0.3926],
-        [0.4863],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4512, 0.4609, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:32,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 10:50:32,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.55 | bwd_microstep: 4645.71 | bwd_inner_microstep: 4638.97 | bwd_allreduce_microstep: 6.59 | step_microstep: 70.59
-[2025-01-25 10:50:32,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.51 | bwd: 4645.76 | bwd_inner: 4638.97 | bwd_allreduce: 6.67 | step: 70.56
- 13%|█▎        | 734/5800 [2:04:02<9:46:51,  6.95s/it]                                                      {'loss': 0.0405, 'grad_norm': 4.987322807312012, 'learning_rate': 3.903008323807006e-05, 'epoch': 6.33}
- 13%|█▎        | 734/5800 [2:04:02<9:46:51,  6.95s/it]score1 tensor([[0.5625],
-        [0.4629],
-        [0.5508],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4180, 0.5352, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:39,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 10:50:39,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2175.60 | bwd_microstep: 4649.85 | bwd_inner_microstep: 4644.75 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.19
-[2025-01-25 10:50:39,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2175.56 | bwd: 4649.88 | bwd_inner: 4644.75 | bwd_allreduce: 5.04 | step: 45.20
- 13%|█▎        | 735/5800 [2:04:09<9:47:07,  6.96s/it]                                                      {'loss': 0.0493, 'grad_norm': 0.9454227685928345, 'learning_rate': 3.902664455641203e-05, 'epoch': 6.34}
- 13%|█▎        | 735/5800 [2:04:09<9:47:07,  6.96s/it]score1 tensor([[0.4570],
-        [0.4434],
-        [0.4805],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5156, 0.4863, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0640, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:46,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 10:50:46,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.01 | bwd_microstep: 4651.56 | bwd_inner_microstep: 4646.68 | bwd_allreduce_microstep: 4.78 | step_microstep: 44.81
-[2025-01-25 10:50:46,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.98 | bwd: 4651.58 | bwd_inner: 4646.68 | bwd_allreduce: 4.83 | step: 44.82
- 13%|█▎        | 736/5800 [2:04:16<9:46:35,  6.95s/it]                                                      {'loss': 0.064, 'grad_norm': 4.557432651519775, 'learning_rate': 3.9023199941916094e-05, 'epoch': 6.34}
- 13%|█▎        | 736/5800 [2:04:16<9:46:35,  6.95s/it]score1 tensor([[0.5586],
-        [0.4277],
-        [0.4395],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4668, 0.4297, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:50:53,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 10:50:53,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.09 | bwd_microstep: 4593.84 | bwd_inner_microstep: 4588.61 | bwd_allreduce_microstep: 5.14 | step_microstep: 44.06
-[2025-01-25 10:50:53,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.04 | bwd: 4593.87 | bwd_inner: 4588.61 | bwd_allreduce: 5.19 | step: 44.07
- 13%|█▎        | 737/5800 [2:04:23<9:44:46,  6.93s/it]                                                      {'loss': 0.0132, 'grad_norm': 2.587097644805908, 'learning_rate': 3.901974939565633e-05, 'epoch': 6.35}
- 13%|█▎        | 737/5800 [2:04:23<9:44:46,  6.93s/it]score1 tensor([[0.5547],
-        [0.4961],
-        [0.5859],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4707, 0.5000, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:00,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 10:51:00,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.54 | bwd_microstep: 4640.07 | bwd_inner_microstep: 4634.76 | bwd_allreduce_microstep: 5.19 | step_microstep: 50.99
-[2025-01-25 10:51:00,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.48 | bwd: 4640.10 | bwd_inner: 4634.76 | bwd_allreduce: 5.26 | step: 51.00
- 13%|█▎        | 738/5800 [2:04:30<9:44:39,  6.93s/it]                                                      {'loss': 0.063, 'grad_norm': 0.6787165999412537, 'learning_rate': 3.9016292918708685e-05, 'epoch': 6.36}
- 13%|█▎        | 738/5800 [2:04:30<9:44:39,  6.93s/it]score1 tensor([[0.5195],
-        [0.6484],
-        [0.5703],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.6797, 0.6523, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:07,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 10:51:07,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.97 | bwd_microstep: 4642.17 | bwd_inner_microstep: 4637.09 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.01
-[2025-01-25 10:51:07,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.92 | bwd: 4642.19 | bwd_inner: 4637.09 | bwd_allreduce: 5.03 | step: 45.02
- 13%|█▎        | 739/5800 [2:04:37<9:44:35,  6.93s/it]                                                      {'loss': 0.0547, 'grad_norm': 0.8986573815345764, 'learning_rate': 3.9012830512150945e-05, 'epoch': 6.37}
- 13%|█▎        | 739/5800 [2:04:37<9:44:35,  6.93s/it]score1 tensor([[0.5234],
-        [0.5352],
-        [0.5195],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6602, 0.3887, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0928, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:14,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 10:51:14,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.09 | bwd_microstep: 4645.09 | bwd_inner_microstep: 4640.27 | bwd_allreduce_microstep: 4.74 | step_microstep: 46.17
-[2025-01-25 10:51:14,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.05 | bwd: 4645.12 | bwd_inner: 4640.27 | bwd_allreduce: 4.79 | step: 46.18
- 13%|█▎        | 740/5800 [2:04:44<9:44:32,  6.93s/it]                                                      {'loss': 0.0928, 'grad_norm': 4.759648323059082, 'learning_rate': 3.900936217706275e-05, 'epoch': 6.38}
- 13%|█▎        | 740/5800 [2:04:44<9:44:32,  6.93s/it]score1 tensor([[0.6172],
-        [0.5664],
-        [0.5703],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4375, 0.4980, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:21,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.37
-[2025-01-25 10:51:21,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.05 | bwd_microstep: 4645.65 | bwd_inner_microstep: 4640.35 | bwd_allreduce_microstep: 5.17 | step_microstep: 50.76
-[2025-01-25 10:51:21,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.99 | bwd: 4645.71 | bwd_inner: 4640.35 | bwd_allreduce: 5.23 | step: 50.74
- 13%|█▎        | 741/5800 [2:04:51<9:44:45,  6.94s/it]                                                      {'loss': 0.0903, 'grad_norm': 10.082600593566895, 'learning_rate': 3.9005887914525586e-05, 'epoch': 6.39}
- 13%|█▎        | 741/5800 [2:04:51<9:44:45,  6.94s/it]score1 tensor([[0.6367],
-        [0.4961],
-        [0.5508],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5469, 0.4180, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:28,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.24 | optimizer_step: 4.37
-[2025-01-25 10:51:28,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.16 | bwd_microstep: 4650.98 | bwd_inner_microstep: 4642.60 | bwd_allreduce_microstep: 8.19 | step_microstep: 58.35
-[2025-01-25 10:51:28,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.13 | bwd: 4651.03 | bwd_inner: 4642.60 | bwd_allreduce: 8.29 | step: 58.34
- 13%|█▎        | 742/5800 [2:04:58<9:45:47,  6.95s/it]                                                      {'loss': 0.0703, 'grad_norm': 5.136430740356445, 'learning_rate': 3.900240772562279e-05, 'epoch': 6.4}
- 13%|█▎        | 742/5800 [2:04:58<9:45:47,  6.95s/it]score1 tensor([[0.4805],
-        [0.5117],
-        [0.5234],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4844, 0.4883, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:35,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 10:51:35,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.69 | bwd_microstep: 4653.94 | bwd_inner_microstep: 4645.99 | bwd_allreduce_microstep: 7.76 | step_microstep: 64.17
-[2025-01-25 10:51:35,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.64 | bwd: 4653.99 | bwd_inner: 4645.99 | bwd_allreduce: 7.85 | step: 64.15
- 13%|█▎        | 743/5800 [2:05:05<9:46:25,  6.96s/it]                                                      {'loss': 0.0322, 'grad_norm': 9.55322551727295, 'learning_rate': 3.899892161143955e-05, 'epoch': 6.41}
- 13%|█▎        | 743/5800 [2:05:05<9:46:25,  6.96s/it]score1 tensor([[0.5859],
-        [0.4824],
-        [0.4336],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4551, 0.4668, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:42,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 10:51:42,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.18 | bwd_microstep: 4642.06 | bwd_inner_microstep: 4637.05 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.47
-[2025-01-25 10:51:42,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.15 | bwd: 4642.08 | bwd_inner: 4637.05 | bwd_allreduce: 4.96 | step: 43.49
- 13%|█▎        | 744/5800 [2:05:12<9:45:52,  6.95s/it]                                                      {'loss': 0.0522, 'grad_norm': 0.9207270741462708, 'learning_rate': 3.8995429573062894e-05, 'epoch': 6.41}
- 13%|█▎        | 744/5800 [2:05:12<9:45:52,  6.95s/it]score1 tensor([[0.4160],
-        [0.4160],
-        [0.4316],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.3945, 0.4121, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:49,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 10:51:49,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.70 | bwd_microstep: 4650.90 | bwd_inner_microstep: 4645.43 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.86
-[2025-01-25 10:51:49,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.66 | bwd: 4650.93 | bwd_inner: 4645.43 | bwd_allreduce: 5.43 | step: 44.87
- 13%|█▎        | 745/5800 [2:05:19<9:45:30,  6.95s/it]                                                      {'loss': 0.0269, 'grad_norm': 0.48690542578697205, 'learning_rate': 3.899193161158169e-05, 'epoch': 6.42}
- 13%|█▎        | 745/5800 [2:05:19<9:45:30,  6.95s/it]score1 tensor([[0.4824],
-        [0.3906],
-        [0.3828],
-        [0.2969]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4883, 0.4375, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0981, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:51:56,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 10:51:56,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.90 | bwd_microstep: 4649.85 | bwd_inner_microstep: 4644.59 | bwd_allreduce_microstep: 5.17 | step_microstep: 44.03
-[2025-01-25 10:51:56,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.86 | bwd: 4649.88 | bwd_inner: 4644.60 | bwd_allreduce: 5.22 | step: 44.04
- 13%|█▎        | 746/5800 [2:05:26<9:45:13,  6.95s/it]                                                      {'loss': 0.0981, 'grad_norm': 8.634847640991211, 'learning_rate': 3.8988427728086673e-05, 'epoch': 6.43}
- 13%|█▎        | 746/5800 [2:05:26<9:45:13,  6.95s/it]score1 tensor([[0.4082],
-        [0.3438],
-        [0.2520],
-        [0.2930]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3730, 0.3652, 0.3086], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:03,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 10:52:03,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.70 | bwd_microstep: 4638.43 | bwd_inner_microstep: 4633.34 | bwd_allreduce_microstep: 4.97 | step_microstep: 44.61
-[2025-01-25 10:52:03,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.65 | bwd: 4638.45 | bwd_inner: 4633.35 | bwd_allreduce: 5.04 | step: 44.63
- 13%|█▎        | 747/5800 [2:05:33<9:44:36,  6.94s/it]                                                      {'loss': 0.0723, 'grad_norm': 7.640950679779053, 'learning_rate': 3.898491792367041e-05, 'epoch': 6.44}
- 13%|█▎        | 747/5800 [2:05:33<9:44:36,  6.94s/it]score1 tensor([[0.4141],
-        [0.4238],
-        [0.3652],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5117, 0.5586, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:10,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.36
-[2025-01-25 10:52:10,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.49 | bwd_microstep: 4643.11 | bwd_inner_microstep: 4637.25 | bwd_allreduce_microstep: 5.75 | step_microstep: 48.68
-[2025-01-25 10:52:10,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.46 | bwd: 4643.13 | bwd_inner: 4637.25 | bwd_allreduce: 5.81 | step: 48.70
- 13%|█▎        | 748/5800 [2:05:40<9:44:17,  6.94s/it]                                                      {'loss': 0.1465, 'grad_norm': 8.557391166687012, 'learning_rate': 3.8981402199427326e-05, 'epoch': 6.45}
- 13%|█▎        | 748/5800 [2:05:40<9:44:17,  6.94s/it]score1 tensor([[0.4102],
-        [0.3984],
-        [0.4629],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4629, 0.6562, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:17,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 10:52:17,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.95 | bwd_microstep: 4647.05 | bwd_inner_microstep: 4641.63 | bwd_allreduce_microstep: 5.34 | step_microstep: 45.13
-[2025-01-25 10:52:17,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.91 | bwd: 4647.07 | bwd_inner: 4641.62 | bwd_allreduce: 5.38 | step: 45.14
- 13%|█▎        | 749/5800 [2:05:47<9:44:09,  6.94s/it]                                                      {'loss': 0.1152, 'grad_norm': 8.77188777923584, 'learning_rate': 3.897788055645368e-05, 'epoch': 6.46}
- 13%|█▎        | 749/5800 [2:05:47<9:44:09,  6.94s/it]score1 tensor([[0.3926],
-        [0.4512],
-        [0.5117],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4766, 0.6094, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:24,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 10:52:24,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.58 | bwd_microstep: 4638.84 | bwd_inner_microstep: 4634.08 | bwd_allreduce_microstep: 4.65 | step_microstep: 43.41
-[2025-01-25 10:52:24,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.53 | bwd: 4638.87 | bwd_inner: 4634.08 | bwd_allreduce: 4.69 | step: 43.42
- 13%|█▎        | 750/5800 [2:05:54<9:43:37,  6.93s/it]                                                      {'loss': 0.0615, 'grad_norm': 9.062947273254395, 'learning_rate': 3.8974352995847576e-05, 'epoch': 6.47}
- 13%|█▎        | 750/5800 [2:05:54<9:43:37,  6.93s/it]score1 tensor([[0.4688],
-        [0.5195],
-        [0.5117],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6094, 0.5391, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:30,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 10:52:30,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.53 | bwd_microstep: 4640.56 | bwd_inner_microstep: 4633.38 | bwd_allreduce_microstep: 7.09 | step_microstep: 58.19
-[2025-01-25 10:52:30,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.48 | bwd: 4640.58 | bwd_inner: 4633.38 | bwd_allreduce: 7.13 | step: 58.16
- 13%|█▎        | 751/5800 [2:06:00<9:43:41,  6.94s/it]                                                      {'loss': 0.0454, 'grad_norm': 9.476889610290527, 'learning_rate': 3.897081951870898e-05, 'epoch': 6.47}
- 13%|█▎        | 751/5800 [2:06:00<9:43:41,  6.94s/it]score1 tensor([[0.5781],
-        [0.5195],
-        [0.6484],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.3438, 0.6445, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0576, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:37,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.32 | optimizer_step: 4.45
-[2025-01-25 10:52:37,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.44 | bwd_microstep: 4646.09 | bwd_inner_microstep: 4636.87 | bwd_allreduce_microstep: 8.97 | step_microstep: 72.05
-[2025-01-25 10:52:37,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.39 | bwd: 4646.12 | bwd_inner: 4636.87 | bwd_allreduce: 9.12 | step: 72.10
- 13%|█▎        | 752/5800 [2:06:07<9:44:34,  6.95s/it]                                                      {'loss': 0.0576, 'grad_norm': 4.881217956542969, 'learning_rate': 3.8967280126139686e-05, 'epoch': 6.48}
- 13%|█▎        | 752/5800 [2:06:07<9:44:34,  6.95s/it]score1 tensor([[0.5703],
-        [0.5898],
-        [0.5312],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4824, 0.4492, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:44,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 10:52:44,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.02 | bwd_microstep: 4647.74 | bwd_inner_microstep: 4642.42 | bwd_allreduce_microstep: 5.25 | step_microstep: 44.98
-[2025-01-25 10:52:44,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4647.77 | bwd_inner: 4642.42 | bwd_allreduce: 5.29 | step: 45.00
- 13%|█▎        | 753/5800 [2:06:14<9:44:22,  6.95s/it]                                                      {'loss': 0.0742, 'grad_norm': 4.807510852813721, 'learning_rate': 3.896373481924333e-05, 'epoch': 6.49}
- 13%|█▎        | 753/5800 [2:06:14<9:44:22,  6.95s/it]score1 tensor([[0.5938],
-        [0.5898],
-        [0.5273],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5781, 0.3457, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:51,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 10:52:51,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.34 | bwd_microstep: 4638.70 | bwd_inner_microstep: 4631.79 | bwd_allreduce_microstep: 6.80 | step_microstep: 43.88
-[2025-01-25 10:52:51,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.29 | bwd: 4638.73 | bwd_inner: 4631.79 | bwd_allreduce: 6.86 | step: 43.88
- 13%|█▎        | 754/5800 [2:06:21<9:43:46,  6.94s/it]                                                      {'loss': 0.0796, 'grad_norm': 9.639098167419434, 'learning_rate': 3.896018359912541e-05, 'epoch': 6.5}
- 13%|█▎        | 754/5800 [2:06:21<9:43:46,  6.94s/it]score1 tensor([[0.5195],
-        [0.6445],
-        [0.6133],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.5391, 0.5430, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:52:58,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 10:52:58,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.91 | bwd_microstep: 4595.42 | bwd_inner_microstep: 4589.36 | bwd_allreduce_microstep: 5.96 | step_microstep: 44.28
-[2025-01-25 10:52:58,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.88 | bwd: 4595.44 | bwd_inner: 4589.36 | bwd_allreduce: 6.01 | step: 44.29
- 13%|█▎        | 755/5800 [2:06:28<9:42:03,  6.92s/it]                                                      {'loss': 0.0688, 'grad_norm': 7.40805196762085, 'learning_rate': 3.8956626466893265e-05, 'epoch': 6.51}
- 13%|█▎        | 755/5800 [2:06:28<9:42:03,  6.92s/it]score1 tensor([[0.5547],
-        [0.5742],
-        [0.4785],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5430, 0.3105, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1118, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:05,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 10:53:05,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.67 | bwd_microstep: 4639.22 | bwd_inner_microstep: 4634.38 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.63
-[2025-01-25 10:53:05,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.61 | bwd: 4639.24 | bwd_inner: 4634.38 | bwd_allreduce: 4.78 | step: 43.64
- 13%|█▎        | 756/5800 [2:06:35<9:42:08,  6.92s/it]                                                      {'loss': 0.1118, 'grad_norm': 9.49903392791748, 'learning_rate': 3.8953063423656055e-05, 'epoch': 6.52}
- 13%|█▎        | 756/5800 [2:06:35<9:42:08,  6.92s/it]score1 tensor([[0.5391],
-        [0.5859],
-        [0.5234],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5273, 0.3789, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:12,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 10:53:12,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.97 | bwd_microstep: 4649.08 | bwd_inner_microstep: 4643.99 | bwd_allreduce_microstep: 5.00 | step_microstep: 41.69
-[2025-01-25 10:53:12,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.92 | bwd: 4649.11 | bwd_inner: 4643.99 | bwd_allreduce: 5.05 | step: 41.69
- 13%|█▎        | 757/5800 [2:06:42<9:42:08,  6.93s/it]                                                      {'loss': 0.0674, 'grad_norm': 9.482535362243652, 'learning_rate': 3.8949494470524804e-05, 'epoch': 6.53}
- 13%|█▎        | 757/5800 [2:06:42<9:42:08,  6.93s/it]score1 tensor([[0.5156],
-        [0.4648],
-        [0.5156],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4043, 0.4355, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:19,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 10:53:19,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.33 | bwd_microstep: 4636.18 | bwd_inner_microstep: 4630.99 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.43
-[2025-01-25 10:53:19,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.29 | bwd: 4636.20 | bwd_inner: 4630.99 | bwd_allreduce: 5.12 | step: 42.45
- 13%|█▎        | 758/5800 [2:06:49<9:41:55,  6.93s/it]                                                      {'loss': 0.0537, 'grad_norm': 9.056159019470215, 'learning_rate': 3.894591960861237e-05, 'epoch': 6.53}
- 13%|█▎        | 758/5800 [2:06:49<9:41:55,  6.93s/it]score1 tensor([[0.4863],
-        [0.4707],
-        [0.5312],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6094, 0.6094, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0850, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:26,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 10:53:26,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.84 | bwd_microstep: 4645.68 | bwd_inner_microstep: 4640.74 | bwd_allreduce_microstep: 4.84 | step_microstep: 46.19
-[2025-01-25 10:53:26,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.80 | bwd: 4645.70 | bwd_inner: 4640.75 | bwd_allreduce: 4.89 | step: 46.20
- 13%|█▎        | 759/5800 [2:06:56<9:42:06,  6.93s/it]                                                      {'loss': 0.085, 'grad_norm': 9.158037185668945, 'learning_rate': 3.894233883903347e-05, 'epoch': 6.54}
- 13%|█▎        | 759/5800 [2:06:56<9:42:06,  6.93s/it]score1 tensor([[0.5234],
-        [0.4570],
-        [0.4395],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5430, 0.4707, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0913, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:33,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.35 | optimizer_step: 4.37
-[2025-01-25 10:53:33,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.18 | bwd_microstep: 4638.71 | bwd_inner_microstep: 4633.12 | bwd_allreduce_microstep: 5.50 | step_microstep: 50.58
-[2025-01-25 10:53:33,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.15 | bwd: 4638.74 | bwd_inner: 4633.12 | bwd_allreduce: 5.55 | step: 50.63
- 13%|█▎        | 760/5800 [2:07:03<9:42:14,  6.93s/it]                                                      {'loss': 0.0913, 'grad_norm': 9.027399063110352, 'learning_rate': 3.8938752162904645e-05, 'epoch': 6.55}
- 13%|█▎        | 760/5800 [2:07:03<9:42:14,  6.93s/it]score1 tensor([[0.4102],
-        [0.3887],
-        [0.3965],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4043, 0.5117, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:40,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 10:53:40,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.48 | bwd_microstep: 4645.47 | bwd_inner_microstep: 4640.37 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.29
-[2025-01-25 10:53:40,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.42 | bwd: 4645.50 | bwd_inner: 4640.37 | bwd_allreduce: 5.06 | step: 46.30
- 13%|█▎        | 761/5800 [2:07:10<9:42:18,  6.93s/it]                                                      {'loss': 0.0752, 'grad_norm': 4.121554374694824, 'learning_rate': 3.893515958134427e-05, 'epoch': 6.56}
- 13%|█▎        | 761/5800 [2:07:10<9:42:18,  6.93s/it]score1 tensor([[0.4043],
-        [0.3926],
-        [0.4062],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4336, 0.4062, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:47,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 10:53:47,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.80 | bwd_microstep: 4594.21 | bwd_inner_microstep: 4586.69 | bwd_allreduce_microstep: 7.38 | step_microstep: 56.18
-[2025-01-25 10:53:47,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.70 | bwd: 4594.26 | bwd_inner: 4586.69 | bwd_allreduce: 7.45 | step: 56.17
- 13%|█▎        | 762/5800 [2:07:17<9:41:55,  6.93s/it]                                                      {'loss': 0.0503, 'grad_norm': 6.543796539306641, 'learning_rate': 3.893156109547259e-05, 'epoch': 6.57}
- 13%|█▎        | 762/5800 [2:07:17<9:41:55,  6.93s/it]score1 tensor([[0.4590],
-        [0.4219],
-        [0.4414],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5195, 0.4922, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:53:54,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 10:53:54,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.69 | bwd_microstep: 4645.69 | bwd_inner_microstep: 4640.12 | bwd_allreduce_microstep: 5.46 | step_microstep: 50.49
-[2025-01-25 10:53:54,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.65 | bwd: 4645.72 | bwd_inner: 4640.12 | bwd_allreduce: 5.52 | step: 50.50
- 13%|█▎        | 763/5800 [2:07:24<9:42:13,  6.94s/it]                                                      {'loss': 0.0645, 'grad_norm': 8.607505798339844, 'learning_rate': 3.892795670641167e-05, 'epoch': 6.58}
- 13%|█▎        | 763/5800 [2:07:24<9:42:13,  6.94s/it]score1 tensor([[0.5547],
-        [0.4941],
-        [0.5195],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5703, 0.6094, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:01,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 10:54:01,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4646.31 | bwd_inner_microstep: 4641.08 | bwd_allreduce_microstep: 5.13 | step_microstep: 44.21
-[2025-01-25 10:54:01,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.01 | bwd: 4646.34 | bwd_inner: 4641.08 | bwd_allreduce: 5.19 | step: 44.22
- 13%|█▎        | 764/5800 [2:07:31<9:41:56,  6.93s/it]                                                      {'loss': 0.0469, 'grad_norm': 4.789449691772461, 'learning_rate': 3.8924346415285416e-05, 'epoch': 6.59}
- 13%|█▎        | 764/5800 [2:07:31<9:41:56,  6.93s/it]score1 tensor([[0.5391],
-        [0.4727],
-        [0.5703],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3340, 0.6406, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:07,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:54:07,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.74 | bwd_microstep: 4587.73 | bwd_inner_microstep: 4582.70 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.64
-[2025-01-25 10:54:07,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4587.76 | bwd_inner: 4582.70 | bwd_allreduce: 4.99 | step: 43.64
- 13%|█▎        | 765/5800 [2:07:37<9:40:12,  6.91s/it]                                                      {'loss': 0.0562, 'grad_norm': 2.7778282165527344, 'learning_rate': 3.8920730223219586e-05, 'epoch': 6.59}
- 13%|█▎        | 765/5800 [2:07:37<9:40:12,  6.91s/it]score1 tensor([[0.5625],
-        [0.5547],
-        [0.5977],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5430, 0.5586, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:14,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 10:54:14,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.41 | bwd_microstep: 4645.93 | bwd_inner_microstep: 4640.58 | bwd_allreduce_microstep: 5.28 | step_microstep: 44.92
-[2025-01-25 10:54:14,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.36 | bwd: 4645.95 | bwd_inner: 4640.58 | bwd_allreduce: 5.31 | step: 44.92
- 13%|█▎        | 766/5800 [2:07:44<9:40:26,  6.92s/it]                                                      {'loss': 0.0649, 'grad_norm': 9.507046699523926, 'learning_rate': 3.891710813134177e-05, 'epoch': 6.6}
- 13%|█▎        | 766/5800 [2:07:44<9:40:26,  6.92s/it]score1 tensor([[0.6211],
-        [0.6055],
-        [0.6367],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4980, 0.5078, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:21,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 10:54:21,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.59 | bwd_microstep: 4641.53 | bwd_inner_microstep: 4636.73 | bwd_allreduce_microstep: 4.73 | step_microstep: 42.03
-[2025-01-25 10:54:21,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.55 | bwd: 4641.55 | bwd_inner: 4636.73 | bwd_allreduce: 4.76 | step: 42.04
- 13%|█▎        | 767/5800 [2:07:51<9:40:31,  6.92s/it]                                                      {'loss': 0.1309, 'grad_norm': 9.895994186401367, 'learning_rate': 3.8913480140781394e-05, 'epoch': 6.61}
- 13%|█▎        | 767/5800 [2:07:51<9:40:31,  6.92s/it]score1 tensor([[0.6016],
-        [0.6094],
-        [0.6641],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4004, 0.5781, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:28,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 10:54:28,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.66 | bwd_microstep: 4640.50 | bwd_inner_microstep: 4635.73 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.91
-[2025-01-25 10:54:28,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4640.52 | bwd_inner: 4635.73 | bwd_allreduce: 4.73 | step: 41.92
- 13%|█▎        | 768/5800 [2:07:58<9:40:31,  6.92s/it]                                                      {'loss': 0.127, 'grad_norm': 9.970535278320312, 'learning_rate': 3.890984625266974e-05, 'epoch': 6.62}
- 13%|█▎        | 768/5800 [2:07:58<9:40:31,  6.92s/it]score1 tensor([[0.6055],
-        [0.6250],
-        [0.6289],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6016, 0.4316, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:35,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 10:54:35,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.54 | bwd_microstep: 4647.66 | bwd_inner_microstep: 4642.94 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.63
-[2025-01-25 10:54:35,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.47 | bwd: 4647.68 | bwd_inner: 4642.94 | bwd_allreduce: 4.67 | step: 42.63
- 13%|█▎        | 769/5800 [2:08:05<9:40:43,  6.93s/it]                                                      {'loss': 0.1152, 'grad_norm': 9.762275695800781, 'learning_rate': 3.8906206468139904e-05, 'epoch': 6.63}
- 13%|█▎        | 769/5800 [2:08:05<9:40:43,  6.93s/it]score1 tensor([[0.5859],
-        [0.6836],
-        [0.5859],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6133, 0.4395, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1196, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:42,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.74 | optimizer_step: 4.36
-[2025-01-25 10:54:42,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.68 | bwd_microstep: 4644.91 | bwd_inner_microstep: 4639.19 | bwd_allreduce_microstep: 5.61 | step_microstep: 62.91
-[2025-01-25 10:54:42,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.64 | bwd: 4644.93 | bwd_inner: 4639.19 | bwd_allreduce: 5.67 | step: 62.92
- 13%|█▎        | 770/5800 [2:08:12<9:41:41,  6.94s/it]                                                      {'loss': 0.1196, 'grad_norm': 9.672521591186523, 'learning_rate': 3.890256078832685e-05, 'epoch': 6.64}
- 13%|█▎        | 770/5800 [2:08:12<9:41:41,  6.94s/it]score1 tensor([[0.5859],
-        [0.5586],
-        [0.6055],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5508, 0.4863, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:49,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.72 | optimizer_step: 4.85
-[2025-01-25 10:54:49,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.69 | bwd_microstep: 4646.33 | bwd_inner_microstep: 4636.96 | bwd_allreduce_microstep: 9.11 | step_microstep: 58.27
-[2025-01-25 10:54:49,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4646.39 | bwd_inner: 4636.96 | bwd_allreduce: 9.23 | step: 58.26
- 13%|█▎        | 771/5800 [2:08:19<9:42:47,  6.95s/it]                                                      {'loss': 0.084, 'grad_norm': 9.268802642822266, 'learning_rate': 3.889890921436734e-05, 'epoch': 6.65}
- 13%|█▎        | 771/5800 [2:08:19<9:42:47,  6.95s/it]score1 tensor([[0.4961],
-        [0.5078],
-        [0.5469],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4629, 0.5625, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:54:56,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.30 | optimizer_step: 4.36
-[2025-01-25 10:54:56,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.29 | bwd_microstep: 4646.88 | bwd_inner_microstep: 4641.77 | bwd_allreduce_microstep: 5.03 | step_microstep: 68.32
-[2025-01-25 10:54:56,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.25 | bwd: 4646.90 | bwd_inner: 4641.77 | bwd_allreduce: 5.07 | step: 68.33
- 13%|█▎        | 772/5800 [2:08:26<9:43:06,  6.96s/it]                                                      {'loss': 0.0361, 'grad_norm': 0.4288509786128998, 'learning_rate': 3.8895251747400025e-05, 'epoch': 6.66}
- 13%|█▎        | 772/5800 [2:08:26<9:43:06,  6.96s/it]score1 tensor([[0.4844],
-        [0.4258],
-        [0.5000],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4492, 0.5898, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:03,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 10:55:03,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.88 | bwd_microstep: 4647.93 | bwd_inner_microstep: 4642.22 | bwd_allreduce_microstep: 5.57 | step_microstep: 44.09
-[2025-01-25 10:55:03,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.84 | bwd: 4647.96 | bwd_inner: 4642.22 | bwd_allreduce: 5.64 | step: 44.10
- 13%|█▎        | 773/5800 [2:08:33<9:42:13,  6.95s/it]                                                      {'loss': 0.063, 'grad_norm': 0.40847069025039673, 'learning_rate': 3.8891588388565346e-05, 'epoch': 6.66}
- 13%|█▎        | 773/5800 [2:08:33<9:42:13,  6.95s/it]score1 tensor([[0.5430],
-        [0.5273],
-        [0.4512],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.6016, 0.4512, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:10,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 10:55:10,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.46 | bwd_microstep: 4600.38 | bwd_inner_microstep: 4595.30 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.08
-[2025-01-25 10:55:10,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.42 | bwd: 4600.41 | bwd_inner: 4595.30 | bwd_allreduce: 5.02 | step: 43.08
- 13%|█▎        | 774/5800 [2:08:40<9:40:32,  6.93s/it]                                                      {'loss': 0.0649, 'grad_norm': 2.4742181301116943, 'learning_rate': 3.888791913900561e-05, 'epoch': 6.67}
- 13%|█▎        | 774/5800 [2:08:40<9:40:32,  6.93s/it]score1 tensor([[0.4180],
-        [0.4238],
-        [0.4062],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4336, 0.4062, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:17,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 10:55:17,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.76 | bwd_microstep: 4586.24 | bwd_inner_microstep: 4581.12 | bwd_allreduce_microstep: 4.99 | step_microstep: 44.43
-[2025-01-25 10:55:17,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.72 | bwd: 4586.27 | bwd_inner: 4581.12 | bwd_allreduce: 5.07 | step: 44.44
- 13%|█▎        | 775/5800 [2:08:47<9:38:53,  6.91s/it]                                                      {'loss': 0.0176, 'grad_norm': 2.056838274002075, 'learning_rate': 3.888424399986495e-05, 'epoch': 6.68}
- 13%|█▎        | 775/5800 [2:08:47<9:38:53,  6.91s/it]score1 tensor([[0.4141],
-        [0.4004],
-        [0.4434],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4512, 0.5078, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:24,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 10:55:24,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.45 | bwd_microstep: 4649.92 | bwd_inner_microstep: 4645.05 | bwd_allreduce_microstep: 4.80 | step_microstep: 47.49
-[2025-01-25 10:55:24,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.40 | bwd: 4649.94 | bwd_inner: 4645.05 | bwd_allreduce: 4.83 | step: 47.51
- 13%|█▎        | 776/5800 [2:08:54<9:39:27,  6.92s/it]                                                      {'loss': 0.0664, 'grad_norm': 8.137017250061035, 'learning_rate': 3.888056297228935e-05, 'epoch': 6.69}
- 13%|█▎        | 776/5800 [2:08:54<9:39:27,  6.92s/it]score1 tensor([[0.3887],
-        [0.4395],
-        [0.3906],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5703, 0.3730, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:31,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 10:55:31,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4651.90 | bwd_inner_microstep: 4646.29 | bwd_allreduce_microstep: 5.40 | step_microstep: 45.67
-[2025-01-25 10:55:31,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.45 | bwd: 4651.92 | bwd_inner: 4646.29 | bwd_allreduce: 5.53 | step: 45.68
- 13%|█▎        | 777/5800 [2:09:01<9:39:47,  6.93s/it]                                                      {'loss': 0.0649, 'grad_norm': 4.200384616851807, 'learning_rate': 3.8876876057426606e-05, 'epoch': 6.7}
- 13%|█▎        | 777/5800 [2:09:01<9:39:47,  6.93s/it]score1 tensor([[0.5391],
-        [0.4531],
-        [0.3594],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.3809, 0.3555, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:38,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 10:55:38,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.44 | bwd_microstep: 4643.06 | bwd_inner_microstep: 4638.21 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.48
-[2025-01-25 10:55:38,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.39 | bwd: 4643.08 | bwd_inner: 4638.21 | bwd_allreduce: 4.80 | step: 43.49
- 13%|█▎        | 778/5800 [2:09:08<9:39:40,  6.93s/it]                                                      {'loss': 0.084, 'grad_norm': 0.7712474465370178, 'learning_rate': 3.8873183256426356e-05, 'epoch': 6.71}
- 13%|█▎        | 778/5800 [2:09:08<9:39:40,  6.93s/it]score1 tensor([[0.3926],
-        [0.4688],
-        [0.4688],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5664, 0.5586, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:45,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 10:55:45,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.12 | bwd_microstep: 4646.67 | bwd_inner_microstep: 4641.02 | bwd_allreduce_microstep: 5.49 | step_microstep: 50.48
-[2025-01-25 10:55:45,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4646.71 | bwd_inner: 4641.02 | bwd_allreduce: 5.58 | step: 50.49
- 13%|█▎        | 779/5800 [2:09:15<9:40:03,  6.93s/it]                                                      {'loss': 0.0654, 'grad_norm': 8.389200210571289, 'learning_rate': 3.88694845704401e-05, 'epoch': 6.72}
- 13%|█▎        | 779/5800 [2:09:15<9:40:03,  6.93s/it]score1 tensor([[0.4531],
-        [0.4473],
-        [0.3945],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4414, 0.4141, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:51,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 10:55:51,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.27 | bwd_microstep: 4643.22 | bwd_inner_microstep: 4637.79 | bwd_allreduce_microstep: 5.33 | step_microstep: 47.66
-[2025-01-25 10:55:51,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.23 | bwd: 4643.25 | bwd_inner: 4637.79 | bwd_allreduce: 5.38 | step: 47.66
- 13%|█▎        | 780/5800 [2:09:21<9:40:19,  6.94s/it]                                                      {'loss': 0.0332, 'grad_norm': 0.5097090601921082, 'learning_rate': 3.8865780000621134e-05, 'epoch': 6.72}
- 13%|█▎        | 780/5800 [2:09:21<9:40:19,  6.94s/it]score1 tensor([[0.4414],
-        [0.5977],
-        [0.4688],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6133, 0.4980, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:55:58,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.27 | optimizer_step: 4.36
-[2025-01-25 10:55:58,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.48 | bwd_microstep: 4653.82 | bwd_inner_microstep: 4648.71 | bwd_allreduce_microstep: 5.03 | step_microstep: 66.64
-[2025-01-25 10:55:58,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.36 | bwd: 4653.84 | bwd_inner: 4648.71 | bwd_allreduce: 5.07 | step: 66.65
- 13%|█▎        | 781/5800 [2:09:28<9:41:14,  6.95s/it]                                                      {'loss': 0.0347, 'grad_norm': 4.629953861236572, 'learning_rate': 3.886206954812462e-05, 'epoch': 6.73}
- 13%|█▎        | 781/5800 [2:09:28<9:41:14,  6.95s/it]score1 tensor([[0.4297],
-        [0.4902],
-        [0.5039],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5820, 0.5000, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:05,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 10:56:05,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.46 | bwd_microstep: 4648.49 | bwd_inner_microstep: 4643.13 | bwd_allreduce_microstep: 5.19 | step_microstep: 41.96
-[2025-01-25 10:56:05,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.42 | bwd: 4648.54 | bwd_inner: 4643.13 | bwd_allreduce: 5.27 | step: 41.95
- 13%|█▎        | 782/5800 [2:09:35<9:41:01,  6.95s/it]                                                      {'loss': 0.0391, 'grad_norm': 0.5955618023872375, 'learning_rate': 3.8858353214107525e-05, 'epoch': 6.74}
- 13%|█▎        | 782/5800 [2:09:35<9:41:01,  6.95s/it]score1 tensor([[0.5820],
-        [0.5664],
-        [0.7031],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5977, 0.6562, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:12,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 10:56:12,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.56 | bwd_microstep: 4647.34 | bwd_inner_microstep: 4642.71 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.60
-[2025-01-25 10:56:12,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.52 | bwd: 4647.36 | bwd_inner: 4642.71 | bwd_allreduce: 4.59 | step: 41.61
- 14%|█▎        | 783/5800 [2:09:42<9:40:32,  6.94s/it]                                                      {'loss': 0.0513, 'grad_norm': 4.950308322906494, 'learning_rate': 3.885463099972869e-05, 'epoch': 6.75}
- 14%|█▎        | 783/5800 [2:09:42<9:40:32,  6.94s/it]score1 tensor([[0.5234],
-        [0.5547],
-        [0.5391],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4570, 0.5039, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:19,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 10:56:19,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.05 | bwd_microstep: 4647.43 | bwd_inner_microstep: 4642.46 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.14
-[2025-01-25 10:56:19,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.02 | bwd: 4647.45 | bwd_inner: 4642.46 | bwd_allreduce: 4.93 | step: 44.15
- 14%|█▎        | 784/5800 [2:09:49<9:40:04,  6.94s/it]                                                      {'loss': 0.043, 'grad_norm': 9.188799858093262, 'learning_rate': 3.885090290614875e-05, 'epoch': 6.76}
- 14%|█▎        | 784/5800 [2:09:49<9:40:04,  6.94s/it]score1 tensor([[0.6836],
-        [0.4258],
-        [0.5117],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3945, 0.5117, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:26,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 10:56:26,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.35 | bwd_microstep: 4583.60 | bwd_inner_microstep: 4578.75 | bwd_allreduce_microstep: 4.75 | step_microstep: 47.97
-[2025-01-25 10:56:26,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.32 | bwd: 4583.63 | bwd_inner: 4578.75 | bwd_allreduce: 4.80 | step: 47.98
- 14%|█▎        | 785/5800 [2:09:56<9:38:28,  6.92s/it]                                                      {'loss': 0.0205, 'grad_norm': 6.815266132354736, 'learning_rate': 3.884716893453018e-05, 'epoch': 6.77}
- 14%|█▎        | 785/5800 [2:09:56<9:38:28,  6.92s/it]score1 tensor([[0.5430],
-        [0.4980],
-        [0.6055],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4902, 0.6719, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:33,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 10:56:33,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.53 | bwd_microstep: 4646.44 | bwd_inner_microstep: 4641.02 | bwd_allreduce_microstep: 5.29 | step_microstep: 46.81
-[2025-01-25 10:56:33,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.47 | bwd: 4646.47 | bwd_inner: 4641.02 | bwd_allreduce: 5.36 | step: 46.82
- 14%|█▎        | 786/5800 [2:10:03<9:38:56,  6.93s/it]                                                      {'loss': 0.0293, 'grad_norm': 0.5328869223594666, 'learning_rate': 3.8843429086037325e-05, 'epoch': 6.78}
- 14%|█▎        | 786/5800 [2:10:03<9:38:56,  6.93s/it]score1 tensor([[0.4492],
-        [0.5469],
-        [0.5391],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.5039, 0.4844, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:40,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 10:56:40,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.79 | bwd_microstep: 4639.23 | bwd_inner_microstep: 4634.30 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.78
-[2025-01-25 10:56:40,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.75 | bwd: 4639.25 | bwd_inner: 4634.30 | bwd_allreduce: 4.88 | step: 42.79
- 14%|█▎        | 787/5800 [2:10:10<9:39:16,  6.93s/it]                                                      {'loss': 0.0483, 'grad_norm': 4.474018096923828, 'learning_rate': 3.883968336183631e-05, 'epoch': 6.78}
- 14%|█▎        | 787/5800 [2:10:10<9:39:16,  6.93s/it]score1 tensor([[0.5625],
-        [0.7109],
-        [0.4355],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.6445, 0.4473, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:47,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 10:56:47,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.41 | bwd_microstep: 4653.10 | bwd_inner_microstep: 4648.00 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.33
-[2025-01-25 10:56:47,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.38 | bwd: 4653.13 | bwd_inner: 4648.00 | bwd_allreduce: 5.05 | step: 42.34
- 14%|█▎        | 788/5800 [2:10:17<9:39:11,  6.93s/it]                                                      {'loss': 0.0356, 'grad_norm': 3.7272377014160156, 'learning_rate': 3.883593176309512e-05, 'epoch': 6.79}
- 14%|█▎        | 788/5800 [2:10:17<9:39:11,  6.93s/it]score1 tensor([[0.5117],
-        [0.5391],
-        [0.5117],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6484, 0.5430, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:56:54,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 10:56:54,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.20 | bwd_microstep: 4637.70 | bwd_inner_microstep: 4632.21 | bwd_allreduce_microstep: 5.40 | step_microstep: 43.84
-[2025-01-25 10:56:54,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4637.72 | bwd_inner: 4632.21 | bwd_allreduce: 5.44 | step: 43.87
- 14%|█▎        | 789/5800 [2:10:24<9:39:02,  6.93s/it]                                                      {'loss': 0.0479, 'grad_norm': 4.798167705535889, 'learning_rate': 3.883217429098359e-05, 'epoch': 6.8}
- 14%|█▎        | 789/5800 [2:10:24<9:39:02,  6.93s/it]score1 tensor([[0.3691],
-        [0.4316],
-        [0.4785],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5352, 0.6055, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:01,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.17 | optimizer_step: 4.37
-[2025-01-25 10:57:01,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.20 | bwd_microstep: 4642.65 | bwd_inner_microstep: 4638.69 | bwd_allreduce_microstep: 3.89 | step_microstep: 34.15
-[2025-01-25 10:57:01,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.17 | bwd: 4642.66 | bwd_inner: 4638.69 | bwd_allreduce: 3.92 | step: 34.16
- 14%|█▎        | 790/5800 [2:10:31<9:38:58,  6.93s/it]                                                      {'loss': 0.0684, 'grad_norm': 8.521708488464355, 'learning_rate': 3.882841094667334e-05, 'epoch': 6.81}
- 14%|█▎        | 790/5800 [2:10:31<9:38:58,  6.93s/it]score1 tensor([[0.7383],
-        [0.4844],
-        [0.6055],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5312, 0.6875, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:08,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 10:57:08,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.61 | bwd_microstep: 4647.25 | bwd_inner_microstep: 4641.49 | bwd_allreduce_microstep: 5.67 | step_microstep: 58.73
-[2025-01-25 10:57:08,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.58 | bwd: 4647.27 | bwd_inner: 4641.49 | bwd_allreduce: 5.71 | step: 58.73
- 14%|█▎        | 791/5800 [2:10:38<9:39:30,  6.94s/it]                                                      {'loss': 0.0498, 'grad_norm': 1.1413573026657104, 'learning_rate': 3.8824641731337855e-05, 'epoch': 6.82}
- 14%|█▎        | 791/5800 [2:10:38<9:39:30,  6.94s/it]score1 tensor([[0.4746],
-        [0.4824],
-        [0.6367],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.5078, 0.6133, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:15,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 10:57:15,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.38 | bwd_microstep: 4638.35 | bwd_inner_microstep: 4633.53 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.99
-[2025-01-25 10:57:15,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.34 | bwd: 4638.37 | bwd_inner: 4633.53 | bwd_allreduce: 4.78 | step: 44.97
- 14%|█▎        | 792/5800 [2:10:45<9:38:47,  6.93s/it]                                                      {'loss': 0.0293, 'grad_norm': 4.764614582061768, 'learning_rate': 3.882086664615245e-05, 'epoch': 6.83}
- 14%|█▎        | 792/5800 [2:10:45<9:38:47,  6.93s/it]score1 tensor([[0.5117],
-        [0.4355],
-        [0.4980],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4043, 0.4512, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:22,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 10:57:22,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.02 | bwd_microstep: 4643.52 | bwd_inner_microstep: 4639.43 | bwd_allreduce_microstep: 4.03 | step_microstep: 39.85
-[2025-01-25 10:57:22,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4643.53 | bwd_inner: 4639.43 | bwd_allreduce: 4.06 | step: 39.86
- 14%|█▎        | 793/5800 [2:10:52<9:38:11,  6.93s/it]                                                      {'loss': 0.0264, 'grad_norm': 4.204931259155273, 'learning_rate': 3.881708569229425e-05, 'epoch': 6.84}
- 14%|█▎        | 793/5800 [2:10:52<9:38:11,  6.93s/it]score1 tensor([[0.5039],
-        [0.5273],
-        [0.5156],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6484, 0.4863, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:29,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.29 | optimizer_step: 4.37
-[2025-01-25 10:57:29,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.26 | bwd_microstep: 4642.50 | bwd_inner_microstep: 4637.78 | bwd_allreduce_microstep: 4.60 | step_microstep: 43.19
-[2025-01-25 10:57:29,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.22 | bwd: 4642.53 | bwd_inner: 4637.78 | bwd_allreduce: 4.66 | step: 43.21
- 14%|█▎        | 794/5800 [2:10:59<9:38:09,  6.93s/it]                                                      {'loss': 0.0688, 'grad_norm': 0.5071055293083191, 'learning_rate': 3.8813298870942225e-05, 'epoch': 6.84}
- 14%|█▎        | 794/5800 [2:10:59<9:38:09,  6.93s/it]score1 tensor([[0.4688],
-        [0.4707],
-        [0.5352],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.4180, 0.4023, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0635, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:35,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.20 | optimizer_step: 4.37
-[2025-01-25 10:57:35,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.92 | bwd_microstep: 4648.79 | bwd_inner_microstep: 4644.81 | bwd_allreduce_microstep: 3.92 | step_microstep: 37.62
-[2025-01-25 10:57:35,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4648.80 | bwd_inner: 4644.81 | bwd_allreduce: 3.94 | step: 37.64
- 14%|█▎        | 795/5800 [2:11:05<9:37:42,  6.93s/it]                                                      {'loss': 0.0635, 'grad_norm': 8.944884300231934, 'learning_rate': 3.8809506183277165e-05, 'epoch': 6.85}
- 14%|█▎        | 795/5800 [2:11:05<9:37:42,  6.93s/it]score1 tensor([[0.5430],
-        [0.4160],
-        [0.4316],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4219, 0.4004, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:42,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.36
-[2025-01-25 10:57:42,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.12 | bwd_microstep: 4639.69 | bwd_inner_microstep: 4634.81 | bwd_allreduce_microstep: 4.72 | step_microstep: 50.03
-[2025-01-25 10:57:42,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.12 | bwd: 4639.73 | bwd_inner: 4634.81 | bwd_allreduce: 4.79 | step: 50.02
- 14%|█▎        | 796/5800 [2:11:12<9:38:03,  6.93s/it]                                                      {'loss': 0.0254, 'grad_norm': 4.387156009674072, 'learning_rate': 3.8805707630481716e-05, 'epoch': 6.86}
- 14%|█▎        | 796/5800 [2:11:12<9:38:03,  6.93s/it]score1 tensor([[0.4258],
-        [0.4023],
-        [0.4258],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4746, 0.3789, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:49,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.39 | optimizer_step: 4.36
-[2025-01-25 10:57:49,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4634.23 | bwd_inner_microstep: 4630.36 | bwd_allreduce_microstep: 3.81 | step_microstep: 34.25
-[2025-01-25 10:57:49,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4634.25 | bwd_inner: 4630.36 | bwd_allreduce: 3.84 | step: 34.26
- 14%|█▎        | 797/5800 [2:11:19<9:37:27,  6.93s/it]                                                      {'loss': 0.043, 'grad_norm': 4.2342681884765625, 'learning_rate': 3.8801903213740314e-05, 'epoch': 6.87}
- 14%|█▎        | 797/5800 [2:11:19<9:37:27,  6.93s/it]score1 tensor([[0.4727],
-        [0.5469],
-        [0.5508],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5273, 0.5898, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:57:56,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.19 | optimizer_step: 4.36
-[2025-01-25 10:57:56,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.31 | bwd_microstep: 4644.09 | bwd_inner_microstep: 4640.21 | bwd_allreduce_microstep: 3.82 | step_microstep: 34.84
-[2025-01-25 10:57:56,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.28 | bwd: 4644.11 | bwd_inner: 4640.21 | bwd_allreduce: 3.85 | step: 34.85
- 14%|█▍        | 798/5800 [2:11:26<9:36:51,  6.92s/it]                                                      {'loss': 0.0303, 'grad_norm': 4.647951602935791, 'learning_rate': 3.879809293423925e-05, 'epoch': 6.88}
- 14%|█▍        | 798/5800 [2:11:26<9:36:51,  6.92s/it]score1 tensor([[0.5195],
-        [0.4766],
-        [0.4570],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4277, 0.4180, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:03,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 10:58:03,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.16 | bwd_microstep: 4641.34 | bwd_inner_microstep: 4636.54 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.12
-[2025-01-25 10:58:03,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.16 | bwd: 4641.36 | bwd_inner: 4636.54 | bwd_allreduce: 4.74 | step: 43.12
- 14%|█▍        | 799/5800 [2:11:33<9:36:34,  6.92s/it]                                                      {'loss': 0.0308, 'grad_norm': 0.7642914652824402, 'learning_rate': 3.8794276793166636e-05, 'epoch': 6.89}
- 14%|█▍        | 799/5800 [2:11:33<9:36:34,  6.92s/it]score1 tensor([[0.5859],
-        [0.5898],
-        [0.5664],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5391, 0.5273, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:10,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.23 | optimizer_step: 4.36
-[2025-01-25 10:58:10,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.91 | bwd_microstep: 4643.28 | bwd_inner_microstep: 4639.34 | bwd_allreduce_microstep: 3.89 | step_microstep: 40.72
-[2025-01-25 10:58:10,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.89 | bwd: 4643.30 | bwd_inner: 4639.34 | bwd_allreduce: 3.92 | step: 40.74
- 14%|█▍        | 800/5800 [2:11:40<9:36:38,  6.92s/it]                                                      {'loss': 0.0381, 'grad_norm': 4.619453430175781, 'learning_rate': 3.8790454791712414e-05, 'epoch': 6.9}
- 14%|█▍        | 800/5800 [2:11:40<9:36:38,  6.92s/it]score1 tensor([[0.4941],
-        [0.4922],
-        [0.5156],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4023, 0.4570, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:17,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.82 | optimizer_step: 4.64
-[2025-01-25 10:58:17,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.15 | bwd_microstep: 4635.25 | bwd_inner_microstep: 4629.99 | bwd_allreduce_microstep: 5.16 | step_microstep: 51.91
-[2025-01-25 10:58:17,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.12 | bwd: 4635.28 | bwd_inner: 4629.99 | bwd_allreduce: 5.21 | step: 51.92
- 14%|█▍        | 801/5800 [2:11:47<9:36:37,  6.92s/it]                                                      {'loss': 0.0586, 'grad_norm': 8.818180084228516, 'learning_rate': 3.8786626931068336e-05, 'epoch': 6.91}
- 14%|█▍        | 801/5800 [2:11:47<9:36:37,  6.92s/it]score1 tensor([[0.5156],
-        [0.5078],
-        [0.5000],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4473, 0.4199, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:24,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.23 | optimizer_step: 4.37
-[2025-01-25 10:58:24,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.12 | bwd_microstep: 4638.38 | bwd_inner_microstep: 4634.25 | bwd_allreduce_microstep: 4.06 | step_microstep: 32.60
-[2025-01-25 10:58:24,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4638.39 | bwd_inner: 4634.25 | bwd_allreduce: 4.09 | step: 32.60
- 14%|█▍        | 802/5800 [2:11:54<9:36:10,  6.92s/it]                                                      {'loss': 0.0605, 'grad_norm': 4.642171859741211, 'learning_rate': 3.878279321242801e-05, 'epoch': 6.91}
- 14%|█▍        | 802/5800 [2:11:54<9:36:10,  6.92s/it]score1 tensor([[0.5469],
-        [0.3711],
-        [0.6562],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3223, 0.6445, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:31,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.31 | optimizer_step: 4.37
-[2025-01-25 10:58:31,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.16 | bwd_microstep: 4637.98 | bwd_inner_microstep: 4634.12 | bwd_allreduce_microstep: 3.80 | step_microstep: 33.56
-[2025-01-25 10:58:31,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.14 | bwd: 4638.00 | bwd_inner: 4634.12 | bwd_allreduce: 3.83 | step: 33.57
- 14%|█▍        | 803/5800 [2:12:01<9:35:19,  6.91s/it]                                                      {'loss': 0.0322, 'grad_norm': 4.31032133102417, 'learning_rate': 3.877895363698684e-05, 'epoch': 6.92}
- 14%|█▍        | 803/5800 [2:12:01<9:35:19,  6.91s/it]score1 tensor([[0.4824],
-        [0.3867],
-        [0.4453],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4004, 0.4473, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:38,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.25 | optimizer_step: 4.37
-[2025-01-25 10:58:38,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4634.62 | bwd_inner_microstep: 4630.30 | bwd_allreduce_microstep: 4.26 | step_microstep: 37.26
-[2025-01-25 10:58:38,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4634.64 | bwd_inner: 4630.30 | bwd_allreduce: 4.29 | step: 37.27
- 14%|█▍        | 804/5800 [2:12:08<9:34:42,  6.90s/it]                                                      {'loss': 0.019, 'grad_norm': 8.489763259887695, 'learning_rate': 3.87751082059421e-05, 'epoch': 6.93}
- 14%|█▍        | 804/5800 [2:12:08<9:34:42,  6.90s/it]score1 tensor([[0.4336],
-        [0.3770],
-        [0.3594],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4570, 0.2812, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:45,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.41 | optimizer_step: 4.37
-[2025-01-25 10:58:45,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.10 | bwd_microstep: 4640.12 | bwd_inner_microstep: 4635.55 | bwd_allreduce_microstep: 4.48 | step_microstep: 40.98
-[2025-01-25 10:58:45,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.07 | bwd: 4640.15 | bwd_inner: 4635.55 | bwd_allreduce: 4.52 | step: 40.99
- 14%|█▍        | 805/5800 [2:12:15<9:35:04,  6.91s/it]                                                      {'loss': 0.0693, 'grad_norm': 4.234799385070801, 'learning_rate': 3.877125692049283e-05, 'epoch': 6.94}
- 14%|█▍        | 805/5800 [2:12:15<9:35:04,  6.91s/it]score1 tensor([[0.3340],
-        [0.3750],
-        [0.3984],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.4492, 0.5664, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0713, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:52,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 10:58:52,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.87 | bwd_microstep: 4642.57 | bwd_inner_microstep: 4637.93 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.05
-[2025-01-25 10:58:52,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.84 | bwd: 4642.59 | bwd_inner: 4637.93 | bwd_allreduce: 4.60 | step: 43.05
- 14%|█▍        | 806/5800 [2:12:22<9:35:22,  6.91s/it]                                                      {'loss': 0.0713, 'grad_norm': 3.774369478225708, 'learning_rate': 3.876739978183995e-05, 'epoch': 6.95}
- 14%|█▍        | 806/5800 [2:12:22<9:35:22,  6.91s/it]score1 tensor([[0.3926],
-        [0.3535],
-        [0.3340],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4473, 0.4609, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1240, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:58:58,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.06 | optimizer_step: 4.36
-[2025-01-25 10:58:58,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.60 | bwd_microstep: 4644.56 | bwd_inner_microstep: 4639.59 | bwd_allreduce_microstep: 4.88 | step_microstep: 41.69
-[2025-01-25 10:58:58,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.57 | bwd: 4644.59 | bwd_inner: 4639.59 | bwd_allreduce: 4.93 | step: 41.71
- 14%|█▍        | 807/5800 [2:12:28<9:35:29,  6.92s/it]                                                      {'loss': 0.124, 'grad_norm': 8.094371795654297, 'learning_rate': 3.876353679118617e-05, 'epoch': 6.96}
- 14%|█▍        | 807/5800 [2:12:28<9:35:29,  6.92s/it]score1 tensor([[0.5703],
-        [0.3320],
-        [0.4609],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5156, 0.4121, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:59:05,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 10:59:05,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.57 | bwd_microstep: 4637.49 | bwd_inner_microstep: 4632.01 | bwd_allreduce_microstep: 5.34 | step_microstep: 53.15
-[2025-01-25 10:59:05,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.53 | bwd: 4637.52 | bwd_inner: 4632.01 | bwd_allreduce: 5.41 | step: 53.17
- 14%|█▍        | 808/5800 [2:12:35<9:35:54,  6.92s/it]                                                      {'loss': 0.1309, 'grad_norm': 4.300802707672119, 'learning_rate': 3.875966794973605e-05, 'epoch': 6.97}
- 14%|█▍        | 808/5800 [2:12:35<9:35:54,  6.92s/it]score1 tensor([[0.3438],
-        [0.3477],
-        [0.4609],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4395, 0.4941, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:59:12,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 10:59:12,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.12 | bwd_microstep: 4654.15 | bwd_inner_microstep: 4647.87 | bwd_allreduce_microstep: 6.10 | step_microstep: 48.37
-[2025-01-25 10:59:12,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.07 | bwd: 4654.18 | bwd_inner: 4647.87 | bwd_allreduce: 6.21 | step: 48.38
- 14%|█▍        | 809/5800 [2:12:42<9:36:28,  6.93s/it]                                                      {'loss': 0.0693, 'grad_norm': 7.970531463623047, 'learning_rate': 3.875579325869595e-05, 'epoch': 6.97}
- 14%|█▍        | 809/5800 [2:12:42<9:36:28,  6.93s/it]score1 tensor([[0.6094],
-        [0.4492],
-        [0.6016],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4395, 0.6797, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:59:19,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 10:59:19,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.83 | bwd_microstep: 4643.82 | bwd_inner_microstep: 4638.55 | bwd_allreduce_microstep: 5.18 | step_microstep: 44.59
-[2025-01-25 10:59:19,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.80 | bwd: 4643.84 | bwd_inner: 4638.55 | bwd_allreduce: 5.22 | step: 44.61
- 14%|█▍        | 810/5800 [2:12:49<9:36:25,  6.93s/it]                                                      {'loss': 0.0435, 'grad_norm': 4.942034721374512, 'learning_rate': 3.875191271927407e-05, 'epoch': 6.98}
- 14%|█▍        | 810/5800 [2:12:49<9:36:25,  6.93s/it]score1 tensor([[0.6055],
-        [0.5195],
-        [0.4922],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5273, 0.5352, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:59:26,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 10:59:26,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.51 | bwd_microstep: 4644.68 | bwd_inner_microstep: 4635.56 | bwd_allreduce_microstep: 9.00 | step_microstep: 44.30
-[2025-01-25 10:59:26,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.46 | bwd: 4644.70 | bwd_inner: 4635.56 | bwd_allreduce: 9.07 | step: 44.32
- 14%|█▍        | 811/5800 [2:12:56<9:36:07,  6.93s/it]                                                      {'loss': 0.0571, 'grad_norm': 4.04829216003418, 'learning_rate': 3.874802633268043e-05, 'epoch': 6.99}
- 14%|█▍        | 811/5800 [2:12:56<9:36:07,  6.93s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 10:59:31,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 10:59:31,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 575.35 | bwd_microstep: 1225.92 | bwd_inner_microstep: 1220.71 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.14
-[2025-01-25 10:59:31,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 575.29 | bwd: 1225.94 | bwd_inner: 1220.71 | bwd_allreduce: 5.16 | step: 43.16
- 14%|█▍        | 812/5800 [2:13:01<8:32:30,  6.16s/it]                                                      {'loss': 0.0195, 'grad_norm': 9.251683235168457, 'learning_rate': 3.874413410012688e-05, 'epoch': 7.0}
- 14%|█▍        | 812/5800 [2:13:01<8:32:30,  6.16s/it][2025-01-25 10:59:35,583] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:59:46,153] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 10:59:56,549] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 11:00:06,826] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.6406],
-        [0.5938],
-        [0.6172],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4473, 0.4766, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1377, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:00:22,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 11:00:22,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.86 | bwd_microstep: 4586.29 | bwd_inner_microstep: 4581.45 | bwd_allreduce_microstep: 4.73 | step_microstep: 57.82
-[2025-01-25 11:00:22,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.80 | bwd: 4586.32 | bwd_inner: 4581.45 | bwd_allreduce: 4.79 | step: 57.83
- 14%|█▍        | 813/5800 [2:13:52<27:15:53, 19.68s/it]                                                       {'loss': 0.1377, 'grad_norm': 9.698155403137207, 'learning_rate': 3.874023602282707e-05, 'epoch': 7.01}
- 14%|█▍        | 813/5800 [2:13:52<27:15:53, 19.68s/it]score1 tensor([[0.5938],
-        [0.5430],
-        [0.5273],
-        [0.7500]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4062, 0.4004, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1514, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:00:29,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 11:00:29,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.53 | bwd_microstep: 4585.69 | bwd_inner_microstep: 4577.26 | bwd_allreduce_microstep: 8.20 | step_microstep: 60.35
-[2025-01-25 11:00:29,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.45 | bwd: 4585.77 | bwd_inner: 4577.26 | bwd_allreduce: 8.32 | step: 60.33
- 14%|█▍        | 814/5800 [2:13:59<21:56:23, 15.84s/it]                                                       {'loss': 0.1514, 'grad_norm': 9.514471054077148, 'learning_rate': 3.87363321019965e-05, 'epoch': 7.02}
- 14%|█▍        | 814/5800 [2:13:59<21:56:23, 15.84s/it]score1 tensor([[0.7891],
-        [0.6289],
-        [0.6094],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4941, 0.4648, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1777, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:00:36,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.36
-[2025-01-25 11:00:36,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.47 | bwd_microstep: 4599.59 | bwd_inner_microstep: 4591.76 | bwd_allreduce_microstep: 7.63 | step_microstep: 47.73
-[2025-01-25 11:00:36,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.44 | bwd: 4599.65 | bwd_inner: 4591.76 | bwd_allreduce: 7.73 | step: 47.71
- 14%|█▍        | 815/5800 [2:14:06<18:12:37, 13.15s/it]                                                       {'loss': 0.1777, 'grad_norm': 9.968300819396973, 'learning_rate': 3.873242233885248e-05, 'epoch': 7.03}
- 14%|█▍        | 815/5800 [2:14:06<18:12:37, 13.15s/it]score1 tensor([[0.7148],
-        [0.7383],
-        [0.7070],
-        [0.8594]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5039, 0.4609, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1934, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:00:42,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 11:00:42,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.08 | bwd_microstep: 4603.67 | bwd_inner_microstep: 4598.84 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.23
-[2025-01-25 11:00:42,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.03 | bwd: 4603.70 | bwd_inner: 4598.84 | bwd_allreduce: 4.79 | step: 43.24
- 14%|█▍        | 816/5800 [2:14:12<15:35:50, 11.27s/it]                                                       {'loss': 0.1934, 'grad_norm': 10.900458335876465, 'learning_rate': 3.872850673461413e-05, 'epoch': 7.03}
- 14%|█▍        | 816/5800 [2:14:12<15:35:50, 11.27s/it]score1 tensor([[0.6367],
-        [0.6133],
-        [0.8828],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4609, 0.5977, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1924, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:00:49,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 11:00:49,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.02 | bwd_microstep: 4609.19 | bwd_inner_microstep: 4604.28 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.01
-[2025-01-25 11:00:49,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.98 | bwd: 4609.21 | bwd_inner: 4604.28 | bwd_allreduce: 4.86 | step: 43.03
- 14%|█▍        | 817/5800 [2:14:19<13:46:20,  9.95s/it]                                                       {'loss': 0.1924, 'grad_norm': 10.126873016357422, 'learning_rate': 3.872458529050242e-05, 'epoch': 7.04}
- 14%|█▍        | 817/5800 [2:14:19<13:46:20,  9.95s/it]score1 tensor([[0.5586],
-        [0.6250],
-        [0.6133],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4609, 0.4336, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:00:56,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 11:00:56,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.15 | bwd_microstep: 4624.08 | bwd_inner_microstep: 4618.81 | bwd_allreduce_microstep: 5.17 | step_microstep: 44.28
-[2025-01-25 11:00:56,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.11 | bwd: 4624.10 | bwd_inner: 4618.81 | bwd_allreduce: 5.22 | step: 44.28
- 14%|█▍        | 818/5800 [2:14:26<12:29:55,  9.03s/it]                                                       {'loss': 0.1426, 'grad_norm': 9.49283504486084, 'learning_rate': 3.872065800774011e-05, 'epoch': 7.05}
- 14%|█▍        | 818/5800 [2:14:26<12:29:55,  9.03s/it]score1 tensor([[0.5703],
-        [0.6367],
-        [0.6719],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5469, 0.5820, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1045, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:03,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 11:01:03,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.08 | bwd_microstep: 4603.53 | bwd_inner_microstep: 4598.84 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.19
-[2025-01-25 11:01:03,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.03 | bwd: 4603.55 | bwd_inner: 4598.84 | bwd_allreduce: 4.65 | step: 42.20
- 14%|█▍        | 819/5800 [2:14:33<11:35:58,  8.38s/it]                                                       {'loss': 0.1045, 'grad_norm': 9.329605102539062, 'learning_rate': 3.8716724887551806e-05, 'epoch': 7.06}
- 14%|█▍        | 819/5800 [2:14:33<11:35:58,  8.38s/it]score1 tensor([[0.7578],
-        [0.5859],
-        [0.6016],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5703, 0.5430, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0747, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:10,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 11:01:10,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.20 | bwd_microstep: 4610.33 | bwd_inner_microstep: 4605.69 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.15
-[2025-01-25 11:01:10,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.15 | bwd: 4610.35 | bwd_inner: 4605.69 | bwd_allreduce: 4.60 | step: 42.16
- 14%|█▍        | 820/5800 [2:14:40<10:58:20,  7.93s/it]                                                       {'loss': 0.0747, 'grad_norm': 9.648393630981445, 'learning_rate': 3.8712785931163924e-05, 'epoch': 7.07}
- 14%|█▍        | 820/5800 [2:14:40<10:58:20,  7.93s/it]score1 tensor([[0.5391],
-        [0.6562],
-        [0.7383],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5508, 0.6484, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:17,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 11:01:17,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.62 | bwd_microstep: 4608.16 | bwd_inner_microstep: 4603.25 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.38
-[2025-01-25 11:01:17,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.59 | bwd: 4608.19 | bwd_inner: 4603.25 | bwd_allreduce: 4.85 | step: 42.38
- 14%|█▍        | 821/5800 [2:14:47<10:31:55,  7.62s/it]                                                       {'loss': 0.0757, 'grad_norm': 1.9393346309661865, 'learning_rate': 3.870884113980469e-05, 'epoch': 7.08}
- 14%|█▍        | 821/5800 [2:14:47<10:31:55,  7.62s/it]score1 tensor([[0.4727],
-        [0.4648],
-        [0.6992],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4668, 0.6289, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:24,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.47 | optimizer_step: 4.37
-[2025-01-25 11:01:24,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.71 | bwd_microstep: 4614.63 | bwd_inner_microstep: 4609.40 | bwd_allreduce_microstep: 5.11 | step_microstep: 67.59
-[2025-01-25 11:01:24,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.67 | bwd: 4614.66 | bwd_inner: 4609.40 | bwd_allreduce: 5.16 | step: 67.61
- 14%|█▍        | 822/5800 [2:14:54<10:14:22,  7.41s/it]                                                       {'loss': 0.0312, 'grad_norm': 3.6993556022644043, 'learning_rate': 3.870489051470416e-05, 'epoch': 7.09}
- 14%|█▍        | 822/5800 [2:14:54<10:14:22,  7.41s/it]score1 tensor([[0.4453],
-        [0.4062],
-        [0.5820],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4707, 0.6211, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:31,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.91 | optimizer_step: 4.53
-[2025-01-25 11:01:31,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.25 | bwd_microstep: 4614.00 | bwd_inner_microstep: 4609.09 | bwd_allreduce_microstep: 4.80 | step_microstep: 59.42
-[2025-01-25 11:01:31,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.17 | bwd: 4614.03 | bwd_inner: 4609.09 | bwd_allreduce: 4.85 | step: 59.48
- 14%|█▍        | 823/5800 [2:15:01<10:02:24,  7.26s/it]                                                       {'loss': 0.0342, 'grad_norm': 4.226658821105957, 'learning_rate': 3.870093405709421e-05, 'epoch': 7.09}
- 14%|█▍        | 823/5800 [2:15:01<10:02:24,  7.26s/it]score1 tensor([[0.5078],
-        [0.4395],
-        [0.4609],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4863, 0.5312, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:38,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 11:01:38,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.97 | bwd_microstep: 4614.81 | bwd_inner_microstep: 4609.94 | bwd_allreduce_microstep: 4.78 | step_microstep: 64.54
-[2025-01-25 11:01:38,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4614.83 | bwd_inner: 4609.94 | bwd_allreduce: 4.82 | step: 64.55
- 14%|█▍        | 824/5800 [2:15:08<9:53:38,  7.16s/it]                                                       {'loss': 0.0596, 'grad_norm': 8.055760383605957, 'learning_rate': 3.869697176820853e-05, 'epoch': 7.1}
- 14%|█▍        | 824/5800 [2:15:08<9:53:38,  7.16s/it]score1 tensor([[0.4082],
-        [0.5156],
-        [0.5039],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3887, 0.4941, 0.6289, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:44,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 11:01:44,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.94 | bwd_microstep: 4622.94 | bwd_inner_microstep: 4615.49 | bwd_allreduce_microstep: 7.36 | step_microstep: 58.47
-[2025-01-25 11:01:44,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.91 | bwd: 4622.97 | bwd_inner: 4615.49 | bwd_allreduce: 7.41 | step: 58.48
- 14%|█▍        | 825/5800 [2:15:14<9:47:48,  7.09s/it]                                                      {'loss': 0.0542, 'grad_norm': 0.4667023718357086, 'learning_rate': 3.869300364928263e-05, 'epoch': 7.11}
- 14%|█▍        | 825/5800 [2:15:14<9:47:48,  7.09s/it]score1 tensor([[0.5117],
-        [0.4336],
-        [0.6094],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.4160, 0.6875, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:51,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 11:01:51,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.92 | bwd_microstep: 4614.94 | bwd_inner_microstep: 4609.37 | bwd_allreduce_microstep: 5.47 | step_microstep: 46.94
-[2025-01-25 11:01:51,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.89 | bwd: 4614.96 | bwd_inner: 4609.37 | bwd_allreduce: 5.52 | step: 46.94
- 14%|█▍        | 826/5800 [2:15:21<9:43:04,  7.03s/it]                                                      {'loss': 0.0698, 'grad_norm': 4.5600457191467285, 'learning_rate': 3.868902970155384e-05, 'epoch': 7.12}
- 14%|█▍        | 826/5800 [2:15:21<9:43:04,  7.03s/it]score1 tensor([[0.4590],
-        [0.4062],
-        [0.4883],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4199, 0.4492, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:01:58,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 11:01:58,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.94 | bwd_microstep: 4620.25 | bwd_inner_microstep: 4615.04 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.56
-[2025-01-25 11:01:58,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.82 | bwd: 4620.28 | bwd_inner: 4615.04 | bwd_allreduce: 5.15 | step: 44.58
- 14%|█▍        | 827/5800 [2:15:28<9:40:01,  7.00s/it]                                                      {'loss': 0.0283, 'grad_norm': 4.382079124450684, 'learning_rate': 3.868504992626131e-05, 'epoch': 7.13}
- 14%|█▍        | 827/5800 [2:15:28<9:40:01,  7.00s/it]score1 tensor([[0.4492],
-        [0.4219],
-        [0.4688],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.3672, 0.4004, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:05,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 11:02:05,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.86 | bwd_microstep: 4622.01 | bwd_inner_microstep: 4616.67 | bwd_allreduce_microstep: 5.25 | step_microstep: 44.05
-[2025-01-25 11:02:05,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.83 | bwd: 4622.04 | bwd_inner: 4616.67 | bwd_allreduce: 5.29 | step: 44.06
- 14%|█▍        | 828/5800 [2:15:35<9:37:48,  6.97s/it]                                                      {'loss': 0.0386, 'grad_norm': 0.5687318444252014, 'learning_rate': 3.8681064324646004e-05, 'epoch': 7.14}
- 14%|█▍        | 828/5800 [2:15:35<9:37:48,  6.97s/it]score1 tensor([[0.5117],
-        [0.5078],
-        [0.5312],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5078, 0.5508, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:12,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 11:02:12,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.88 | bwd_microstep: 4566.38 | bwd_inner_microstep: 4561.65 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.65
-[2025-01-25 11:02:12,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.84 | bwd: 4566.40 | bwd_inner: 4561.65 | bwd_allreduce: 4.68 | step: 42.66
- 14%|█▍        | 829/5800 [2:15:42<9:34:31,  6.93s/it]                                                      {'loss': 0.0166, 'grad_norm': 6.468243598937988, 'learning_rate': 3.867707289795069e-05, 'epoch': 7.15}
- 14%|█▍        | 829/5800 [2:15:42<9:34:31,  6.93s/it]score1 tensor([[0.4922],
-        [0.5117],
-        [0.5117],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4121, 0.5195, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:19,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 11:02:19,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.01 | bwd_microstep: 4577.89 | bwd_inner_microstep: 4572.89 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.71
-[2025-01-25 11:02:19,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.97 | bwd: 4577.91 | bwd_inner: 4572.89 | bwd_allreduce: 4.95 | step: 43.72
- 14%|█▍        | 830/5800 [2:15:49<9:32:16,  6.91s/it]                                                      {'loss': 0.0308, 'grad_norm': 2.44063663482666, 'learning_rate': 3.8673075647419976e-05, 'epoch': 7.16}
- 14%|█▍        | 830/5800 [2:15:49<9:32:16,  6.91s/it]score1 tensor([[0.5039],
-        [0.5000],
-        [0.5625],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4785, 0.5664, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:26,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 11:02:26,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4620.14 | bwd_inner_microstep: 4614.74 | bwd_allreduce_microstep: 5.30 | step_microstep: 43.70
-[2025-01-25 11:02:26,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.32 | bwd: 4620.16 | bwd_inner: 4614.74 | bwd_allreduce: 5.35 | step: 43.71
- 14%|█▍        | 831/5800 [2:15:56<9:31:51,  6.91s/it]                                                      {'loss': 0.0283, 'grad_norm': 4.111080169677734, 'learning_rate': 3.866907257430027e-05, 'epoch': 7.16}
- 14%|█▍        | 831/5800 [2:15:56<9:31:51,  6.91s/it]score1 tensor([[0.5039],
-        [0.4863],
-        [0.5508],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4551, 0.4648, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:33,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 11:02:33,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.51 | bwd_microstep: 4618.88 | bwd_inner_microstep: 4613.33 | bwd_allreduce_microstep: 5.38 | step_microstep: 46.96
-[2025-01-25 11:02:33,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.47 | bwd: 4618.92 | bwd_inner: 4613.34 | bwd_allreduce: 5.47 | step: 46.94
- 14%|█▍        | 832/5800 [2:16:03<9:32:06,  6.91s/it]                                                      {'loss': 0.0508, 'grad_norm': 4.421143054962158, 'learning_rate': 3.866506367983979e-05, 'epoch': 7.17}
- 14%|█▍        | 832/5800 [2:16:03<9:32:06,  6.91s/it]score1 tensor([[0.4746],
-        [0.4863],
-        [0.4414],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4355, 0.3613, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:40,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 11:02:40,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4628.64 | bwd_inner_microstep: 4623.76 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.13
-[2025-01-25 11:02:40,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.59 | bwd: 4628.67 | bwd_inner: 4623.76 | bwd_allreduce: 4.83 | step: 43.14
- 14%|█▍        | 833/5800 [2:16:10<9:32:21,  6.91s/it]                                                      {'loss': 0.0498, 'grad_norm': 8.092841148376465, 'learning_rate': 3.866104896528859e-05, 'epoch': 7.18}
- 14%|█▍        | 833/5800 [2:16:10<9:32:21,  6.91s/it]score1 tensor([[0.5391],
-        [0.5078],
-        [0.6289],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4902, 0.6562, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:47,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 11:02:47,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.12 | bwd_microstep: 4623.21 | bwd_inner_microstep: 4618.00 | bwd_allreduce_microstep: 5.10 | step_microstep: 55.74
-[2025-01-25 11:02:47,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.06 | bwd: 4623.23 | bwd_inner: 4618.00 | bwd_allreduce: 5.15 | step: 55.75
- 14%|█▍        | 834/5800 [2:16:17<9:32:52,  6.92s/it]                                                      {'loss': 0.0288, 'grad_norm': 4.772640705108643, 'learning_rate': 3.865702843189853e-05, 'epoch': 7.19}
- 14%|█▍        | 834/5800 [2:16:17<9:32:52,  6.92s/it]score1 tensor([[0.5781],
-        [0.5664],
-        [0.5000],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5625, 0.3457, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:02:54,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 11:02:54,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.85 | bwd_microstep: 4627.55 | bwd_inner_microstep: 4622.55 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.97
-[2025-01-25 11:02:54,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.75 | bwd: 4627.57 | bwd_inner: 4622.55 | bwd_allreduce: 4.96 | step: 44.98
- 14%|█▍        | 835/5800 [2:16:24<9:32:55,  6.92s/it]                                                      {'loss': 0.0522, 'grad_norm': 8.971495628356934, 'learning_rate': 3.8653002080923274e-05, 'epoch': 7.2}
- 14%|█▍        | 835/5800 [2:16:24<9:32:55,  6.92s/it]score1 tensor([[0.4512],
-        [0.6055],
-        [0.4629],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6172, 0.6016, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:00,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 11:03:00,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.41 | bwd_microstep: 4582.62 | bwd_inner_microstep: 4577.15 | bwd_allreduce_microstep: 5.36 | step_microstep: 43.29
-[2025-01-25 11:03:00,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.36 | bwd: 4582.64 | bwd_inner: 4577.15 | bwd_allreduce: 5.42 | step: 43.30
- 14%|█▍        | 836/5800 [2:16:30<9:31:30,  6.91s/it]                                                      {'loss': 0.0386, 'grad_norm': 6.527840614318848, 'learning_rate': 3.864896991361831e-05, 'epoch': 7.21}
- 14%|█▍        | 836/5800 [2:16:30<9:31:30,  6.91s/it]score1 tensor([[0.6484],
-        [0.4453],
-        [0.6016],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.4453, 0.6328, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:07,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 11:03:07,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.61 | bwd_microstep: 4582.69 | bwd_inner_microstep: 4577.79 | bwd_allreduce_microstep: 4.79 | step_microstep: 52.79
-[2025-01-25 11:03:07,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.57 | bwd: 4582.71 | bwd_inner: 4577.79 | bwd_allreduce: 4.84 | step: 52.79
- 14%|█▍        | 837/5800 [2:16:37<9:30:30,  6.90s/it]                                                      {'loss': 0.042, 'grad_norm': 7.030946731567383, 'learning_rate': 3.864493193124094e-05, 'epoch': 7.22}
- 14%|█▍        | 837/5800 [2:16:37<9:30:30,  6.90s/it]score1 tensor([[0.5898],
-        [0.5039],
-        [0.1963],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5352, 0.1787, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:14,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:03:14,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.37 | bwd_microstep: 4641.63 | bwd_inner_microstep: 4636.03 | bwd_allreduce_microstep: 5.48 | step_microstep: 45.86
-[2025-01-25 11:03:14,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.33 | bwd: 4641.66 | bwd_inner: 4636.03 | bwd_allreduce: 5.55 | step: 45.87
- 14%|█▍        | 838/5800 [2:16:44<9:31:10,  6.91s/it]                                                      {'loss': 0.0317, 'grad_norm': 6.504162788391113, 'learning_rate': 3.864088813505028e-05, 'epoch': 7.22}
- 14%|█▍        | 838/5800 [2:16:44<9:31:10,  6.91s/it]score1 tensor([[0.4883],
-        [0.5742],
-        [0.4609],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5586, 0.3945, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:21,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:03:21,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.76 | bwd_microstep: 4636.77 | bwd_inner_microstep: 4631.69 | bwd_allreduce_microstep: 4.99 | step_microstep: 46.76
-[2025-01-25 11:03:21,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.73 | bwd: 4636.80 | bwd_inner: 4631.69 | bwd_allreduce: 5.04 | step: 46.76
- 14%|█▍        | 839/5800 [2:16:51<9:31:46,  6.92s/it]                                                      {'loss': 0.0498, 'grad_norm': 4.230065822601318, 'learning_rate': 3.863683852630725e-05, 'epoch': 7.23}
- 14%|█▍        | 839/5800 [2:16:51<9:31:46,  6.92s/it]score1 tensor([[0.4883],
-        [0.4824],
-        [0.4941],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4844, 0.5430, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:28,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 11:03:28,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.96 | bwd_microstep: 4637.05 | bwd_inner_microstep: 4632.33 | bwd_allreduce_microstep: 4.64 | step_microstep: 45.44
-[2025-01-25 11:03:28,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.93 | bwd: 4637.08 | bwd_inner: 4632.33 | bwd_allreduce: 4.68 | step: 45.45
- 14%|█▍        | 840/5800 [2:16:58<9:31:50,  6.92s/it]                                                      {'loss': 0.042, 'grad_norm': 4.183599948883057, 'learning_rate': 3.863278310627459e-05, 'epoch': 7.24}
- 14%|█▍        | 840/5800 [2:16:58<9:31:50,  6.92s/it]score1 tensor([[0.4727],
-        [0.5703],
-        [0.4707],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.6406, 0.4453, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:35,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 11:03:35,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.20 | bwd_microstep: 4647.79 | bwd_inner_microstep: 4642.48 | bwd_allreduce_microstep: 5.24 | step_microstep: 48.62
-[2025-01-25 11:03:35,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.15 | bwd: 4647.82 | bwd_inner: 4642.48 | bwd_allreduce: 5.28 | step: 48.62
- 14%|█▍        | 841/5800 [2:17:05<9:32:11,  6.92s/it]                                                      {'loss': 0.0464, 'grad_norm': 4.10254430770874, 'learning_rate': 3.862872187621685e-05, 'epoch': 7.25}
- 14%|█▍        | 841/5800 [2:17:05<9:32:11,  6.92s/it]score1 tensor([[0.5430],
-        [0.6055],
-        [0.6680],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5781, 0.6641, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:42,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 11:03:42,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.97 | bwd_microstep: 4640.53 | bwd_inner_microstep: 4634.84 | bwd_allreduce_microstep: 5.58 | step_microstep: 46.83
-[2025-01-25 11:03:42,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.93 | bwd: 4640.55 | bwd_inner: 4634.84 | bwd_allreduce: 5.62 | step: 46.83
- 15%|█▍        | 842/5800 [2:17:12<9:32:30,  6.93s/it]                                                      {'loss': 0.02, 'grad_norm': 4.764791011810303, 'learning_rate': 3.862465483740039e-05, 'epoch': 7.26}
- 15%|█▍        | 842/5800 [2:17:12<9:32:30,  6.93s/it]score1 tensor([[0.4707],
-        [0.4844],
-        [0.4766],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4941, 0.4805, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:49,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 11:03:49,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.60 | bwd_microstep: 4640.09 | bwd_inner_microstep: 4634.75 | bwd_allreduce_microstep: 5.23 | step_microstep: 46.46
-[2025-01-25 11:03:49,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.56 | bwd: 4640.12 | bwd_inner: 4634.75 | bwd_allreduce: 5.29 | step: 46.46
- 15%|█▍        | 843/5800 [2:17:19<9:32:37,  6.93s/it]                                                      {'loss': 0.0332, 'grad_norm': 0.4894876778125763, 'learning_rate': 3.862058199109339e-05, 'epoch': 7.27}
- 15%|█▍        | 843/5800 [2:17:19<9:32:37,  6.93s/it]score1 tensor([[0.3906],
-        [0.5273],
-        [0.6289],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.5156, 0.6406, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:03:56,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 11:03:56,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.94 | bwd_microstep: 4646.17 | bwd_inner_microstep: 4640.34 | bwd_allreduce_microstep: 5.75 | step_microstep: 44.18
-[2025-01-25 11:03:56,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.90 | bwd: 4646.20 | bwd_inner: 4640.34 | bwd_allreduce: 5.80 | step: 44.19
- 15%|█▍        | 844/5800 [2:17:26<9:32:36,  6.93s/it]                                                      {'loss': 0.0186, 'grad_norm': 3.8274717330932617, 'learning_rate': 3.8616503338565825e-05, 'epoch': 7.28}
- 15%|█▍        | 844/5800 [2:17:26<9:32:36,  6.93s/it]score1 tensor([[0.4492],
-        [0.6523],
-        [0.4375],
-        [0.6914]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6094, 0.4277, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:03,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 11:04:03,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.95 | bwd_microstep: 4646.48 | bwd_inner_microstep: 4640.91 | bwd_allreduce_microstep: 5.47 | step_microstep: 52.56
-[2025-01-25 11:04:03,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.91 | bwd: 4646.52 | bwd_inner: 4640.91 | bwd_allreduce: 5.52 | step: 52.57
- 15%|█▍        | 845/5800 [2:17:33<9:32:35,  6.93s/it]                                                      {'loss': 0.0278, 'grad_norm': 5.090629577636719, 'learning_rate': 3.86124188810895e-05, 'epoch': 7.28}
- 15%|█▍        | 845/5800 [2:17:33<9:32:35,  6.93s/it]score1 tensor([[0.4570],
-        [0.5547],
-        [0.3574],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5508, 0.3223, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:10,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 11:04:10,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.89 | bwd_microstep: 4645.29 | bwd_inner_microstep: 4640.11 | bwd_allreduce_microstep: 5.08 | step_microstep: 42.23
-[2025-01-25 11:04:10,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.86 | bwd: 4645.31 | bwd_inner: 4640.11 | bwd_allreduce: 5.13 | step: 42.23
- 15%|█▍        | 846/5800 [2:17:40<9:32:17,  6.93s/it]                                                      {'loss': 0.0293, 'grad_norm': 0.6346124410629272, 'learning_rate': 3.860832861993801e-05, 'epoch': 7.29}
- 15%|█▍        | 846/5800 [2:17:40<9:32:17,  6.93s/it]score1 tensor([[0.3613],
-        [0.5469],
-        [0.3750],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.6562, 0.4551, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0791, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:17,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 11:04:17,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4644.82 | bwd_inner_microstep: 4639.57 | bwd_allreduce_microstep: 5.15 | step_microstep: 50.64
-[2025-01-25 11:04:17,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.63 | bwd: 4644.84 | bwd_inner: 4639.57 | bwd_allreduce: 5.21 | step: 50.65
- 15%|█▍        | 847/5800 [2:17:47<9:32:09,  6.93s/it]                                                      {'loss': 0.0791, 'grad_norm': 4.079785346984863, 'learning_rate': 3.860423255638678e-05, 'epoch': 7.3}
- 15%|█▍        | 847/5800 [2:17:47<9:32:09,  6.93s/it]score1 tensor([[0.4668],
-        [0.4980],
-        [0.3848],
-        [0.3438]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5391, 0.4004, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:24,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 11:04:24,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4632.96 | bwd_inner_microstep: 4627.54 | bwd_allreduce_microstep: 5.30 | step_microstep: 49.97
-[2025-01-25 11:04:24,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4632.98 | bwd_inner: 4627.54 | bwd_allreduce: 5.36 | step: 49.99
- 15%|█▍        | 848/5800 [2:17:54<9:31:51,  6.93s/it]                                                      {'loss': 0.0469, 'grad_norm': 7.872174263000488, 'learning_rate': 3.860013069171302e-05, 'epoch': 7.31}
- 15%|█▍        | 848/5800 [2:17:54<9:31:51,  6.93s/it]score1 tensor([[0.4844],
-        [0.4121],
-        [0.3711],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4551, 0.4844, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:30,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 11:04:30,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.91 | bwd_microstep: 4652.67 | bwd_inner_microstep: 4647.54 | bwd_allreduce_microstep: 5.03 | step_microstep: 40.44
-[2025-01-25 11:04:30,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.88 | bwd: 4652.69 | bwd_inner: 4647.54 | bwd_allreduce: 5.08 | step: 40.45
- 15%|█▍        | 849/5800 [2:18:00<9:31:58,  6.93s/it]                                                      {'loss': 0.0679, 'grad_norm': 7.900103569030762, 'learning_rate': 3.859602302719577e-05, 'epoch': 7.32}
- 15%|█▍        | 849/5800 [2:18:00<9:31:58,  6.93s/it]score1 tensor([[0.5273],
-        [0.3867],
-        [0.4121],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4512, 0.5156, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:37,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.26 | optimizer_step: 4.36
-[2025-01-25 11:04:37,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.72 | bwd_microstep: 4651.46 | bwd_inner_microstep: 4641.62 | bwd_allreduce_microstep: 9.73 | step_microstep: 61.37
-[2025-01-25 11:04:37,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.68 | bwd: 4651.48 | bwd_inner: 4641.62 | bwd_allreduce: 9.79 | step: 61.39
- 15%|█▍        | 850/5800 [2:18:07<9:32:26,  6.94s/it]                                                      {'loss': 0.061, 'grad_norm': 7.883676052093506, 'learning_rate': 3.859190956411588e-05, 'epoch': 7.33}
- 15%|█▍        | 850/5800 [2:18:07<9:32:26,  6.94s/it]score1 tensor([[0.5703],
-        [0.4160],
-        [0.4043],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4180, 0.4551, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:44,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 11:04:44,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.33 | bwd_microstep: 4589.91 | bwd_inner_microstep: 4582.34 | bwd_allreduce_microstep: 7.40 | step_microstep: 55.98
-[2025-01-25 11:04:44,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.30 | bwd: 4589.96 | bwd_inner: 4582.35 | bwd_allreduce: 7.48 | step: 55.96
- 15%|█▍        | 851/5800 [2:18:14<9:31:58,  6.93s/it]                                                      {'loss': 0.0303, 'grad_norm': 2.0481789112091064, 'learning_rate': 3.8587790303755986e-05, 'epoch': 7.34}
- 15%|█▍        | 851/5800 [2:18:14<9:31:58,  6.93s/it]score1 tensor([[0.5000],
-        [0.6484],
-        [0.5078],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.6094, 0.4316, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:51,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.59 | optimizer_step: 4.44
-[2025-01-25 11:04:51,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.30 | bwd_microstep: 4643.52 | bwd_inner_microstep: 4635.28 | bwd_allreduce_microstep: 8.08 | step_microstep: 60.25
-[2025-01-25 11:04:51,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.27 | bwd: 4643.54 | bwd_inner: 4635.28 | bwd_allreduce: 8.15 | step: 60.27
- 15%|█▍        | 852/5800 [2:18:21<9:32:42,  6.94s/it]                                                      {'loss': 0.0737, 'grad_norm': 8.678008079528809, 'learning_rate': 3.858366524740055e-05, 'epoch': 7.34}
- 15%|█▍        | 852/5800 [2:18:21<9:32:42,  6.94s/it]score1 tensor([[0.5742],
-        [0.5898],
-        [0.6719],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5977, 0.6562, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:04:58,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 11:04:58,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4639.55 | bwd_inner_microstep: 4634.73 | bwd_allreduce_microstep: 4.72 | step_microstep: 44.49
-[2025-01-25 11:04:58,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.95 | bwd: 4639.57 | bwd_inner: 4634.73 | bwd_allreduce: 4.78 | step: 44.50
- 15%|█▍        | 853/5800 [2:18:28<9:32:24,  6.94s/it]                                                      {'loss': 0.0234, 'grad_norm': 4.525014877319336, 'learning_rate': 3.8579534396335835e-05, 'epoch': 7.35}
- 15%|█▍        | 853/5800 [2:18:28<9:32:24,  6.94s/it]score1 tensor([[0.5117],
-        [0.5586],
-        [0.5039],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4941, 0.3438, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:05,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 11:05:05,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.53 | bwd_microstep: 4647.21 | bwd_inner_microstep: 4642.28 | bwd_allreduce_microstep: 4.84 | step_microstep: 47.39
-[2025-01-25 11:05:05,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.50 | bwd: 4647.23 | bwd_inner: 4642.28 | bwd_allreduce: 4.88 | step: 47.41
- 15%|█▍        | 854/5800 [2:18:35<9:32:13,  6.94s/it]                                                      {'loss': 0.0698, 'grad_norm': 3.9568471908569336, 'learning_rate': 3.8575397751849905e-05, 'epoch': 7.36}
- 15%|█▍        | 854/5800 [2:18:35<9:32:13,  6.94s/it]score1 tensor([[0.5117],
-        [0.4961],
-        [0.5820],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4160, 0.4922, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:12,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 11:05:12,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.03 | bwd_microstep: 4647.40 | bwd_inner_microstep: 4642.04 | bwd_allreduce_microstep: 5.22 | step_microstep: 52.50
-[2025-01-25 11:05:12,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4647.42 | bwd_inner: 4642.04 | bwd_allreduce: 5.29 | step: 52.49
- 15%|█▍        | 855/5800 [2:18:42<9:32:00,  6.94s/it]                                                      {'loss': 0.0781, 'grad_norm': 8.468052864074707, 'learning_rate': 3.8571255315232654e-05, 'epoch': 7.37}
- 15%|█▍        | 855/5800 [2:18:42<9:32:00,  6.94s/it]score1 tensor([[0.6211],
-        [0.4668],
-        [0.5039],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4727, 0.4473, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:19,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:05:19,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.77 | bwd_microstep: 4639.61 | bwd_inner_microstep: 4634.74 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.62
-[2025-01-25 11:05:19,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.73 | bwd: 4639.63 | bwd_inner: 4634.74 | bwd_allreduce: 4.82 | step: 43.62
- 15%|█▍        | 856/5800 [2:18:49<9:31:30,  6.94s/it]                                                      {'loss': 0.0381, 'grad_norm': 4.737184524536133, 'learning_rate': 3.856710708777575e-05, 'epoch': 7.38}
- 15%|█▍        | 856/5800 [2:18:49<9:31:30,  6.94s/it]score1 tensor([[0.5547],
-        [0.4980],
-        [0.5508],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4648, 0.6055, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:26,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 11:05:26,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.61 | bwd_microstep: 4642.58 | bwd_inner_microstep: 4637.35 | bwd_allreduce_microstep: 5.10 | step_microstep: 49.97
-[2025-01-25 11:05:26,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.57 | bwd: 4642.61 | bwd_inner: 4637.35 | bwd_allreduce: 5.16 | step: 49.98
- 15%|█▍        | 857/5800 [2:18:56<9:31:11,  6.93s/it]                                                      {'loss': 0.0654, 'grad_norm': 0.6144192814826965, 'learning_rate': 3.856295307077269e-05, 'epoch': 7.39}
- 15%|█▍        | 857/5800 [2:18:56<9:31:11,  6.93s/it]score1 tensor([[0.5156],
-        [0.4219],
-        [0.5039],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4277, 0.5430, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:33,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 11:05:33,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.24 | bwd_microstep: 4642.25 | bwd_inner_microstep: 4636.76 | bwd_allreduce_microstep: 5.39 | step_microstep: 47.82
-[2025-01-25 11:05:33,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.20 | bwd: 4642.28 | bwd_inner: 4636.76 | bwd_allreduce: 5.44 | step: 47.83
- 15%|█▍        | 858/5800 [2:19:03<9:30:53,  6.93s/it]                                                      {'loss': 0.0303, 'grad_norm': 8.143030166625977, 'learning_rate': 3.855879326551877e-05, 'epoch': 7.4}
- 15%|█▍        | 858/5800 [2:19:03<9:30:53,  6.93s/it]score1 tensor([[0.4590],
-        [0.4629],
-        [0.4980],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5625, 0.4863, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:40,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.37
-[2025-01-25 11:05:40,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.78 | bwd_microstep: 4597.99 | bwd_inner_microstep: 4591.84 | bwd_allreduce_microstep: 6.05 | step_microstep: 52.40
-[2025-01-25 11:05:40,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.72 | bwd: 4598.01 | bwd_inner: 4591.84 | bwd_allreduce: 6.10 | step: 52.41
- 15%|█▍        | 859/5800 [2:19:10<9:30:13,  6.92s/it]                                                      {'loss': 0.0347, 'grad_norm': 1.984279751777649, 'learning_rate': 3.855462767331108e-05, 'epoch': 7.41}
- 15%|█▍        | 859/5800 [2:19:10<9:30:13,  6.92s/it]score1 tensor([[0.4512],
-        [0.4707],
-        [0.5898],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4277, 0.5625, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:47,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 11:05:47,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.41 | bwd_microstep: 4643.00 | bwd_inner_microstep: 4637.23 | bwd_allreduce_microstep: 5.68 | step_microstep: 47.46
-[2025-01-25 11:05:47,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.37 | bwd: 4643.03 | bwd_inner: 4637.23 | bwd_allreduce: 5.73 | step: 47.47
- 15%|█▍        | 860/5800 [2:19:17<9:30:42,  6.93s/it]                                                      {'loss': 0.022, 'grad_norm': 4.170193195343018, 'learning_rate': 3.8550456295448544e-05, 'epoch': 7.41}
- 15%|█▍        | 860/5800 [2:19:17<9:30:42,  6.93s/it]score1 tensor([[0.4863],
-        [0.3887],
-        [0.4746],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4844, 0.5039, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:05:54,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 11:05:54,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.25 | bwd_microstep: 4651.91 | bwd_inner_microstep: 4646.90 | bwd_allreduce_microstep: 4.92 | step_microstep: 66.17
-[2025-01-25 11:05:54,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.21 | bwd: 4651.94 | bwd_inner: 4646.90 | bwd_allreduce: 4.95 | step: 66.21
- 15%|█▍        | 861/5800 [2:19:24<9:32:24,  6.95s/it]                                                      {'loss': 0.0366, 'grad_norm': 7.811854839324951, 'learning_rate': 3.854627913323185e-05, 'epoch': 7.42}
- 15%|█▍        | 861/5800 [2:19:24<9:32:24,  6.95s/it]score1 tensor([[0.6133],
-        [0.4883],
-        [0.5000],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.5508, 0.6875, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:01,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 11:06:01,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.67 | bwd_microstep: 4649.08 | bwd_inner_microstep: 4644.06 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.85
-[2025-01-25 11:06:01,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.53 | bwd: 4649.10 | bwd_inner: 4644.05 | bwd_allreduce: 4.96 | step: 44.85
- 15%|█▍        | 862/5800 [2:19:31<9:32:35,  6.96s/it]                                                      {'loss': 0.0859, 'grad_norm': 8.67005729675293, 'learning_rate': 3.8542096187963517e-05, 'epoch': 7.43}
- 15%|█▍        | 862/5800 [2:19:31<9:32:35,  6.96s/it]score1 tensor([[0.4648],
-        [0.4863],
-        [0.5000],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.3750, 0.5234, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:08,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.40 | optimizer_step: 4.36
-[2025-01-25 11:06:08,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.64 | bwd_microstep: 4652.28 | bwd_inner_microstep: 4646.37 | bwd_allreduce_microstep: 5.80 | step_microstep: 56.49
-[2025-01-25 11:06:08,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.60 | bwd: 4652.30 | bwd_inner: 4646.37 | bwd_allreduce: 5.86 | step: 56.50
- 15%|█▍        | 863/5800 [2:19:38<9:32:39,  6.96s/it]                                                      {'loss': 0.0454, 'grad_norm': 0.7227128148078918, 'learning_rate': 3.853790746094786e-05, 'epoch': 7.44}
- 15%|█▍        | 863/5800 [2:19:38<9:32:39,  6.96s/it]score1 tensor([[0.5781],
-        [0.4941],
-        [0.6367],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4629, 0.6367, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:15,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 11:06:15,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.65 | bwd_microstep: 4595.54 | bwd_inner_microstep: 4590.11 | bwd_allreduce_microstep: 5.32 | step_microstep: 45.82
-[2025-01-25 11:06:15,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.62 | bwd: 4595.56 | bwd_inner: 4590.11 | bwd_allreduce: 5.38 | step: 45.82
- 15%|█▍        | 864/5800 [2:19:45<9:30:53,  6.94s/it]                                                      {'loss': 0.0347, 'grad_norm': 2.2336127758026123, 'learning_rate': 3.853371295349098e-05, 'epoch': 7.45}
- 15%|█▍        | 864/5800 [2:19:45<9:30:53,  6.94s/it]score1 tensor([[0.5742],
-        [0.4805],
-        [0.6094],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3926, 0.6445, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:22,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 11:06:22,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.94 | bwd_microstep: 4650.33 | bwd_inner_microstep: 4645.03 | bwd_allreduce_microstep: 5.19 | step_microstep: 44.66
-[2025-01-25 11:06:22,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.91 | bwd: 4650.36 | bwd_inner: 4645.03 | bwd_allreduce: 5.25 | step: 44.66
- 15%|█▍        | 865/5800 [2:19:52<9:30:50,  6.94s/it]                                                      {'loss': 0.0396, 'grad_norm': 4.997718811035156, 'learning_rate': 3.852951266690083e-05, 'epoch': 7.46}
- 15%|█▍        | 865/5800 [2:19:52<9:30:50,  6.94s/it]score1 tensor([[0.6094],
-        [0.5977],
-        [0.4883],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6211, 0.4336, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:28,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 11:06:28,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.22 | bwd_microstep: 4665.61 | bwd_inner_microstep: 4660.31 | bwd_allreduce_microstep: 5.19 | step_microstep: 45.81
-[2025-01-25 11:06:28,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.18 | bwd: 4665.63 | bwd_inner: 4660.31 | bwd_allreduce: 5.25 | step: 45.82
- 15%|█▍        | 866/5800 [2:19:58<9:31:03,  6.94s/it]                                                      {'loss': 0.063, 'grad_norm': 4.32914924621582, 'learning_rate': 3.8525306602487114e-05, 'epoch': 7.47}
- 15%|█▍        | 866/5800 [2:19:58<9:31:03,  6.94s/it]score1 tensor([[0.6523],
-        [0.5625],
-        [0.5469],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5430, 0.4375, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0962, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:35,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:06:35,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.62 | bwd_microstep: 4643.83 | bwd_inner_microstep: 4638.35 | bwd_allreduce_microstep: 5.36 | step_microstep: 45.81
-[2025-01-25 11:06:35,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.55 | bwd: 4643.86 | bwd_inner: 4638.35 | bwd_allreduce: 5.42 | step: 45.82
- 15%|█▍        | 867/5800 [2:20:05<9:30:28,  6.94s/it]                                                      {'loss': 0.0962, 'grad_norm': 9.048396110534668, 'learning_rate': 3.8521094761561356e-05, 'epoch': 7.47}
- 15%|█▍        | 867/5800 [2:20:05<9:30:28,  6.94s/it]score1 tensor([[0.6016],
-        [0.6250],
-        [0.6133],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5586, 0.5391, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0806, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 11:06:42,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.29 | bwd_microstep: 4648.41 | bwd_inner_microstep: 4643.31 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.97
-[2025-01-25 11:06:42,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.24 | bwd: 4648.44 | bwd_inner: 4643.31 | bwd_allreduce: 5.05 | step: 45.98
- 15%|█▍        | 868/5800 [2:20:12<9:30:30,  6.94s/it]                                                      {'loss': 0.0806, 'grad_norm': 9.251076698303223, 'learning_rate': 3.851687714543688e-05, 'epoch': 7.48}
- 15%|█▍        | 868/5800 [2:20:12<9:30:30,  6.94s/it]score1 tensor([[0.4609],
-        [0.5469],
-        [0.7148],
-        [0.7188]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.4980, 0.6133, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:49,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.65 | optimizer_step: 4.37
-[2025-01-25 11:06:49,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.25 | bwd_microstep: 4656.98 | bwd_inner_microstep: 4649.94 | bwd_allreduce_microstep: 6.81 | step_microstep: 58.95
-[2025-01-25 11:06:49,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.20 | bwd: 4657.03 | bwd_inner: 4649.94 | bwd_allreduce: 6.94 | step: 58.94
- 15%|█▍        | 869/5800 [2:20:19<9:31:06,  6.95s/it]                                                      {'loss': 0.0688, 'grad_norm': 9.319587707519531, 'learning_rate': 3.8512653755428814e-05, 'epoch': 7.49}
- 15%|█▍        | 869/5800 [2:20:19<9:31:06,  6.95s/it]score1 tensor([[0.5273],
-        [0.5430],
-        [0.4766],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4473, 0.4258, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:06:56,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 11:06:56,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.80 | bwd_microstep: 4648.79 | bwd_inner_microstep: 4643.23 | bwd_allreduce_microstep: 5.43 | step_microstep: 48.39
-[2025-01-25 11:06:56,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.76 | bwd: 4648.82 | bwd_inner: 4643.23 | bwd_allreduce: 5.49 | step: 48.41
- 15%|█▌        | 870/5800 [2:20:26<9:30:57,  6.95s/it]                                                      {'loss': 0.0483, 'grad_norm': 4.206108093261719, 'learning_rate': 3.8508424592854085e-05, 'epoch': 7.5}
- 15%|█▌        | 870/5800 [2:20:26<9:30:57,  6.95s/it]score1 tensor([[0.4668],
-        [0.4863],
-        [0.6953],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4453, 0.6641, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:03,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 11:07:03,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.75 | bwd_microstep: 4647.88 | bwd_inner_microstep: 4640.06 | bwd_allreduce_microstep: 7.57 | step_microstep: 47.11
-[2025-01-25 11:07:03,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.69 | bwd: 4647.93 | bwd_inner: 4640.06 | bwd_allreduce: 7.68 | step: 47.10
- 15%|█▌        | 871/5800 [2:20:33<9:31:01,  6.95s/it]                                                      {'loss': 0.0342, 'grad_norm': 0.7450087666511536, 'learning_rate': 3.850418965903142e-05, 'epoch': 7.51}
- 15%|█▌        | 871/5800 [2:20:33<9:31:01,  6.95s/it]score1 tensor([[0.4531],
-        [0.4375],
-        [0.3887],
-        [0.3496]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4258, 0.4043, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:10,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 11:07:10,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2175.34 | bwd_microstep: 4655.44 | bwd_inner_microstep: 4650.39 | bwd_allreduce_microstep: 4.96 | step_microstep: 47.22
-[2025-01-25 11:07:10,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2175.31 | bwd: 4655.47 | bwd_inner: 4650.39 | bwd_allreduce: 5.01 | step: 47.24
- 15%|█▌        | 872/5800 [2:20:40<9:31:34,  6.96s/it]                                                      {'loss': 0.0225, 'grad_norm': 3.762221336364746, 'learning_rate': 3.8499948955281344e-05, 'epoch': 7.52}
- 15%|█▌        | 872/5800 [2:20:40<9:31:34,  6.96s/it]score1 tensor([[0.5859],
-        [0.4453],
-        [0.5352],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4219, 0.5664, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:17,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 11:07:17,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.97 | bwd_microstep: 4648.01 | bwd_inner_microstep: 4642.93 | bwd_allreduce_microstep: 4.99 | step_microstep: 49.00
-[2025-01-25 11:07:17,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.92 | bwd: 4648.03 | bwd_inner: 4642.93 | bwd_allreduce: 5.04 | step: 49.01
- 15%|█▌        | 873/5800 [2:20:47<9:31:03,  6.95s/it]                                                      {'loss': 0.042, 'grad_norm': 4.58549165725708, 'learning_rate': 3.849570248292618e-05, 'epoch': 7.53}
- 15%|█▌        | 873/5800 [2:20:47<9:31:03,  6.95s/it]score1 tensor([[0.4961],
-        [0.4453],
-        [0.5352],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4844, 0.5625, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:24,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:07:24,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.70 | bwd_microstep: 4640.86 | bwd_inner_microstep: 4635.40 | bwd_allreduce_microstep: 5.34 | step_microstep: 45.49
-[2025-01-25 11:07:24,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.65 | bwd: 4640.88 | bwd_inner: 4635.40 | bwd_allreduce: 5.40 | step: 45.50
- 15%|█▌        | 874/5800 [2:20:54<9:30:19,  6.95s/it]                                                      {'loss': 0.0322, 'grad_norm': 4.290001392364502, 'learning_rate': 3.849145024329006e-05, 'epoch': 7.53}
- 15%|█▌        | 874/5800 [2:20:54<9:30:19,  6.95s/it]score1 tensor([[0.5586],
-        [0.4609],
-        [0.4609],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4023, 0.4492, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:31,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 11:07:31,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.16 | bwd_microstep: 4639.08 | bwd_inner_microstep: 4633.51 | bwd_allreduce_microstep: 5.44 | step_microstep: 43.96
-[2025-01-25 11:07:31,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.13 | bwd: 4639.12 | bwd_inner: 4633.51 | bwd_allreduce: 5.51 | step: 43.98
- 15%|█▌        | 875/5800 [2:21:01<9:29:33,  6.94s/it]                                                      {'loss': 0.0322, 'grad_norm': 3.950202226638794, 'learning_rate': 3.848719223769889e-05, 'epoch': 7.54}
- 15%|█▌        | 875/5800 [2:21:01<9:29:33,  6.94s/it]score1 tensor([[0.4727],
-        [0.5430],
-        [0.5078],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4863, 0.5625, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:38,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:07:38,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.70 | bwd_microstep: 4647.87 | bwd_inner_microstep: 4642.75 | bwd_allreduce_microstep: 5.03 | step_microstep: 46.97
-[2025-01-25 11:07:38,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.67 | bwd: 4647.90 | bwd_inner: 4642.75 | bwd_allreduce: 5.07 | step: 46.98
- 15%|█▌        | 876/5800 [2:21:08<9:29:21,  6.94s/it]                                                      {'loss': 0.042, 'grad_norm': 4.186079025268555, 'learning_rate': 3.8482928467480405e-05, 'epoch': 7.55}
- 15%|█▌        | 876/5800 [2:21:08<9:29:21,  6.94s/it]score1 tensor([[0.6172],
-        [0.5430],
-        [0.4668],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5391, 0.4414, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:45,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 11:07:45,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.47 | bwd_microstep: 4645.88 | bwd_inner_microstep: 4640.70 | bwd_allreduce_microstep: 5.07 | step_microstep: 45.52
-[2025-01-25 11:07:45,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.43 | bwd: 4645.91 | bwd_inner: 4640.69 | bwd_allreduce: 5.14 | step: 45.54
- 15%|█▌        | 877/5800 [2:21:15<9:29:07,  6.94s/it]                                                      {'loss': 0.02, 'grad_norm': 0.9006555080413818, 'learning_rate': 3.847865893396412e-05, 'epoch': 7.56}
- 15%|█▌        | 877/5800 [2:21:15<9:29:07,  6.94s/it]score1 tensor([[0.4434],
-        [0.5469],
-        [0.5312],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.6172, 0.4824, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:52,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 11:07:52,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.95 | bwd_microstep: 4642.13 | bwd_inner_microstep: 4632.79 | bwd_allreduce_microstep: 9.22 | step_microstep: 44.06
-[2025-01-25 11:07:52,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.91 | bwd: 4642.16 | bwd_inner: 4632.79 | bwd_allreduce: 9.30 | step: 44.06
- 15%|█▌        | 878/5800 [2:21:22<9:28:49,  6.93s/it]                                                      {'loss': 0.0488, 'grad_norm': 0.8217353820800781, 'learning_rate': 3.847438363848134e-05, 'epoch': 7.57}
- 15%|█▌        | 878/5800 [2:21:22<9:28:49,  6.93s/it]score1 tensor([[0.4219],
-        [0.4434],
-        [0.4062],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5117, 0.3906, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:07:59,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 11:07:59,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.70 | bwd_microstep: 4651.66 | bwd_inner_microstep: 4646.21 | bwd_allreduce_microstep: 5.34 | step_microstep: 51.39
-[2025-01-25 11:07:59,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.67 | bwd: 4651.72 | bwd_inner: 4646.21 | bwd_allreduce: 5.40 | step: 51.41
- 15%|█▌        | 879/5800 [2:21:29<9:29:44,  6.95s/it]                                                      {'loss': 0.0684, 'grad_norm': 4.187727451324463, 'learning_rate': 3.8470102582365185e-05, 'epoch': 7.58}
- 15%|█▌        | 879/5800 [2:21:29<9:29:44,  6.95s/it]score1 tensor([[0.4219],
-        [0.4863],
-        [0.4688],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4375, 0.4453, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:06,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 11:08:06,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.79 | bwd_microstep: 4647.47 | bwd_inner_microstep: 4641.46 | bwd_allreduce_microstep: 5.90 | step_microstep: 45.77
-[2025-01-25 11:08:06,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.74 | bwd: 4647.50 | bwd_inner: 4641.47 | bwd_allreduce: 5.95 | step: 45.77
- 15%|█▌        | 880/5800 [2:21:36<9:29:42,  6.95s/it]                                                      {'loss': 0.0415, 'grad_norm': 0.5928323864936829, 'learning_rate': 3.8465815766950564e-05, 'epoch': 7.59}
- 15%|█▌        | 880/5800 [2:21:36<9:29:42,  6.95s/it]score1 tensor([[0.4414],
-        [0.6484],
-        [0.5703],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6367, 0.5625, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:13,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.42 | optimizer_step: 4.37
-[2025-01-25 11:08:13,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2179.51 | bwd_microstep: 4644.45 | bwd_inner_microstep: 4635.64 | bwd_allreduce_microstep: 8.56 | step_microstep: 60.50
-[2025-01-25 11:08:13,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2179.47 | bwd: 4644.57 | bwd_inner: 4635.64 | bwd_allreduce: 8.67 | step: 60.48
- 15%|█▌        | 881/5800 [2:21:43<9:30:20,  6.96s/it]                                                      {'loss': 0.0259, 'grad_norm': 1.2437961101531982, 'learning_rate': 3.8461523193574175e-05, 'epoch': 7.59}
- 15%|█▌        | 881/5800 [2:21:43<9:30:20,  6.96s/it]score1 tensor([[0.5000],
-        [0.5117],
-        [0.6367],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4785, 0.5781, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:20,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.45 | optimizer_step: 4.95
-[2025-01-25 11:08:20,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2177.63 | bwd_microstep: 4642.08 | bwd_inner_microstep: 4636.10 | bwd_allreduce_microstep: 5.84 | step_microstep: 63.92
-[2025-01-25 11:08:20,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2177.58 | bwd: 4642.11 | bwd_inner: 4636.10 | bwd_allreduce: 5.92 | step: 63.93
- 15%|█▌        | 882/5800 [2:21:50<9:30:33,  6.96s/it]                                                      {'loss': 0.0376, 'grad_norm': 4.407370567321777, 'learning_rate': 3.845722486357452e-05, 'epoch': 7.6}
- 15%|█▌        | 882/5800 [2:21:50<9:30:33,  6.96s/it]score1 tensor([[0.4043],
-        [0.4453],
-        [0.4668],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4688, 0.4570, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:27,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 11:08:27,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.46 | bwd_microstep: 4644.18 | bwd_inner_microstep: 4638.52 | bwd_allreduce_microstep: 5.56 | step_microstep: 58.28
-[2025-01-25 11:08:27,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.43 | bwd: 4644.21 | bwd_inner: 4638.52 | bwd_allreduce: 5.62 | step: 58.28
- 15%|█▌        | 883/5800 [2:21:57<9:30:09,  6.96s/it]                                                      {'loss': 0.0405, 'grad_norm': 0.5570327639579773, 'learning_rate': 3.845292077829189e-05, 'epoch': 7.61}
- 15%|█▌        | 883/5800 [2:21:57<9:30:09,  6.96s/it]score1 tensor([[0.5195],
-        [0.4141],
-        [0.5078],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4980, 0.6133, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:34,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 11:08:34,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.96 | bwd_microstep: 4643.88 | bwd_inner_microstep: 4638.37 | bwd_allreduce_microstep: 5.40 | step_microstep: 45.81
-[2025-01-25 11:08:34,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.91 | bwd: 4643.91 | bwd_inner: 4638.37 | bwd_allreduce: 5.46 | step: 45.84
- 15%|█▌        | 884/5800 [2:22:04<9:29:26,  6.95s/it]                                                      {'loss': 0.082, 'grad_norm': 8.326176643371582, 'learning_rate': 3.844861093906837e-05, 'epoch': 7.62}
- 15%|█▌        | 884/5800 [2:22:04<9:29:26,  6.95s/it]score1 tensor([[0.3770],
-        [0.3848],
-        [0.5703],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.3984, 0.5898, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:40,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:08:40,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.85 | bwd_microstep: 4644.39 | bwd_inner_microstep: 4639.07 | bwd_allreduce_microstep: 5.25 | step_microstep: 41.61
-[2025-01-25 11:08:40,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.81 | bwd: 4644.42 | bwd_inner: 4639.07 | bwd_allreduce: 5.29 | step: 41.62
- 15%|█▌        | 885/5800 [2:22:10<9:28:52,  6.94s/it]                                                      {'loss': 0.0308, 'grad_norm': 7.950469017028809, 'learning_rate': 3.844429534724786e-05, 'epoch': 7.63}
- 15%|█▌        | 885/5800 [2:22:10<9:28:52,  6.94s/it]score1 tensor([[0.4219],
-        [0.5117],
-        [0.4414],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4375, 0.4512, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:47,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 11:08:47,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.74 | bwd_microstep: 4641.93 | bwd_inner_microstep: 4636.06 | bwd_allreduce_microstep: 5.67 | step_microstep: 47.26
-[2025-01-25 11:08:47,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.71 | bwd: 4641.96 | bwd_inner: 4636.06 | bwd_allreduce: 5.80 | step: 47.27
- 15%|█▌        | 886/5800 [2:22:17<9:28:15,  6.94s/it]                                                      {'loss': 0.0488, 'grad_norm': 3.9059946537017822, 'learning_rate': 3.8439974004176015e-05, 'epoch': 7.64}
- 15%|█▌        | 886/5800 [2:22:17<9:28:15,  6.94s/it]score1 tensor([[0.4375],
-        [0.5078],
-        [0.4395],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5352, 0.4590, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:08:54,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:08:54,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.89 | bwd_microstep: 4646.86 | bwd_inner_microstep: 4641.94 | bwd_allreduce_microstep: 4.85 | step_microstep: 45.79
-[2025-01-25 11:08:54,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.84 | bwd: 4646.89 | bwd_inner: 4641.94 | bwd_allreduce: 4.89 | step: 45.80
- 15%|█▌        | 887/5800 [2:22:24<9:28:02,  6.94s/it]                                                      {'loss': 0.0454, 'grad_norm': 4.084710121154785, 'learning_rate': 3.843564691120032e-05, 'epoch': 7.65}
- 15%|█▌        | 887/5800 [2:22:24<9:28:02,  6.94s/it]score1 tensor([[0.5117],
-        [0.4512],
-        [0.5508],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.3340, 0.5078, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:01,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 9.55 | optimizer_step: 4.36
-[2025-01-25 11:09:01,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.39 | bwd_microstep: 4642.43 | bwd_inner_microstep: 4637.42 | bwd_allreduce_microstep: 4.90 | step_microstep: 50.95
-[2025-01-25 11:09:01,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.35 | bwd: 4642.45 | bwd_inner: 4637.42 | bwd_allreduce: 4.95 | step: 50.97
- 15%|█▌        | 888/5800 [2:22:31<9:28:01,  6.94s/it]                                                      {'loss': 0.0845, 'grad_norm': 8.692591667175293, 'learning_rate': 3.843131406967003e-05, 'epoch': 7.66}
- 15%|█▌        | 888/5800 [2:22:31<9:28:01,  6.94s/it]score1 tensor([[0.5703],
-        [0.5312],
-        [0.4004],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5156, 0.3652, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:08,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.80 | optimizer_step: 4.36
-[2025-01-25 11:09:08,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.44 | bwd_microstep: 4640.65 | bwd_inner_microstep: 4635.65 | bwd_allreduce_microstep: 4.90 | step_microstep: 63.09
-[2025-01-25 11:09:08,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.39 | bwd: 4640.67 | bwd_inner: 4635.65 | bwd_allreduce: 4.95 | step: 63.12
- 15%|█▌        | 889/5800 [2:22:38<9:28:07,  6.94s/it]                                                      {'loss': 0.041, 'grad_norm': 4.1602091789245605, 'learning_rate': 3.84269754809362e-05, 'epoch': 7.66}
- 15%|█▌        | 889/5800 [2:22:38<9:28:07,  6.94s/it]score1 tensor([[0.5039],
-        [0.5273],
-        [0.6719],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.4668, 0.5625, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0767, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:15,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 11:09:15,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.70 | bwd_microstep: 4643.56 | bwd_inner_microstep: 4635.18 | bwd_allreduce_microstep: 8.12 | step_microstep: 78.66
-[2025-01-25 11:09:15,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.65 | bwd: 4643.63 | bwd_inner: 4635.18 | bwd_allreduce: 8.23 | step: 78.62
- 15%|█▌        | 890/5800 [2:22:45<9:29:19,  6.96s/it]                                                      {'loss': 0.0767, 'grad_norm': 9.014336585998535, 'learning_rate': 3.8422631146351686e-05, 'epoch': 7.67}
- 15%|█▌        | 890/5800 [2:22:45<9:29:19,  6.96s/it]score1 tensor([[0.5547],
-        [0.5352],
-        [0.5156],
-        [0.8086]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4023, 0.4316, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0933, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:22,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 11:09:22,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.13 | bwd_microstep: 4653.52 | bwd_inner_microstep: 4646.70 | bwd_allreduce_microstep: 6.72 | step_microstep: 70.12
-[2025-01-25 11:09:22,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.08 | bwd: 4653.54 | bwd_inner: 4646.70 | bwd_allreduce: 6.77 | step: 70.14
- 15%|█▌        | 891/5800 [2:22:52<9:29:30,  6.96s/it]                                                      {'loss': 0.0933, 'grad_norm': 9.174332618713379, 'learning_rate': 3.8418281067271105e-05, 'epoch': 7.68}
- 15%|█▌        | 891/5800 [2:22:52<9:29:30,  6.96s/it]score1 tensor([[0.5195],
-        [0.6172],
-        [0.5352],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6094, 0.4668, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:29,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 11:09:29,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.88 | bwd_microstep: 4649.48 | bwd_inner_microstep: 4645.02 | bwd_allreduce_microstep: 4.38 | step_microstep: 43.24
-[2025-01-25 11:09:29,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.85 | bwd: 4649.50 | bwd_inner: 4645.02 | bwd_allreduce: 4.41 | step: 43.25
- 15%|█▌        | 892/5800 [2:22:59<9:28:53,  6.95s/it]                                                      {'loss': 0.0513, 'grad_norm': 8.699997901916504, 'learning_rate': 3.841392524505092e-05, 'epoch': 7.69}
- 15%|█▌        | 892/5800 [2:22:59<9:28:53,  6.95s/it]score1 tensor([[0.5430],
-        [0.5195],
-        [0.4922],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5352, 0.5508, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:36,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.37
-[2025-01-25 11:09:36,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.98 | bwd_microstep: 4642.34 | bwd_inner_microstep: 4637.63 | bwd_allreduce_microstep: 4.62 | step_microstep: 44.12
-[2025-01-25 11:09:36,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.93 | bwd: 4642.37 | bwd_inner: 4637.63 | bwd_allreduce: 4.67 | step: 44.13
- 15%|█▌        | 893/5800 [2:23:06<9:28:05,  6.95s/it]                                                      {'loss': 0.0459, 'grad_norm': 3.9753520488739014, 'learning_rate': 3.840956368104932e-05, 'epoch': 7.7}
- 15%|█▌        | 893/5800 [2:23:06<9:28:05,  6.95s/it]score1 tensor([[0.4648],
-        [0.3672],
-        [0.5000],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.3516, 0.5000, 0.6836], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:43,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 11:09:43,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.49 | bwd_microstep: 4587.90 | bwd_inner_microstep: 4581.19 | bwd_allreduce_microstep: 6.62 | step_microstep: 46.50
-[2025-01-25 11:09:43,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.45 | bwd: 4587.93 | bwd_inner: 4581.19 | bwd_allreduce: 6.67 | step: 46.50
- 15%|█▌        | 894/5800 [2:23:13<9:26:15,  6.93s/it]                                                      {'loss': 0.0166, 'grad_norm': 2.6835763454437256, 'learning_rate': 3.8405196376626334e-05, 'epoch': 7.71}
- 15%|█▌        | 894/5800 [2:23:13<9:26:15,  6.93s/it]score1 tensor([[0.4785],
-        [0.4766],
-        [0.4023],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5391, 0.3105, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:50,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 11:09:50,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.47 | bwd_microstep: 4641.23 | bwd_inner_microstep: 4636.58 | bwd_allreduce_microstep: 4.55 | step_microstep: 42.17
-[2025-01-25 11:09:50,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.43 | bwd: 4641.26 | bwd_inner: 4636.58 | bwd_allreduce: 4.60 | step: 42.18
- 15%|█▌        | 895/5800 [2:23:20<9:25:56,  6.92s/it]                                                      {'loss': 0.0557, 'grad_norm': 4.382538795471191, 'learning_rate': 3.840082333314377e-05, 'epoch': 7.72}
- 15%|█▌        | 895/5800 [2:23:20<9:25:56,  6.92s/it]score1 tensor([[0.5078],
-        [0.5156],
-        [0.5273],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.4883, 0.6094, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:09:57,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 11:09:57,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.37 | bwd_microstep: 4639.88 | bwd_inner_microstep: 4635.23 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.50
-[2025-01-25 11:09:57,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.33 | bwd: 4639.90 | bwd_inner: 4635.23 | bwd_allreduce: 4.61 | step: 43.50
- 15%|█▌        | 896/5800 [2:23:27<9:25:43,  6.92s/it]                                                      {'loss': 0.0649, 'grad_norm': 4.096011638641357, 'learning_rate': 3.8396444551965196e-05, 'epoch': 7.72}
- 15%|█▌        | 896/5800 [2:23:27<9:25:43,  6.92s/it]score1 tensor([[0.5039],
-        [0.5586],
-        [0.5938],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.6055, 0.6133, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:04,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 11:10:04,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.65 | bwd_microstep: 4636.19 | bwd_inner_microstep: 4632.33 | bwd_allreduce_microstep: 3.81 | step_microstep: 40.24
-[2025-01-25 11:10:04,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.62 | bwd: 4636.21 | bwd_inner: 4632.33 | bwd_allreduce: 3.83 | step: 40.25
- 15%|█▌        | 897/5800 [2:23:34<9:25:22,  6.92s/it]                                                      {'loss': 0.0503, 'grad_norm': 0.5203759670257568, 'learning_rate': 3.839206003445601e-05, 'epoch': 7.73}
- 15%|█▌        | 897/5800 [2:23:34<9:25:22,  6.92s/it]score1 tensor([[0.5117],
-        [0.5000],
-        [0.4629],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5820, 0.4648, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:11,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 11:10:11,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.28 | bwd_microstep: 4646.80 | bwd_inner_microstep: 4642.22 | bwd_allreduce_microstep: 4.50 | step_microstep: 45.08
-[2025-01-25 11:10:11,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.24 | bwd: 4646.82 | bwd_inner: 4642.22 | bwd_allreduce: 4.54 | step: 45.10
- 15%|█▌        | 898/5800 [2:23:41<9:25:34,  6.92s/it]                                                      {'loss': 0.0396, 'grad_norm': 0.7480533123016357, 'learning_rate': 3.838766978198337e-05, 'epoch': 7.74}
- 15%|█▌        | 898/5800 [2:23:41<9:25:34,  6.92s/it]score1 tensor([[0.4707],
-        [0.5352],
-        [0.4863],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4238, 0.5156, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:18,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 11:10:18,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.51 | bwd_microstep: 4644.95 | bwd_inner_microstep: 4640.05 | bwd_allreduce_microstep: 4.79 | step_microstep: 44.73
-[2025-01-25 11:10:18,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.47 | bwd: 4644.98 | bwd_inner: 4640.05 | bwd_allreduce: 4.85 | step: 44.74
- 16%|█▌        | 899/5800 [2:23:48<9:25:32,  6.92s/it]                                                      {'loss': 0.0664, 'grad_norm': 0.6046892404556274, 'learning_rate': 3.8383273795916245e-05, 'epoch': 7.75}
- 16%|█▌        | 899/5800 [2:23:48<9:25:32,  6.92s/it]score1 tensor([[0.4277],
-        [0.5430],
-        [0.5234],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5586, 0.4844, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:25,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 11:10:25,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.68 | bwd_microstep: 4646.15 | bwd_inner_microstep: 4641.00 | bwd_allreduce_microstep: 5.05 | step_microstep: 69.67
-[2025-01-25 11:10:25,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.64 | bwd: 4646.17 | bwd_inner: 4641.00 | bwd_allreduce: 5.09 | step: 69.68
- 16%|█▌        | 900/5800 [2:23:54<9:26:29,  6.94s/it]                                                      {'loss': 0.0234, 'grad_norm': 0.5806503891944885, 'learning_rate': 3.8378872077625375e-05, 'epoch': 7.76}
- 16%|█▌        | 900/5800 [2:23:54<9:26:29,  6.94s/it]score1 tensor([[0.4902],
-        [0.5391],
-        [0.5898],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5039, 0.5547, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:31,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 11:10:31,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.57 | bwd_microstep: 4645.52 | bwd_inner_microstep: 4640.46 | bwd_allreduce_microstep: 4.99 | step_microstep: 47.59
-[2025-01-25 11:10:31,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.54 | bwd: 4645.55 | bwd_inner: 4640.46 | bwd_allreduce: 5.02 | step: 47.61
- 16%|█▌        | 901/5800 [2:24:01<9:26:31,  6.94s/it]                                                      {'loss': 0.0337, 'grad_norm': 0.6404984593391418, 'learning_rate': 3.8374464628483286e-05, 'epoch': 7.77}
- 16%|█▌        | 901/5800 [2:24:01<9:26:31,  6.94s/it]score1 tensor([[0.5312],
-        [0.4844],
-        [0.4785],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4492, 0.5273, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0669, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:38,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 11:10:38,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.35 | bwd_microstep: 4643.98 | bwd_inner_microstep: 4639.11 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.62
-[2025-01-25 11:10:38,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.30 | bwd: 4644.00 | bwd_inner: 4639.11 | bwd_allreduce: 4.81 | step: 43.63
- 16%|█▌        | 902/5800 [2:24:08<9:26:06,  6.93s/it]                                                      {'loss': 0.0669, 'grad_norm': 4.307362079620361, 'learning_rate': 3.83700514498643e-05, 'epoch': 7.78}
- 16%|█▌        | 902/5800 [2:24:08<9:26:06,  6.93s/it]score1 tensor([[0.4531],
-        [0.4668],
-        [0.6016],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4043, 0.6445, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:45,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 11:10:45,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.12 | bwd_microstep: 4646.52 | bwd_inner_microstep: 4641.71 | bwd_allreduce_microstep: 4.73 | step_microstep: 44.05
-[2025-01-25 11:10:45,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.09 | bwd: 4646.54 | bwd_inner: 4641.71 | bwd_allreduce: 4.77 | step: 44.06
- 16%|█▌        | 903/5800 [2:24:15<9:25:52,  6.93s/it]                                                      {'loss': 0.0454, 'grad_norm': 4.144433975219727, 'learning_rate': 3.836563254314453e-05, 'epoch': 7.78}
- 16%|█▌        | 903/5800 [2:24:15<9:25:52,  6.93s/it]score1 tensor([[0.5469],
-        [0.5000],
-        [0.4160],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5391, 0.3730, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:52,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 11:10:52,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.54 | bwd_microstep: 4639.44 | bwd_inner_microstep: 4634.68 | bwd_allreduce_microstep: 4.68 | step_microstep: 45.14
-[2025-01-25 11:10:52,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.50 | bwd: 4639.47 | bwd_inner: 4634.68 | bwd_allreduce: 4.72 | step: 45.16
- 16%|█▌        | 904/5800 [2:24:22<9:25:16,  6.93s/it]                                                      {'loss': 0.0415, 'grad_norm': 0.8721733689308167, 'learning_rate': 3.836120790970185e-05, 'epoch': 7.79}
- 16%|█▌        | 904/5800 [2:24:22<9:25:16,  6.93s/it]score1 tensor([[0.4199],
-        [0.4863],
-        [0.4004],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5781, 0.4785, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:10:59,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:10:59,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.13 | bwd_microstep: 4646.07 | bwd_inner_microstep: 4640.83 | bwd_allreduce_microstep: 5.13 | step_microstep: 49.60
-[2025-01-25 11:10:59,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.09 | bwd: 4646.10 | bwd_inner: 4640.83 | bwd_allreduce: 5.19 | step: 49.62
- 16%|█▌        | 905/5800 [2:24:29<9:25:14,  6.93s/it]                                                      {'loss': 0.0703, 'grad_norm': 7.7754693031311035, 'learning_rate': 3.835677755091595e-05, 'epoch': 7.8}
- 16%|█▌        | 905/5800 [2:24:29<9:25:14,  6.93s/it]score1 tensor([[0.4590],
-        [0.4590],
-        [0.4414],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5703, 0.5508, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:06,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:11:06,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.38 | bwd_microstep: 4648.15 | bwd_inner_microstep: 4641.90 | bwd_allreduce_microstep: 6.17 | step_microstep: 48.36
-[2025-01-25 11:11:06,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.27 | bwd: 4648.18 | bwd_inner: 4641.90 | bwd_allreduce: 6.21 | step: 48.36
- 16%|█▌        | 906/5800 [2:24:36<9:25:14,  6.93s/it]                                                      {'loss': 0.0771, 'grad_norm': 3.999953269958496, 'learning_rate': 3.83523414681683e-05, 'epoch': 7.81}
- 16%|█▌        | 906/5800 [2:24:36<9:25:14,  6.93s/it]score1 tensor([[0.3926],
-        [0.4121],
-        [0.4355],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4180, 0.4473, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:13,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 11:11:13,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.79 | bwd_microstep: 4646.03 | bwd_inner_microstep: 4641.36 | bwd_allreduce_microstep: 4.58 | step_microstep: 47.81
-[2025-01-25 11:11:13,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.76 | bwd: 4646.05 | bwd_inner: 4641.35 | bwd_allreduce: 4.63 | step: 47.82
- 16%|█▌        | 907/5800 [2:24:43<9:24:59,  6.93s/it]                                                      {'loss': 0.0425, 'grad_norm': 7.688133716583252, 'learning_rate': 3.834789966284214e-05, 'epoch': 7.82}
- 16%|█▌        | 907/5800 [2:24:43<9:24:59,  6.93s/it]score1 tensor([[0.4453],
-        [0.5234],
-        [0.5586],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5742, 0.5508, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:20,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 11:11:20,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.85 | bwd_microstep: 4647.58 | bwd_inner_microstep: 4642.29 | bwd_allreduce_microstep: 5.15 | step_microstep: 51.28
-[2025-01-25 11:11:20,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.81 | bwd: 4647.62 | bwd_inner: 4642.29 | bwd_allreduce: 5.22 | step: 51.29
- 16%|█▌        | 908/5800 [2:24:50<9:25:20,  6.93s/it]                                                      {'loss': 0.0342, 'grad_norm': 4.34680700302124, 'learning_rate': 3.83434521363225e-05, 'epoch': 7.83}
- 16%|█▌        | 908/5800 [2:24:50<9:25:20,  6.93s/it]score1 tensor([[0.5000],
-        [0.4453],
-        [0.4707],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4688, 0.5117, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:27,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.97 | optimizer_step: 4.36
-[2025-01-25 11:11:27,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.00 | bwd_microstep: 4640.95 | bwd_inner_microstep: 4634.23 | bwd_allreduce_microstep: 6.58 | step_microstep: 67.89
-[2025-01-25 11:11:27,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.97 | bwd: 4640.97 | bwd_inner: 4634.23 | bwd_allreduce: 6.67 | step: 67.90
- 16%|█▌        | 909/5800 [2:24:57<9:25:58,  6.94s/it]                                                      {'loss': 0.0366, 'grad_norm': 4.264121055603027, 'learning_rate': 3.8338998889996194e-05, 'epoch': 7.84}
- 16%|█▌        | 909/5800 [2:24:57<9:25:58,  6.94s/it]score1 tensor([[0.5547],
-        [0.4629],
-        [0.5078],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4805, 0.4980, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:34,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 11:11:34,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2173.33 | bwd_microstep: 4653.59 | bwd_inner_microstep: 4648.29 | bwd_allreduce_microstep: 5.18 | step_microstep: 45.35
-[2025-01-25 11:11:34,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2173.28 | bwd: 4653.61 | bwd_inner: 4648.29 | bwd_allreduce: 5.23 | step: 45.36
- 16%|█▌        | 910/5800 [2:25:04<9:26:21,  6.95s/it]                                                      {'loss': 0.0264, 'grad_norm': 0.4623858630657196, 'learning_rate': 3.833453992525182e-05, 'epoch': 7.84}
- 16%|█▌        | 910/5800 [2:25:04<9:26:21,  6.95s/it]score1 tensor([[0.5234],
-        [0.4375],
-        [0.5742],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.3867, 0.5508, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:41,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 11:11:41,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.27 | bwd_microstep: 4639.66 | bwd_inner_microstep: 4634.88 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.39
-[2025-01-25 11:11:41,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.23 | bwd: 4639.68 | bwd_inner: 4634.88 | bwd_allreduce: 4.73 | step: 41.40
- 16%|█▌        | 911/5800 [2:25:11<9:25:41,  6.94s/it]                                                      {'loss': 0.0215, 'grad_norm': 4.21828556060791, 'learning_rate': 3.8330075243479776e-05, 'epoch': 7.85}
- 16%|█▌        | 911/5800 [2:25:11<9:25:41,  6.94s/it]score1 tensor([[0.4941],
-        [0.4688],
-        [0.6875],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4219, 0.7031, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:48,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 11:11:48,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.45 | bwd_microstep: 4582.02 | bwd_inner_microstep: 4577.05 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.73
-[2025-01-25 11:11:48,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.41 | bwd: 4582.04 | bwd_inner: 4577.05 | bwd_allreduce: 4.92 | step: 45.74
- 16%|█▌        | 912/5800 [2:25:18<9:23:56,  6.92s/it]                                                      {'loss': 0.0176, 'grad_norm': 2.855156898498535, 'learning_rate': 3.832560484607221e-05, 'epoch': 7.86}
- 16%|█▌        | 912/5800 [2:25:18<9:23:56,  6.92s/it]score1 tensor([[0.4746],
-        [0.4453],
-        [0.4668],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4609, 0.4941, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:11:55,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 11:11:55,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.53 | bwd_microstep: 4637.14 | bwd_inner_microstep: 4631.76 | bwd_allreduce_microstep: 5.27 | step_microstep: 45.70
-[2025-01-25 11:11:55,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.49 | bwd: 4637.17 | bwd_inner: 4631.76 | bwd_allreduce: 5.33 | step: 45.71
- 16%|█▌        | 913/5800 [2:25:25<9:23:49,  6.92s/it]                                                      {'loss': 0.0151, 'grad_norm': 0.5693850517272949, 'learning_rate': 3.832112873442308e-05, 'epoch': 7.87}
- 16%|█▌        | 913/5800 [2:25:25<9:23:49,  6.92s/it]score1 tensor([[0.5625],
-        [0.5195],
-        [0.4648],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4688, 0.3652, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:12:02,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 11:12:02,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.24 | bwd_microstep: 4635.35 | bwd_inner_microstep: 4630.49 | bwd_allreduce_microstep: 4.78 | step_microstep: 51.14
-[2025-01-25 11:12:02,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.21 | bwd: 4635.38 | bwd_inner: 4630.49 | bwd_allreduce: 4.82 | step: 51.15
- 16%|█▌        | 914/5800 [2:25:32<9:23:50,  6.92s/it]                                                      {'loss': 0.063, 'grad_norm': 4.216343402862549, 'learning_rate': 3.831664690992811e-05, 'epoch': 7.88}
- 16%|█▌        | 914/5800 [2:25:32<9:23:50,  6.92s/it]score1 tensor([[0.5703],
-        [0.5156],
-        [0.4258],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4961, 0.3750, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:12:08,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 11:12:08,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.65 | bwd_microstep: 4638.23 | bwd_inner_microstep: 4630.39 | bwd_allreduce_microstep: 7.73 | step_microstep: 43.40
-[2025-01-25 11:12:08,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.61 | bwd: 4638.25 | bwd_inner: 4630.39 | bwd_allreduce: 7.79 | step: 43.41
- 16%|█▌        | 915/5800 [2:25:38<9:23:28,  6.92s/it]                                                      {'loss': 0.0376, 'grad_norm': 0.6204925179481506, 'learning_rate': 3.831215937398481e-05, 'epoch': 7.89}
- 16%|█▌        | 915/5800 [2:25:38<9:23:28,  6.92s/it]score1 tensor([[0.5664],
-        [0.4805],
-        [0.5312],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4941, 0.4355, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:12:15,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:12:15,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.67 | bwd_microstep: 4588.73 | bwd_inner_microstep: 4583.76 | bwd_allreduce_microstep: 4.86 | step_microstep: 44.16
-[2025-01-25 11:12:15,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.63 | bwd: 4588.75 | bwd_inner: 4583.76 | bwd_allreduce: 4.92 | step: 44.17
- 16%|█▌        | 916/5800 [2:25:45<9:22:00,  6.90s/it]                                                      {'loss': 0.04, 'grad_norm': 2.2470803260803223, 'learning_rate': 3.830766612799248e-05, 'epoch': 7.9}
- 16%|█▌        | 916/5800 [2:25:45<9:22:00,  6.90s/it]score1 tensor([[0.5000],
-        [0.4746],
-        [0.4648],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.3750, 0.4551, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:12:22,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 11:12:22,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.16 | bwd_microstep: 4636.57 | bwd_inner_microstep: 4631.58 | bwd_allreduce_microstep: 4.89 | step_microstep: 47.66
-[2025-01-25 11:12:22,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.13 | bwd: 4636.60 | bwd_inner: 4631.59 | bwd_allreduce: 4.94 | step: 47.67
- 16%|█▌        | 917/5800 [2:25:52<9:22:23,  6.91s/it]                                                      {'loss': 0.0503, 'grad_norm': 4.129864692687988, 'learning_rate': 3.8303167173352177e-05, 'epoch': 7.91}
- 16%|█▌        | 917/5800 [2:25:52<9:22:23,  6.91s/it]score1 tensor([[0.5430],
-        [0.5156],
-        [0.6445],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5039, 0.5625, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:12:29,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 11:12:29,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.65 | bwd_microstep: 4638.71 | bwd_inner_microstep: 4633.52 | bwd_allreduce_microstep: 5.08 | step_microstep: 44.53
-[2025-01-25 11:12:29,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.62 | bwd: 4638.73 | bwd_inner: 4633.52 | bwd_allreduce: 5.14 | step: 44.51
- 16%|█▌        | 918/5800 [2:25:59<9:22:30,  6.91s/it]                                                      {'loss': 0.0303, 'grad_norm': 0.6389335989952087, 'learning_rate': 3.829866251146677e-05, 'epoch': 7.91}
- 16%|█▌        | 918/5800 [2:25:59<9:22:30,  6.91s/it]score1 tensor([[0.4570],
-        [0.3848],
-        [0.4336],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4512, 0.4785, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:12:36,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 11:12:36,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.88 | bwd_microstep: 4653.30 | bwd_inner_microstep: 4640.91 | bwd_allreduce_microstep: 12.32 | step_microstep: 39.92
-[2025-01-25 11:12:36,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.85 | bwd: 4653.32 | bwd_inner: 4640.91 | bwd_allreduce: 12.35 | step: 39.92
- 16%|█▌        | 919/5800 [2:26:06<9:22:39,  6.92s/it]                                                      {'loss': 0.0645, 'grad_norm': 3.9653306007385254, 'learning_rate': 3.829415214374087e-05, 'epoch': 7.92}
- 16%|█▌        | 919/5800 [2:26:06<9:22:39,  6.92s/it]score1 tensor([[0.6992],
-        [0.5820],
-        [0.5273],
-        [0.6914]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.5938, 0.5469, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:12:43,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.04 | optimizer_step: 4.37
-[2025-01-25 11:12:43,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.03 | bwd_microstep: 4645.53 | bwd_inner_microstep: 4637.89 | bwd_allreduce_microstep: 7.38 | step_microstep: 66.10
-[2025-01-25 11:12:43,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.00 | bwd: 4645.58 | bwd_inner: 4637.89 | bwd_allreduce: 7.50 | step: 66.09
- 16%|█▌        | 920/5800 [2:26:13<9:24:25,  6.94s/it]                                                      {'loss': 0.04, 'grad_norm': 1.1599749326705933, 'learning_rate': 3.828963607158091e-05, 'epoch': 7.93}
- 16%|█▌        | 920/5800 [2:26:13<9:24:25,  6.94s/it]evaluate!
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6484]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1797, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1934, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1777, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6133]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6719]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1738, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6680]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.1172]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3047, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6719]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6875]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6172]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6562]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2168, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.5623263478060381
-PLCC_score: 0.5304219656776898
-KRCC_score: 0.3908488521662502
-SRCC_level: 0.5623263478060381
-PLCC_level: 0.5304219656776898
-KRCC_level: 0.3908488521662502
-score1 tensor([[0.4766],
-        [0.4785],
-        [0.4043],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.5039, 0.3418, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:23:20,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 11:23:20,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.85 | bwd_microstep: 4606.45 | bwd_inner_microstep: 4601.33 | bwd_allreduce_microstep: 5.03 | step_microstep: 46.23
-[2025-01-25 11:23:20,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.81 | bwd: 4606.47 | bwd_inner: 4601.33 | bwd_allreduce: 5.07 | step: 46.24
- 16%|█▌        | 921/5800 [2:36:50<265:36:08, 195.98s/it]                                                         {'loss': 0.0562, 'grad_norm': 4.323210716247559, 'learning_rate': 3.828511429639507e-05, 'epoch': 7.94}
- 16%|█▌        | 921/5800 [2:36:50<265:36:08, 195.98s/it]score1 tensor([[0.5195],
-        [0.5664],
-        [0.4160],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5195, 0.3652, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:23:27,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.87 | optimizer_step: 4.36
-[2025-01-25 11:23:27,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.37 | bwd_microstep: 4580.71 | bwd_inner_microstep: 4572.19 | bwd_allreduce_microstep: 8.24 | step_microstep: 62.27
-[2025-01-25 11:23:27,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.33 | bwd: 4580.76 | bwd_inner: 4572.19 | bwd_allreduce: 8.40 | step: 62.25
- 16%|█▌        | 922/5800 [2:36:57<188:40:05, 139.24s/it]                                                         {'loss': 0.0518, 'grad_norm': 4.4140095710754395, 'learning_rate': 3.828058681959332e-05, 'epoch': 7.95}
- 16%|█▌        | 922/5800 [2:36:57<188:40:05, 139.24s/it]score1 tensor([[0.4492],
-        [0.5312],
-        [0.5430],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4980, 0.5586, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:23:34,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.21 | optimizer_step: 4.37
-[2025-01-25 11:23:34,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.10 | bwd_microstep: 4587.01 | bwd_inner_microstep: 4581.15 | bwd_allreduce_microstep: 5.78 | step_microstep: 58.66
-[2025-01-25 11:23:34,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.06 | bwd: 4587.04 | bwd_inner: 4581.15 | bwd_allreduce: 5.82 | step: 58.67
- 16%|█▌        | 923/5800 [2:37:04<134:49:36, 99.52s/it]                                                         {'loss': 0.0352, 'grad_norm': 0.5598067045211792, 'learning_rate': 3.82760536425874e-05, 'epoch': 7.96}
- 16%|█▌        | 923/5800 [2:37:04<134:49:36, 99.52s/it]score1 tensor([[0.4434],
-        [0.4902],
-        [0.5664],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5781, 0.6094, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:23:41,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.29 | optimizer_step: 4.36
-[2025-01-25 11:23:41,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.89 | bwd_microstep: 4610.34 | bwd_inner_microstep: 4602.81 | bwd_allreduce_microstep: 7.32 | step_microstep: 63.83
-[2025-01-25 11:23:41,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.85 | bwd: 4610.40 | bwd_inner: 4602.82 | bwd_allreduce: 7.42 | step: 63.82
- 16%|█▌        | 924/5800 [2:37:11<97:10:02, 71.74s/it]                                                        {'loss': 0.0527, 'grad_norm': 5.17290735244751, 'learning_rate': 3.827151476679084e-05, 'epoch': 7.97}
- 16%|█▌        | 924/5800 [2:37:11<97:10:02, 71.74s/it]score1 tensor([[0.4844],
-        [0.3730],
-        [0.5273],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.3086, 0.5430, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:23:48,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 11:23:48,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.70 | bwd_microstep: 4602.40 | bwd_inner_microstep: 4597.82 | bwd_allreduce_microstep: 4.50 | step_microstep: 57.69
-[2025-01-25 11:23:48,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.57 | bwd: 4602.42 | bwd_inner: 4597.82 | bwd_allreduce: 4.54 | step: 57.70
- 16%|█▌        | 925/5800 [2:37:18<70:48:08, 52.28s/it]                                                       {'loss': 0.0415, 'grad_norm': 0.7701335549354553, 'learning_rate': 3.8266970193618934e-05, 'epoch': 7.97}
- 16%|█▌        | 925/5800 [2:37:18<70:48:08, 52.28s/it]score1 tensor([[0.4219],
-        [0.5273],
-        [0.4082],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4961, 0.3477, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:23:55,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:23:55,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.09 | bwd_microstep: 4606.45 | bwd_inner_microstep: 4601.59 | bwd_allreduce_microstep: 4.77 | step_microstep: 46.70
-[2025-01-25 11:23:55,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.03 | bwd: 4606.48 | bwd_inner: 4601.59 | bwd_allreduce: 4.81 | step: 46.72
- 16%|█▌        | 926/5800 [2:37:25<52:20:37, 38.66s/it]                                                       {'loss': 0.0308, 'grad_norm': 4.3169779777526855, 'learning_rate': 3.826241992448876e-05, 'epoch': 7.98}
- 16%|█▌        | 926/5800 [2:37:25<52:20:37, 38.66s/it]score1 tensor([[0.4551],
-        [0.4277],
-        [0.4453],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4297, 0.4883, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:24:01,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 11:24:01,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.28 | bwd_microstep: 4610.30 | bwd_inner_microstep: 4605.28 | bwd_allreduce_microstep: 4.89 | step_microstep: 49.96
-[2025-01-25 11:24:01,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.24 | bwd: 4610.32 | bwd_inner: 4605.28 | bwd_allreduce: 4.97 | step: 49.97
- 16%|█▌        | 927/5800 [2:37:31<39:25:31, 29.13s/it]                                                       {'loss': 0.0278, 'grad_norm': 7.982148170471191, 'learning_rate': 3.8257863960819176e-05, 'epoch': 7.99}
- 16%|█▌        | 927/5800 [2:37:31<39:25:31, 29.13s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:24:05,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 11:24:05,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 572.01 | bwd_microstep: 1217.46 | bwd_inner_microstep: 1212.74 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.23
-[2025-01-25 11:24:05,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.98 | bwd: 1217.48 | bwd_inner: 1212.74 | bwd_allreduce: 4.67 | step: 43.24
- 16%|█▌        | 928/5800 [2:37:35<29:13:44, 21.60s/it]                                                       {'loss': 0.0508, 'grad_norm': 9.647887229919434, 'learning_rate': 3.825330230403081e-05, 'epoch': 8.0}
- 16%|█▌        | 928/5800 [2:37:35<29:13:44, 21.60s/it][2025-01-25 11:24:10,808] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 11:24:21,949] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 11:24:31,813] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 11:24:41,606] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4922],
-        [0.4844],
-        [0.3984],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4980, 0.4707, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:24:56,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.45
-[2025-01-25 11:24:56,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.72 | bwd_microstep: 4614.87 | bwd_inner_microstep: 4609.42 | bwd_allreduce_microstep: 5.36 | step_microstep: 49.01
-[2025-01-25 11:24:56,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.67 | bwd: 4614.90 | bwd_inner: 4609.42 | bwd_allreduce: 5.42 | step: 49.01
- 16%|█▌        | 929/5800 [2:38:26<41:06:01, 30.38s/it]                                                       {'loss': 0.042, 'grad_norm': 8.022466659545898, 'learning_rate': 3.824873495554606e-05, 'epoch': 8.01}
- 16%|█▌        | 929/5800 [2:38:26<41:06:01, 30.38s/it]score1 tensor([[0.5195],
-        [0.4414],
-        [0.5312],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4062, 0.5508, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:03,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 11:25:03,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.14 | bwd_microstep: 4585.05 | bwd_inner_microstep: 4580.46 | bwd_allreduce_microstep: 4.51 | step_microstep: 43.26
-[2025-01-25 11:25:03,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.11 | bwd: 4585.08 | bwd_inner: 4580.46 | bwd_allreduce: 4.56 | step: 43.27
- 16%|█▌        | 930/5800 [2:38:33<31:32:48, 23.32s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.247073173522949, 'learning_rate': 3.824416191678911e-05, 'epoch': 8.02}
- 16%|█▌        | 930/5800 [2:38:33<31:32:48, 23.32s/it]score1 tensor([[0.6094],
-        [0.5156],
-        [0.6523],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4941, 0.6484, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:10,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 11:25:10,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.15 | bwd_microstep: 4588.65 | bwd_inner_microstep: 4584.14 | bwd_allreduce_microstep: 4.43 | step_microstep: 40.87
-[2025-01-25 11:25:10,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.10 | bwd: 4588.67 | bwd_inner: 4584.14 | bwd_allreduce: 4.47 | step: 40.88
- 16%|█▌        | 931/5800 [2:38:40<24:51:24, 18.38s/it]                                                       {'loss': 0.042, 'grad_norm': 8.879462242126465, 'learning_rate': 3.8239583189185906e-05, 'epoch': 8.03}
- 16%|█▌        | 931/5800 [2:38:40<24:51:24, 18.38s/it]score1 tensor([[0.4902],
-        [0.0000],
-        [0.4629],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.1787, 0.4121, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0830, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:17,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 11:25:17,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.12 | bwd_microstep: 4552.88 | bwd_inner_microstep: 4547.95 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.31
-[2025-01-25 11:25:17,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.01 | bwd: 4552.90 | bwd_inner: 4547.95 | bwd_allreduce: 4.88 | step: 44.31
- 16%|█▌        | 932/5800 [2:38:47<20:09:29, 14.91s/it]                                                       {'loss': 0.083, 'grad_norm': 6.291326522827148, 'learning_rate': 3.8234998774164184e-05, 'epoch': 8.03}
- 16%|█▌        | 932/5800 [2:38:47<20:09:29, 14.91s/it]score1 tensor([[0.5898],
-        [0.5195],
-        [0.4414],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5078, 0.4922, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:24,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 11:25:24,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.40 | bwd_microstep: 4607.98 | bwd_inner_microstep: 4603.44 | bwd_allreduce_microstep: 4.45 | step_microstep: 42.06
-[2025-01-25 11:25:24,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.37 | bwd: 4608.00 | bwd_inner: 4603.44 | bwd_allreduce: 4.49 | step: 42.06
- 16%|█▌        | 933/5800 [2:38:54<16:53:42, 12.50s/it]                                                       {'loss': 0.0503, 'grad_norm': 4.5762739181518555, 'learning_rate': 3.823040867315343e-05, 'epoch': 8.04}
- 16%|█▌        | 933/5800 [2:38:54<16:53:42, 12.50s/it]score1 tensor([[0.4668],
-        [0.6172],
-        [0.3398],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6641, 0.3984, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:31,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 11:25:31,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.21 | bwd_microstep: 4624.23 | bwd_inner_microstep: 4619.71 | bwd_allreduce_microstep: 4.41 | step_microstep: 45.18
-[2025-01-25 11:25:31,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.17 | bwd: 4624.25 | bwd_inner: 4619.71 | bwd_allreduce: 4.47 | step: 45.19
- 16%|█▌        | 934/5800 [2:39:01<14:37:04, 10.81s/it]                                                       {'loss': 0.0337, 'grad_norm': 8.469834327697754, 'learning_rate': 3.8225812887584936e-05, 'epoch': 8.05}
- 16%|█▌        | 934/5800 [2:39:01<14:37:04, 10.81s/it]score1 tensor([[0.4668],
-        [0.5352],
-        [0.5547],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4844, 0.5664, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:37,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 11:25:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.10 | bwd_microstep: 4621.78 | bwd_inner_microstep: 4617.08 | bwd_allreduce_microstep: 4.59 | step_microstep: 43.01
-[2025-01-25 11:25:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.06 | bwd: 4621.80 | bwd_inner: 4617.08 | bwd_allreduce: 4.64 | step: 43.03
- 16%|█▌        | 935/5800 [2:39:07<13:01:22,  9.64s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.8006069660186768, 'learning_rate': 3.822121141889174e-05, 'epoch': 8.06}
- 16%|█▌        | 935/5800 [2:39:07<13:01:22,  9.64s/it]score1 tensor([[0.5312],
-        [0.4902],
-        [0.5234],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5156, 0.5586, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:44,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.28 | optimizer_step: 4.36
-[2025-01-25 11:25:44,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.60 | bwd_microstep: 4611.00 | bwd_inner_microstep: 4604.34 | bwd_allreduce_microstep: 6.51 | step_microstep: 45.04
-[2025-01-25 11:25:44,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.57 | bwd: 4611.05 | bwd_inner: 4604.34 | bwd_allreduce: 6.59 | step: 45.01
- 16%|█▌        | 936/5800 [2:39:14<11:53:55,  8.81s/it]                                                       {'loss': 0.0259, 'grad_norm': 8.403072357177734, 'learning_rate': 3.821660426850866e-05, 'epoch': 8.07}
- 16%|█▌        | 936/5800 [2:39:14<11:53:55,  8.81s/it]score1 tensor([[0.4141],
-        [0.4434],
-        [0.4648],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4082, 0.5000, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:51,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.36
-[2025-01-25 11:25:51,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.33 | bwd_microstep: 4618.23 | bwd_inner_microstep: 4613.75 | bwd_allreduce_microstep: 4.39 | step_microstep: 40.25
-[2025-01-25 11:25:51,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.29 | bwd: 4618.25 | bwd_inner: 4613.75 | bwd_allreduce: 4.44 | step: 40.26
- 16%|█▌        | 937/5800 [2:39:21<11:07:39,  8.24s/it]                                                       {'loss': 0.0415, 'grad_norm': 4.099556922912598, 'learning_rate': 3.8211991437872274e-05, 'epoch': 8.08}
- 16%|█▌        | 937/5800 [2:39:21<11:07:39,  8.24s/it]score1 tensor([[0.4805],
-        [0.5586],
-        [0.6484],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.5391, 0.6719, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:25:58,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.36
-[2025-01-25 11:25:58,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.70 | bwd_microstep: 4617.32 | bwd_inner_microstep: 4612.78 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.60
-[2025-01-25 11:25:58,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.66 | bwd: 4617.34 | bwd_inner: 4612.78 | bwd_allreduce: 4.50 | step: 41.61
- 16%|█▌        | 938/5800 [2:39:28<10:34:53,  7.83s/it]                                                       {'loss': 0.0532, 'grad_norm': 4.3761677742004395, 'learning_rate': 3.8207372928420955e-05, 'epoch': 8.09}
- 16%|█▌        | 938/5800 [2:39:28<10:34:53,  7.83s/it]score1 tensor([[0.6680],
-        [0.4355],
-        [0.3262],
-        [0.3477]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4473, 0.3223, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:05,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 11:26:05,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.66 | bwd_microstep: 4618.85 | bwd_inner_microstep: 4613.60 | bwd_allreduce_microstep: 5.11 | step_microstep: 42.43
-[2025-01-25 11:26:05,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.62 | bwd: 4618.87 | bwd_inner: 4613.60 | bwd_allreduce: 5.19 | step: 42.43
- 16%|█▌        | 939/5800 [2:39:35<10:11:55,  7.55s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.124626636505127, 'learning_rate': 3.820274874159483e-05, 'epoch': 8.09}
- 16%|█▌        | 939/5800 [2:39:35<10:11:55,  7.55s/it]score1 tensor([[0.6367],
-        [0.4668],
-        [0.5312],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4141, 0.5156, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:12,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 11:26:12,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.17 | bwd_microstep: 4618.76 | bwd_inner_microstep: 4614.09 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.41
-[2025-01-25 11:26:12,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.14 | bwd: 4618.78 | bwd_inner: 4614.09 | bwd_allreduce: 4.62 | step: 41.42
- 16%|█▌        | 940/5800 [2:39:42<9:55:37,  7.35s/it]                                                       {'loss': 0.0464, 'grad_norm': 4.678940296173096, 'learning_rate': 3.819811887883581e-05, 'epoch': 8.1}
- 16%|█▌        | 940/5800 [2:39:42<9:55:37,  7.35s/it]score1 tensor([[0.4785],
-        [0.7266],
-        [0.5273],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6133, 0.4883, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:19,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.32 | optimizer_step: 4.37
-[2025-01-25 11:26:19,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.34 | bwd_microstep: 4657.35 | bwd_inner_microstep: 4653.35 | bwd_allreduce_microstep: 3.94 | step_microstep: 33.41
-[2025-01-25 11:26:19,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.30 | bwd: 4657.37 | bwd_inner: 4653.35 | bwd_allreduce: 3.97 | step: 33.42
- 16%|█▌        | 941/5800 [2:39:49<9:44:34,  7.22s/it]                                                      {'loss': 0.0557, 'grad_norm': 4.916601657867432, 'learning_rate': 3.819348334158756e-05, 'epoch': 8.11}
- 16%|█▌        | 941/5800 [2:39:49<9:44:34,  7.22s/it]score1 tensor([[0.5234],
-        [0.4199],
-        [0.4668],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4629, 0.4785, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:26,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 11:26:26,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.05 | bwd_microstep: 4628.89 | bwd_inner_microstep: 4624.06 | bwd_allreduce_microstep: 4.76 | step_microstep: 41.56
-[2025-01-25 11:26:26,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.02 | bwd: 4628.91 | bwd_inner: 4624.06 | bwd_allreduce: 4.79 | step: 41.57
- 16%|█▌        | 942/5800 [2:39:56<9:36:32,  7.12s/it]                                                      {'loss': 0.0488, 'grad_norm': 0.7690532803535461, 'learning_rate': 3.8188842131295505e-05, 'epoch': 8.12}
- 16%|█▌        | 942/5800 [2:39:56<9:36:32,  7.12s/it]score1 tensor([[0.4160],
-        [0.5078],
-        [0.5742],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.5625, 0.5508, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:33,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 11:26:33,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.69 | bwd_microstep: 4625.24 | bwd_inner_microstep: 4620.52 | bwd_allreduce_microstep: 4.62 | step_microstep: 41.36
-[2025-01-25 11:26:33,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4625.26 | bwd_inner: 4620.52 | bwd_allreduce: 4.67 | step: 41.37
- 16%|█▋        | 943/5800 [2:40:03<9:31:11,  7.06s/it]                                                      {'loss': 0.0288, 'grad_norm': 1.3194949626922607, 'learning_rate': 3.818419524940688e-05, 'epoch': 8.13}
- 16%|█▋        | 943/5800 [2:40:03<9:31:11,  7.06s/it]score1 tensor([[0.4453],
-        [0.4395],
-        [0.4824],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4180, 0.4004, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:40,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 11:26:40,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4623.43 | bwd_inner_microstep: 4618.78 | bwd_allreduce_microstep: 4.57 | step_microstep: 40.74
-[2025-01-25 11:26:40,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4623.45 | bwd_inner: 4618.78 | bwd_allreduce: 4.61 | step: 40.75
- 16%|█▋        | 944/5800 [2:40:09<9:27:02,  7.01s/it]                                                      {'loss': 0.0366, 'grad_norm': 0.6764149069786072, 'learning_rate': 3.817954269737065e-05, 'epoch': 8.14}
- 16%|█▋        | 944/5800 [2:40:09<9:27:02,  7.01s/it]score1 tensor([[0.4590],
-        [0.5312],
-        [0.5703],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5039, 0.6133, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:46,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.21 | optimizer_step: 4.36
-[2025-01-25 11:26:46,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.02 | bwd_microstep: 4623.32 | bwd_inner_microstep: 4619.00 | bwd_allreduce_microstep: 4.25 | step_microstep: 39.12
-[2025-01-25 11:26:46,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.98 | bwd: 4623.36 | bwd_inner: 4619.00 | bwd_allreduce: 4.29 | step: 39.13
- 16%|█▋        | 945/5800 [2:40:16<9:24:02,  6.97s/it]                                                      {'loss': 0.0244, 'grad_norm': 0.48523789644241333, 'learning_rate': 3.8174884476637564e-05, 'epoch': 8.15}
- 16%|█▋        | 945/5800 [2:40:16<9:24:02,  6.97s/it]score1 tensor([[0.5820],
-        [0.5977],
-        [0.6133],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6172, 0.6016, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:26:53,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.05 | optimizer_step: 4.37
-[2025-01-25 11:26:53,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.24 | bwd_microstep: 4645.31 | bwd_inner_microstep: 4637.25 | bwd_allreduce_microstep: 7.85 | step_microstep: 62.61
-[2025-01-25 11:26:53,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.21 | bwd: 4645.37 | bwd_inner: 4637.26 | bwd_allreduce: 7.91 | step: 62.59
- 16%|█▋        | 946/5800 [2:40:23<9:23:04,  6.96s/it]                                                      {'loss': 0.0176, 'grad_norm': 0.663663923740387, 'learning_rate': 3.817022058866014e-05, 'epoch': 8.16}
- 16%|█▋        | 946/5800 [2:40:23<9:23:04,  6.96s/it]score1 tensor([[0.5117],
-        [0.5234],
-        [0.4707],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4121, 0.4629, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:00,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 11:27:00,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.62 | bwd_microstep: 4652.60 | bwd_inner_microstep: 4647.68 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.46
-[2025-01-25 11:27:00,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.59 | bwd: 4652.63 | bwd_inner: 4647.68 | bwd_allreduce: 4.86 | step: 42.46
- 16%|█▋        | 947/5800 [2:40:30<9:22:44,  6.96s/it]                                                      {'loss': 0.0425, 'grad_norm': 4.099588871002197, 'learning_rate': 3.8165551034892653e-05, 'epoch': 8.16}
- 16%|█▋        | 947/5800 [2:40:30<9:22:44,  6.96s/it]score1 tensor([[0.5039],
-        [0.4922],
-        [0.5156],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.5156, 0.5078, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:07,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.71 | optimizer_step: 4.36
-[2025-01-25 11:27:07,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.74 | bwd_microstep: 4656.83 | bwd_inner_microstep: 4650.81 | bwd_allreduce_microstep: 5.94 | step_microstep: 51.07
-[2025-01-25 11:27:07,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.70 | bwd: 4656.85 | bwd_inner: 4650.81 | bwd_allreduce: 5.97 | step: 51.12
- 16%|█▋        | 948/5800 [2:40:37<9:23:20,  6.97s/it]                                                      {'loss': 0.0693, 'grad_norm': 4.4007110595703125, 'learning_rate': 3.8160875816791155e-05, 'epoch': 8.17}
- 16%|█▋        | 948/5800 [2:40:37<9:23:20,  6.97s/it]score1 tensor([[0.4727],
-        [0.4707],
-        [0.4395],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5156, 0.5234, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:14,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 11:27:14,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.38 | bwd_microstep: 4654.80 | bwd_inner_microstep: 4649.98 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.82
-[2025-01-25 11:27:14,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.35 | bwd: 4654.82 | bwd_inner: 4649.98 | bwd_allreduce: 4.76 | step: 42.84
- 16%|█▋        | 949/5800 [2:40:44<9:22:47,  6.96s/it]                                                      {'loss': 0.0591, 'grad_norm': 3.8190319538116455, 'learning_rate': 3.8156194935813455e-05, 'epoch': 8.18}
- 16%|█▋        | 949/5800 [2:40:44<9:22:47,  6.96s/it]score1 tensor([[0.4766],
-        [0.3691],
-        [0.4980],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.3672, 0.5469, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:21,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 11:27:21,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.10 | bwd_microstep: 4652.60 | bwd_inner_microstep: 4648.02 | bwd_allreduce_microstep: 4.49 | step_microstep: 49.81
-[2025-01-25 11:27:21,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.06 | bwd: 4652.62 | bwd_inner: 4648.02 | bwd_allreduce: 4.54 | step: 49.82
- 16%|█▋        | 950/5800 [2:40:51<9:22:41,  6.96s/it]                                                      {'loss': 0.0557, 'grad_norm': 4.503204822540283, 'learning_rate': 3.815150839341915e-05, 'epoch': 8.19}
- 16%|█▋        | 950/5800 [2:40:51<9:22:41,  6.96s/it]score1 tensor([[0.5273],
-        [0.5156],
-        [0.4902],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4844, 0.5508, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:28,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 11:27:28,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.69 | bwd_microstep: 4646.80 | bwd_inner_microstep: 4642.40 | bwd_allreduce_microstep: 4.32 | step_microstep: 41.12
-[2025-01-25 11:27:28,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.66 | bwd: 4646.83 | bwd_inner: 4642.40 | bwd_allreduce: 4.37 | step: 41.13
- 16%|█▋        | 951/5800 [2:40:58<9:21:43,  6.95s/it]                                                      {'loss': 0.0415, 'grad_norm': 4.078990459442139, 'learning_rate': 3.8146816191069554e-05, 'epoch': 8.2}
- 16%|█▋        | 951/5800 [2:40:58<9:21:43,  6.95s/it]score1 tensor([[0.5703],
-        [0.5547],
-        [0.4844],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6445, 0.5078, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:35,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 11:27:35,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.37 | bwd_microstep: 4644.80 | bwd_inner_microstep: 4640.29 | bwd_allreduce_microstep: 4.44 | step_microstep: 44.35
-[2025-01-25 11:27:35,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.33 | bwd: 4644.82 | bwd_inner: 4640.28 | bwd_allreduce: 4.47 | step: 44.35
- 16%|█▋        | 952/5800 [2:41:05<9:20:56,  6.94s/it]                                                      {'loss': 0.0557, 'grad_norm': 4.431155204772949, 'learning_rate': 3.81421183302278e-05, 'epoch': 8.21}
- 16%|█▋        | 952/5800 [2:41:05<9:20:56,  6.94s/it]score1 tensor([[0.5391],
-        [0.6289],
-        [0.5664],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5586, 0.5117, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:42,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:27:42,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.91 | bwd_microstep: 4659.14 | bwd_inner_microstep: 4654.41 | bwd_allreduce_microstep: 4.65 | step_microstep: 41.90
-[2025-01-25 11:27:42,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.87 | bwd: 4659.16 | bwd_inner: 4654.41 | bwd_allreduce: 4.69 | step: 41.90
- 16%|█▋        | 953/5800 [2:41:12<9:20:35,  6.94s/it]                                                      {'loss': 0.0493, 'grad_norm': 8.876229286193848, 'learning_rate': 3.813741481235877e-05, 'epoch': 8.22}
- 16%|█▋        | 953/5800 [2:41:12<9:20:35,  6.94s/it]score1 tensor([[0.6953],
-        [0.6016],
-        [0.5273],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6094, 0.5039, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:49,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 11:27:49,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.05 | bwd_microstep: 4648.08 | bwd_inner_microstep: 4643.62 | bwd_allreduce_microstep: 4.37 | step_microstep: 41.89
-[2025-01-25 11:27:49,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.00 | bwd: 4648.10 | bwd_inner: 4643.62 | bwd_allreduce: 4.41 | step: 41.89
- 16%|█▋        | 954/5800 [2:41:19<9:20:09,  6.94s/it]                                                      {'loss': 0.0352, 'grad_norm': 4.498268127441406, 'learning_rate': 3.813270563892908e-05, 'epoch': 8.22}
- 16%|█▋        | 954/5800 [2:41:19<9:20:09,  6.94s/it]score1 tensor([[0.5938],
-        [0.5352],
-        [0.6289],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4980, 0.6172, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:27:56,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.36
-[2025-01-25 11:27:56,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.75 | bwd_microstep: 4650.42 | bwd_inner_microstep: 4642.14 | bwd_allreduce_microstep: 8.19 | step_microstep: 40.56
-[2025-01-25 11:27:56,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.72 | bwd: 4650.44 | bwd_inner: 4642.14 | bwd_allreduce: 8.23 | step: 40.56
- 16%|█▋        | 955/5800 [2:41:26<9:19:46,  6.93s/it]                                                      {'loss': 0.0415, 'grad_norm': 8.990232467651367, 'learning_rate': 3.812799081140714e-05, 'epoch': 8.23}
- 16%|█▋        | 955/5800 [2:41:26<9:19:46,  6.93s/it]score1 tensor([[0.6055],
-        [0.5664],
-        [0.6055],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5469, 0.5547, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:03,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 11:28:03,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.42 | bwd_microstep: 4651.05 | bwd_inner_microstep: 4646.07 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.36
-[2025-01-25 11:28:03,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.38 | bwd: 4651.07 | bwd_inner: 4646.07 | bwd_allreduce: 4.92 | step: 43.37
- 16%|█▋        | 956/5800 [2:41:33<9:20:09,  6.94s/it]                                                      {'loss': 0.062, 'grad_norm': 9.177687644958496, 'learning_rate': 3.812327033126311e-05, 'epoch': 8.24}
- 16%|█▋        | 956/5800 [2:41:33<9:20:09,  6.94s/it]score1 tensor([[0.5625],
-        [0.5781],
-        [0.7031],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5391, 0.6875, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:10,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 11:28:10,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.07 | bwd_microstep: 4649.70 | bwd_inner_microstep: 4645.02 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.91
-[2025-01-25 11:28:10,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.04 | bwd: 4649.73 | bwd_inner: 4645.02 | bwd_allreduce: 4.63 | step: 42.92
- 16%|█▋        | 957/5800 [2:41:40<9:19:53,  6.94s/it]                                                      {'loss': 0.0649, 'grad_norm': 9.0274019241333, 'learning_rate': 3.811854419996894e-05, 'epoch': 8.25}
- 16%|█▋        | 957/5800 [2:41:40<9:19:53,  6.94s/it]score1 tensor([[0.6172],
-        [0.5039],
-        [0.5352],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4609, 0.4375, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:17,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 11:28:17,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.12 | bwd_microstep: 4651.29 | bwd_inner_microstep: 4646.01 | bwd_allreduce_microstep: 5.17 | step_microstep: 50.38
-[2025-01-25 11:28:17,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.02 | bwd: 4651.31 | bwd_inner: 4646.01 | bwd_allreduce: 5.23 | step: 50.40
- 17%|█▋        | 958/5800 [2:41:47<9:19:57,  6.94s/it]                                                      {'loss': 0.0605, 'grad_norm': 4.108710765838623, 'learning_rate': 3.81138124189983e-05, 'epoch': 8.26}
- 17%|█▋        | 958/5800 [2:41:47<9:19:57,  6.94s/it]score1 tensor([[0.5977],
-        [0.5273],
-        [0.4375],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.5430, 0.3906, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:24,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 11:28:24,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2176.33 | bwd_microstep: 4652.01 | bwd_inner_microstep: 4645.59 | bwd_allreduce_microstep: 6.11 | step_microstep: 51.45
-[2025-01-25 11:28:24,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2176.26 | bwd: 4652.08 | bwd_inner: 4645.59 | bwd_allreduce: 6.29 | step: 51.42
- 17%|█▋        | 959/5800 [2:41:54<9:20:19,  6.94s/it]                                                      {'loss': 0.0371, 'grad_norm': 0.7421790957450867, 'learning_rate': 3.810907498982664e-05, 'epoch': 8.27}
- 17%|█▋        | 959/5800 [2:41:54<9:20:19,  6.94s/it]score1 tensor([[0.4160],
-        [0.4219],
-        [0.5742],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4551, 0.6562, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:31,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 11:28:31,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.73 | bwd_microstep: 4655.12 | bwd_inner_microstep: 4650.73 | bwd_allreduce_microstep: 4.32 | step_microstep: 42.27
-[2025-01-25 11:28:31,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.68 | bwd: 4655.15 | bwd_inner: 4650.73 | bwd_allreduce: 4.36 | step: 42.28
- 17%|█▋        | 960/5800 [2:42:01<9:20:00,  6.94s/it]                                                      {'loss': 0.0547, 'grad_norm': 8.007049560546875, 'learning_rate': 3.810433191393118e-05, 'epoch': 8.28}
- 17%|█▋        | 960/5800 [2:42:01<9:20:00,  6.94s/it]score1 tensor([[0.5234],
-        [0.3926],
-        [0.4648],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.4941, 0.5781, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:37,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 11:28:37,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.52 | bwd_microstep: 4656.82 | bwd_inner_microstep: 4652.01 | bwd_allreduce_microstep: 4.70 | step_microstep: 46.58
-[2025-01-25 11:28:37,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.48 | bwd: 4656.85 | bwd_inner: 4652.01 | bwd_allreduce: 4.76 | step: 46.58
- 17%|█▋        | 961/5800 [2:42:07<9:19:49,  6.94s/it]                                                      {'loss': 0.1055, 'grad_norm': 8.061861991882324, 'learning_rate': 3.8099583192790896e-05, 'epoch': 8.28}
- 17%|█▋        | 961/5800 [2:42:07<9:19:49,  6.94s/it]score1 tensor([[0.3926],
-        [0.3652],
-        [0.5469],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.4277, 0.6211, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:44,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 11:28:44,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.13 | bwd_microstep: 4655.21 | bwd_inner_microstep: 4650.33 | bwd_allreduce_microstep: 4.77 | step_microstep: 45.73
-[2025-01-25 11:28:44,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.10 | bwd: 4655.23 | bwd_inner: 4650.33 | bwd_allreduce: 4.83 | step: 45.74
- 17%|█▋        | 962/5800 [2:42:14<9:19:47,  6.94s/it]                                                      {'loss': 0.0703, 'grad_norm': 7.600094318389893, 'learning_rate': 3.8094828827886516e-05, 'epoch': 8.29}
- 17%|█▋        | 962/5800 [2:42:14<9:19:47,  6.94s/it]score1 tensor([[0.4531],
-        [0.4375],
-        [0.3086],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.5195, 0.4004, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:51,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.72 | optimizer_step: 4.37
-[2025-01-25 11:28:51,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.63 | bwd_microstep: 4649.64 | bwd_inner_microstep: 4642.21 | bwd_allreduce_microstep: 7.19 | step_microstep: 86.94
-[2025-01-25 11:28:51,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.59 | bwd: 4649.70 | bwd_inner: 4642.21 | bwd_allreduce: 7.32 | step: 86.96
- 17%|█▋        | 963/5800 [2:42:21<9:21:22,  6.96s/it]                                                      {'loss': 0.0903, 'grad_norm': 3.506446361541748, 'learning_rate': 3.809006882070053e-05, 'epoch': 8.3}
- 17%|█▋        | 963/5800 [2:42:21<9:21:22,  6.96s/it]score1 tensor([[0.4277],
-        [0.4316],
-        [0.5625],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4512, 0.5273, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:28:58,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 11:28:58,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.12 | bwd_microstep: 4641.43 | bwd_inner_microstep: 4637.21 | bwd_allreduce_microstep: 4.11 | step_microstep: 41.23
-[2025-01-25 11:28:58,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.01 | bwd: 4641.46 | bwd_inner: 4637.21 | bwd_allreduce: 4.17 | step: 41.24
- 17%|█▋        | 964/5800 [2:42:28<9:20:21,  6.95s/it]                                                      {'loss': 0.0352, 'grad_norm': 0.6045628786087036, 'learning_rate': 3.80853031727172e-05, 'epoch': 8.31}
- 17%|█▋        | 964/5800 [2:42:28<9:20:21,  6.95s/it]score1 tensor([[0.5156],
-        [0.5898],
-        [0.4961],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5820, 0.5664, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:05,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 11:29:05,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.57 | bwd_microstep: 4645.66 | bwd_inner_microstep: 4641.05 | bwd_allreduce_microstep: 4.52 | step_microstep: 41.96
-[2025-01-25 11:29:05,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.53 | bwd: 4645.68 | bwd_inner: 4641.05 | bwd_allreduce: 4.57 | step: 41.97
- 17%|█▋        | 965/5800 [2:42:35<9:19:35,  6.94s/it]                                                      {'loss': 0.0337, 'grad_norm': 3.8792800903320312, 'learning_rate': 3.808053188542254e-05, 'epoch': 8.32}
- 17%|█▋        | 965/5800 [2:42:35<9:19:35,  6.94s/it]score1 tensor([[0.3867],
-        [0.5625],
-        [0.5664],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.5781, 0.5625, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:12,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 11:29:12,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.15 | bwd_microstep: 4654.50 | bwd_inner_microstep: 4649.73 | bwd_allreduce_microstep: 4.67 | step_microstep: 43.76
-[2025-01-25 11:29:12,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.12 | bwd: 4654.54 | bwd_inner: 4649.73 | bwd_allreduce: 4.72 | step: 43.77
- 17%|█▋        | 966/5800 [2:42:42<9:19:26,  6.94s/it]                                                      {'loss': 0.0234, 'grad_norm': 3.8642935752868652, 'learning_rate': 3.8075754960304305e-05, 'epoch': 8.33}
- 17%|█▋        | 966/5800 [2:42:42<9:19:26,  6.94s/it]score1 tensor([[0.5703],
-        [0.6797],
-        [0.6484],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6016, 0.6094, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:19,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 11:29:19,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.76 | bwd_microstep: 4644.59 | bwd_inner_microstep: 4639.75 | bwd_allreduce_microstep: 4.74 | step_microstep: 41.38
-[2025-01-25 11:29:19,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.72 | bwd: 4644.61 | bwd_inner: 4639.75 | bwd_allreduce: 4.79 | step: 41.39
- 17%|█▋        | 967/5800 [2:42:49<9:18:56,  6.94s/it]                                                      {'loss': 0.0625, 'grad_norm': 8.9351167678833, 'learning_rate': 3.807097239885203e-05, 'epoch': 8.34}
- 17%|█▋        | 967/5800 [2:42:49<9:18:56,  6.94s/it]score1 tensor([[0.5273],
-        [0.5117],
-        [0.5469],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4043, 0.4062, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0923, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:26,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 11:29:26,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.42 | bwd_microstep: 4653.51 | bwd_inner_microstep: 4648.67 | bwd_allreduce_microstep: 4.74 | step_microstep: 45.50
-[2025-01-25 11:29:26,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.38 | bwd: 4653.53 | bwd_inner: 4648.67 | bwd_allreduce: 4.79 | step: 45.51
- 17%|█▋        | 968/5800 [2:42:56<9:18:41,  6.94s/it]                                                      {'loss': 0.0923, 'grad_norm': 8.368754386901855, 'learning_rate': 3.8066184202557014e-05, 'epoch': 8.34}
- 17%|█▋        | 968/5800 [2:42:56<9:18:41,  6.94s/it]score1 tensor([[0.5195],
-        [0.5625],
-        [0.5703],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5547, 0.5117, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:33,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 11:29:33,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.68 | bwd_microstep: 4656.07 | bwd_inner_microstep: 4651.30 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.86
-[2025-01-25 11:29:33,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.64 | bwd: 4656.10 | bwd_inner: 4651.30 | bwd_allreduce: 4.72 | step: 41.86
- 17%|█▋        | 969/5800 [2:43:03<9:18:34,  6.94s/it]                                                      {'loss': 0.0659, 'grad_norm': 4.4827752113342285, 'learning_rate': 3.806139037291228e-05, 'epoch': 8.35}
- 17%|█▋        | 969/5800 [2:43:03<9:18:34,  6.94s/it]score1 tensor([[0.5469],
-        [0.5234],
-        [0.5156],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5195, 0.4570, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:40,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 11:29:40,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.56 | bwd_microstep: 4653.24 | bwd_inner_microstep: 4648.51 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.75
-[2025-01-25 11:29:40,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.53 | bwd: 4653.26 | bwd_inner: 4648.52 | bwd_allreduce: 4.68 | step: 41.76
- 17%|█▋        | 970/5800 [2:43:10<9:18:21,  6.94s/it]                                                      {'loss': 0.0518, 'grad_norm': 8.296395301818848, 'learning_rate': 3.805659091141263e-05, 'epoch': 8.36}
- 17%|█▋        | 970/5800 [2:43:10<9:18:21,  6.94s/it]score1 tensor([[0.5039],
-        [0.5039],
-        [0.4727],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4922, 0.5000, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:47,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 11:29:47,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.36 | bwd_microstep: 4648.20 | bwd_inner_microstep: 4643.52 | bwd_allreduce_microstep: 4.59 | step_microstep: 42.64
-[2025-01-25 11:29:47,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.34 | bwd: 4648.23 | bwd_inner: 4643.52 | bwd_allreduce: 4.64 | step: 42.65
- 17%|█▋        | 971/5800 [2:43:17<9:18:01,  6.93s/it]                                                      {'loss': 0.0278, 'grad_norm': 4.172493934631348, 'learning_rate': 3.8051785819554635e-05, 'epoch': 8.37}
- 17%|█▋        | 971/5800 [2:43:17<9:18:01,  6.93s/it]score1 tensor([[0.4512],
-        [0.5078],
-        [0.4570],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.6484, 0.4551, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:29:54,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 16.08 | optimizer_step: 4.36
-[2025-01-25 11:29:54,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.91 | bwd_microstep: 4650.09 | bwd_inner_microstep: 4644.98 | bwd_allreduce_microstep: 4.98 | step_microstep: 56.42
-[2025-01-25 11:29:54,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.85 | bwd: 4650.11 | bwd_inner: 4644.98 | bwd_allreduce: 5.05 | step: 56.43
- 17%|█▋        | 972/5800 [2:43:24<9:18:35,  6.94s/it]                                                      {'loss': 0.0615, 'grad_norm': 0.42170068621635437, 'learning_rate': 3.804697509883659e-05, 'epoch': 8.38}
- 17%|█▋        | 972/5800 [2:43:24<9:18:35,  6.94s/it]score1 tensor([[0.4316],
-        [0.5391],
-        [0.3379],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6445, 0.3613, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:01,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.41 | optimizer_step: 4.36
-[2025-01-25 11:30:01,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.06 | bwd_microstep: 4649.48 | bwd_inner_microstep: 4643.17 | bwd_allreduce_microstep: 6.16 | step_microstep: 55.63
-[2025-01-25 11:30:01,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.02 | bwd: 4649.53 | bwd_inner: 4643.17 | bwd_allreduce: 6.24 | step: 55.62
- 17%|█▋        | 973/5800 [2:43:31<9:19:01,  6.95s/it]                                                      {'loss': 0.0679, 'grad_norm': 7.836183071136475, 'learning_rate': 3.8042158750758564e-05, 'epoch': 8.39}
- 17%|█▋        | 973/5800 [2:43:31<9:19:01,  6.95s/it]score1 tensor([[0.4180],
-        [0.4863],
-        [0.4297],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6562, 0.4219, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1138, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:08,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 11:30:08,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2176.03 | bwd_microstep: 4649.53 | bwd_inner_microstep: 4644.73 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.70
-[2025-01-25 11:30:08,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2175.87 | bwd: 4649.56 | bwd_inner: 4644.73 | bwd_allreduce: 4.76 | step: 42.71
- 17%|█▋        | 974/5800 [2:43:38<9:19:14,  6.95s/it]                                                      {'loss': 0.1138, 'grad_norm': 3.8228750228881836, 'learning_rate': 3.803733677682239e-05, 'epoch': 8.4}
- 17%|█▋        | 974/5800 [2:43:38<9:19:14,  6.95s/it]score1 tensor([[0.4355],
-        [0.4199],
-        [0.3906],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4844, 0.3887, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:15,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 11:30:15,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.49 | bwd_microstep: 4653.00 | bwd_inner_microstep: 4647.89 | bwd_allreduce_microstep: 5.03 | step_microstep: 42.91
-[2025-01-25 11:30:15,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.45 | bwd: 4653.02 | bwd_inner: 4647.89 | bwd_allreduce: 5.07 | step: 42.92
- 17%|█▋        | 975/5800 [2:43:45<9:18:40,  6.95s/it]                                                      {'loss': 0.0645, 'grad_norm': 3.7048380374908447, 'learning_rate': 3.8032509178531625e-05, 'epoch': 8.41}
- 17%|█▋        | 975/5800 [2:43:45<9:18:40,  6.95s/it]score1 tensor([[0.3984],
-        [0.3477],
-        [0.4707],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4434, 0.6094, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0811, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:22,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 11:30:22,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.05 | bwd_microstep: 4646.27 | bwd_inner_microstep: 4640.98 | bwd_allreduce_microstep: 5.18 | step_microstep: 41.13
-[2025-01-25 11:30:22,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.02 | bwd: 4646.30 | bwd_inner: 4640.98 | bwd_allreduce: 5.24 | step: 41.13
- 17%|█▋        | 976/5800 [2:43:52<9:18:19,  6.94s/it]                                                      {'loss': 0.0811, 'grad_norm': 7.226808547973633, 'learning_rate': 3.802767595739161e-05, 'epoch': 8.41}
- 17%|█▋        | 976/5800 [2:43:52<9:18:19,  6.94s/it]score1 tensor([[0.4219],
-        [0.4746],
-        [0.4648],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5547, 0.4961, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:29,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 11:30:29,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.06 | bwd_microstep: 4651.40 | bwd_inner_microstep: 4646.38 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.48
-[2025-01-25 11:30:29,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.02 | bwd: 4651.42 | bwd_inner: 4646.38 | bwd_allreduce: 4.97 | step: 44.48
- 17%|█▋        | 977/5800 [2:43:59<9:18:10,  6.94s/it]                                                      {'loss': 0.0562, 'grad_norm': 7.782418251037598, 'learning_rate': 3.8022837114909424e-05, 'epoch': 8.42}
- 17%|█▋        | 977/5800 [2:43:59<9:18:10,  6.94s/it]score1 tensor([[0.4473],
-        [0.3984],
-        [0.4824],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4199, 0.5156, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:36,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 11:30:36,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.79 | bwd_microstep: 4644.57 | bwd_inner_microstep: 4639.54 | bwd_allreduce_microstep: 4.94 | step_microstep: 46.45
-[2025-01-25 11:30:36,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.73 | bwd: 4644.60 | bwd_inner: 4639.54 | bwd_allreduce: 4.99 | step: 46.46
- 17%|█▋        | 978/5800 [2:44:06<9:17:50,  6.94s/it]                                                      {'loss': 0.0498, 'grad_norm': 7.758822441101074, 'learning_rate': 3.8017992652593904e-05, 'epoch': 8.43}
- 17%|█▋        | 978/5800 [2:44:06<9:17:50,  6.94s/it]score1 tensor([[0.4648],
-        [0.5156],
-        [0.3887],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4961, 0.4004, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:42,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 11:30:42,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.85 | bwd_microstep: 4654.37 | bwd_inner_microstep: 4649.43 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.17
-[2025-01-25 11:30:42,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.82 | bwd: 4654.40 | bwd_inner: 4649.43 | bwd_allreduce: 4.89 | step: 42.18
- 17%|█▋        | 979/5800 [2:44:12<9:17:37,  6.94s/it]                                                      {'loss': 0.0215, 'grad_norm': 3.9445431232452393, 'learning_rate': 3.8013142571955635e-05, 'epoch': 8.44}
- 17%|█▋        | 979/5800 [2:44:12<9:17:37,  6.94s/it]score1 tensor([[0.5625],
-        [0.5156],
-        [0.5234],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4414, 0.4336, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:49,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 11:30:49,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.09 | bwd_microstep: 4650.65 | bwd_inner_microstep: 4645.30 | bwd_allreduce_microstep: 5.20 | step_microstep: 47.20
-[2025-01-25 11:30:49,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.06 | bwd: 4650.69 | bwd_inner: 4645.30 | bwd_allreduce: 5.29 | step: 47.20
- 17%|█▋        | 980/5800 [2:44:19<9:17:39,  6.94s/it]                                                      {'loss': 0.0498, 'grad_norm': 4.122282981872559, 'learning_rate': 3.800828687450696e-05, 'epoch': 8.45}
- 17%|█▋        | 980/5800 [2:44:19<9:17:39,  6.94s/it]score1 tensor([[0.5508],
-        [0.4102],
-        [0.5938],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.3086, 0.6406, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:30:56,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 11:30:56,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.76 | bwd_microstep: 4656.44 | bwd_inner_microstep: 4651.33 | bwd_allreduce_microstep: 5.02 | step_microstep: 58.38
-[2025-01-25 11:30:56,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.72 | bwd: 4656.47 | bwd_inner: 4651.33 | bwd_allreduce: 5.07 | step: 58.39
- 17%|█▋        | 981/5800 [2:44:26<9:17:59,  6.95s/it]                                                      {'loss': 0.0645, 'grad_norm': 3.882725715637207, 'learning_rate': 3.800342556176196e-05, 'epoch': 8.46}
- 17%|█▋        | 981/5800 [2:44:26<9:17:59,  6.95s/it]score1 tensor([[0.5820],
-        [0.6211],
-        [0.6133],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4863, 0.6016, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1030, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:03,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 11:31:03,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.02 | bwd_microstep: 4648.47 | bwd_inner_microstep: 4643.40 | bwd_allreduce_microstep: 4.83 | step_microstep: 75.02
-[2025-01-25 11:31:03,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.95 | bwd: 4648.53 | bwd_inner: 4643.40 | bwd_allreduce: 4.96 | step: 74.99
- 17%|█▋        | 982/5800 [2:44:33<9:19:02,  6.96s/it]                                                      {'loss': 0.103, 'grad_norm': 8.735762596130371, 'learning_rate': 3.799855863523648e-05, 'epoch': 8.47}
- 17%|█▋        | 982/5800 [2:44:33<9:19:02,  6.96s/it]score1 tensor([[0.6523],
-        [0.6211],
-        [0.5625],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5312, 0.4082, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:10,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 11:31:10,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.57 | bwd_microstep: 4652.89 | bwd_inner_microstep: 4647.70 | bwd_allreduce_microstep: 5.07 | step_microstep: 43.57
-[2025-01-25 11:31:10,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.54 | bwd: 4652.91 | bwd_inner: 4647.70 | bwd_allreduce: 5.13 | step: 43.58
- 17%|█▋        | 983/5800 [2:44:40<9:18:43,  6.96s/it]                                                      {'loss': 0.1387, 'grad_norm': 8.842575073242188, 'learning_rate': 3.799368609644813e-05, 'epoch': 8.47}
- 17%|█▋        | 983/5800 [2:44:40<9:18:43,  6.96s/it]score1 tensor([[0.6211],
-        [0.5781],
-        [0.5273],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4785, 0.4375, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0947, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:17,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 11:31:17,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.33 | bwd_microstep: 4645.39 | bwd_inner_microstep: 4640.09 | bwd_allreduce_microstep: 5.19 | step_microstep: 48.34
-[2025-01-25 11:31:17,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.30 | bwd: 4645.41 | bwd_inner: 4640.09 | bwd_allreduce: 5.24 | step: 48.35
- 17%|█▋        | 984/5800 [2:44:47<9:18:05,  6.95s/it]                                                      {'loss': 0.0947, 'grad_norm': 8.481590270996094, 'learning_rate': 3.798880794691623e-05, 'epoch': 8.48}
- 17%|█▋        | 984/5800 [2:44:47<9:18:05,  6.95s/it]score1 tensor([[0.5391],
-        [0.6016],
-        [0.5625],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5625, 0.4531, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0669, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:24,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 11:31:24,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.37 | bwd_microstep: 4645.04 | bwd_inner_microstep: 4639.99 | bwd_allreduce_microstep: 4.98 | step_microstep: 49.42
-[2025-01-25 11:31:24,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.32 | bwd: 4645.06 | bwd_inner: 4639.99 | bwd_allreduce: 5.01 | step: 49.43
- 17%|█▋        | 985/5800 [2:44:54<9:17:25,  6.95s/it]                                                      {'loss': 0.0669, 'grad_norm': 8.593489646911621, 'learning_rate': 3.798392418816187e-05, 'epoch': 8.49}
- 17%|█▋        | 985/5800 [2:44:54<9:17:25,  6.95s/it]score1 tensor([[0.5117],
-        [0.5625],
-        [0.5352],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.5625, 0.5430, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:31,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 11:31:31,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.31 | bwd_microstep: 4598.48 | bwd_inner_microstep: 4590.59 | bwd_allreduce_microstep: 7.70 | step_microstep: 65.97
-[2025-01-25 11:31:31,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.29 | bwd: 4598.53 | bwd_inner: 4590.59 | bwd_allreduce: 7.80 | step: 65.99
- 17%|█▋        | 986/5800 [2:45:01<9:16:53,  6.94s/it]                                                      {'loss': 0.0532, 'grad_norm': 1.9616312980651855, 'learning_rate': 3.797903482170791e-05, 'epoch': 8.5}
- 17%|█▋        | 986/5800 [2:45:01<9:16:53,  6.94s/it]score1 tensor([[0.4980],
-        [0.4570],
-        [0.4961],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4434, 0.5508, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:38,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.69
-[2025-01-25 11:31:38,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.11 | bwd_microstep: 4648.37 | bwd_inner_microstep: 4643.39 | bwd_allreduce_microstep: 4.90 | step_microstep: 47.61
-[2025-01-25 11:31:38,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.01 | bwd: 4648.39 | bwd_inner: 4643.39 | bwd_allreduce: 4.94 | step: 47.59
- 17%|█▋        | 987/5800 [2:45:08<9:17:09,  6.95s/it]                                                      {'loss': 0.041, 'grad_norm': 0.3028724491596222, 'learning_rate': 3.7974139849078924e-05, 'epoch': 8.51}
- 17%|█▋        | 987/5800 [2:45:08<9:17:09,  6.95s/it]score1 tensor([[0.5195],
-        [0.4453],
-        [0.4844],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4277, 0.5273, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:45,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.01 | optimizer_gradients: 3.80 | optimizer_step: 4.41
-[2025-01-25 11:31:45,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2265.95 | bwd_microstep: 4677.17 | bwd_inner_microstep: 4670.91 | bwd_allreduce_microstep: 6.08 | step_microstep: 47.06
-[2025-01-25 11:31:45,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2265.92 | bwd: 4677.22 | bwd_inner: 4670.91 | bwd_allreduce: 6.18 | step: 47.07
- 17%|█▋        | 988/5800 [2:45:15<9:20:07,  6.98s/it]                                                      {'loss': 0.0566, 'grad_norm': 4.1013617515563965, 'learning_rate': 3.796923927180126e-05, 'epoch': 8.52}
- 17%|█▋        | 988/5800 [2:45:15<9:20:07,  6.98s/it]score1 tensor([[0.4980],
-        [0.4160],
-        [0.4688],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3789, 0.4453, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:52,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.29 | optimizer_step: 4.45
-[2025-01-25 11:31:52,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.15 | bwd_microstep: 4662.02 | bwd_inner_microstep: 4657.18 | bwd_allreduce_microstep: 4.72 | step_microstep: 49.11
-[2025-01-25 11:31:52,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.11 | bwd: 4662.04 | bwd_inner: 4657.18 | bwd_allreduce: 4.77 | step: 49.16
- 17%|█▋        | 989/5800 [2:45:22<9:19:49,  6.98s/it]                                                      {'loss': 0.0405, 'grad_norm': 3.539870262145996, 'learning_rate': 3.7964333091402995e-05, 'epoch': 8.53}
- 17%|█▋        | 989/5800 [2:45:22<9:19:49,  6.98s/it]score1 tensor([[0.4473],
-        [0.4375],
-        [0.5195],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5508, 0.3652, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:31:59,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 11:31:59,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2182.89 | bwd_microstep: 4649.38 | bwd_inner_microstep: 4644.49 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.32
-[2025-01-25 11:31:59,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2182.86 | bwd: 4649.40 | bwd_inner: 4644.49 | bwd_allreduce: 4.85 | step: 44.33
- 17%|█▋        | 990/5800 [2:45:29<9:19:22,  6.98s/it]                                                      {'loss': 0.1211, 'grad_norm': 3.692438840866089, 'learning_rate': 3.7959421309413965e-05, 'epoch': 8.53}
- 17%|█▋        | 990/5800 [2:45:29<9:19:22,  6.98s/it]score1 tensor([[0.3164],
-        [0.4805],
-        [0.4590],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5469, 0.6289, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0942, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:06,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 11:32:06,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.50 | bwd_microstep: 4649.20 | bwd_inner_microstep: 4644.87 | bwd_allreduce_microstep: 4.27 | step_microstep: 37.47
-[2025-01-25 11:32:06,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.43 | bwd: 4649.22 | bwd_inner: 4644.87 | bwd_allreduce: 4.30 | step: 37.48
- 17%|█▋        | 991/5800 [2:45:36<9:18:08,  6.96s/it]                                                      {'loss': 0.0942, 'grad_norm': 7.211051940917969, 'learning_rate': 3.795450392736575e-05, 'epoch': 8.54}
- 17%|█▋        | 991/5800 [2:45:36<9:18:08,  6.96s/it]score1 tensor([[0.4141],
-        [0.4316],
-        [0.4570],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4473, 0.4648, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:13,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 11:32:13,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.93 | bwd_microstep: 4651.82 | bwd_inner_microstep: 4643.32 | bwd_allreduce_microstep: 8.27 | step_microstep: 76.90
-[2025-01-25 11:32:13,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.89 | bwd: 4651.88 | bwd_inner: 4643.32 | bwd_allreduce: 8.40 | step: 76.91
- 17%|█▋        | 992/5800 [2:45:43<9:18:45,  6.97s/it]                                                      {'loss': 0.0605, 'grad_norm': 7.514078140258789, 'learning_rate': 3.794958094679166e-05, 'epoch': 8.55}
- 17%|█▋        | 992/5800 [2:45:43<9:18:45,  6.97s/it]score1 tensor([[0.4277],
-        [0.4590],
-        [0.4102],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5547, 0.4512, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:20,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 11:32:20,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.15 | bwd_microstep: 4647.80 | bwd_inner_microstep: 4642.77 | bwd_allreduce_microstep: 4.82 | step_microstep: 45.75
-[2025-01-25 11:32:20,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.12 | bwd: 4647.78 | bwd_inner: 4642.72 | bwd_allreduce: 4.85 | step: 45.76
- 17%|█▋        | 993/5800 [2:45:50<9:18:21,  6.97s/it]                                                      {'loss': 0.0566, 'grad_norm': 3.76291823387146, 'learning_rate': 3.7944652369226796e-05, 'epoch': 8.56}
- 17%|█▋        | 993/5800 [2:45:50<9:18:21,  6.97s/it]score1 tensor([[0.4453],
-        [0.4355],
-        [0.4844],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4238, 0.5664, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:27,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 11:32:27,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.17 | bwd_microstep: 4646.06 | bwd_inner_microstep: 4641.28 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.28
-[2025-01-25 11:32:27,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.13 | bwd: 4646.09 | bwd_inner: 4641.28 | bwd_allreduce: 4.73 | step: 42.29
- 17%|█▋        | 994/5800 [2:45:57<9:18:09,  6.97s/it]                                                      {'loss': 0.042, 'grad_norm': 3.849041223526001, 'learning_rate': 3.7939718196207946e-05, 'epoch': 8.57}
- 17%|█▋        | 994/5800 [2:45:57<9:18:09,  6.97s/it]score1 tensor([[0.4082],
-        [0.5117],
-        [0.4844],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5312, 0.4844, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:34,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 11:32:34,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.17 | bwd_microstep: 4598.87 | bwd_inner_microstep: 4590.98 | bwd_allreduce_microstep: 7.65 | step_microstep: 59.22
-[2025-01-25 11:32:34,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.14 | bwd: 4598.94 | bwd_inner: 4590.98 | bwd_allreduce: 7.76 | step: 59.20
- 17%|█▋        | 995/5800 [2:46:04<9:16:48,  6.95s/it]                                                      {'loss': 0.0225, 'grad_norm': 1.8408865928649902, 'learning_rate': 3.793477842927368e-05, 'epoch': 8.58}
- 17%|█▋        | 995/5800 [2:46:04<9:16:48,  6.95s/it]score1 tensor([[0.5586],
-        [0.4668],
-        [0.5117],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4473, 0.5469, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:41,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 11:32:41,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.64 | bwd_microstep: 4653.46 | bwd_inner_microstep: 4648.21 | bwd_allreduce_microstep: 5.13 | step_microstep: 42.38
-[2025-01-25 11:32:41,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.61 | bwd: 4653.49 | bwd_inner: 4648.21 | bwd_allreduce: 5.20 | step: 42.39
- 17%|█▋        | 996/5800 [2:46:11<9:16:27,  6.95s/it]                                                      {'loss': 0.0571, 'grad_norm': 3.893247604370117, 'learning_rate': 3.792983306996431e-05, 'epoch': 8.59}
- 17%|█▋        | 996/5800 [2:46:11<9:16:27,  6.95s/it]score1 tensor([[0.5820],
-        [0.6250],
-        [0.5469],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6055, 0.6055, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:48,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:32:48,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.57 | bwd_microstep: 4643.16 | bwd_inner_microstep: 4638.64 | bwd_allreduce_microstep: 4.44 | step_microstep: 41.79
-[2025-01-25 11:32:48,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.50 | bwd: 4643.19 | bwd_inner: 4638.64 | bwd_allreduce: 4.47 | step: 41.80
- 17%|█▋        | 997/5800 [2:46:18<9:15:49,  6.94s/it]                                                      {'loss': 0.0552, 'grad_norm': 4.3495402336120605, 'learning_rate': 3.792488211982188e-05, 'epoch': 8.59}
- 17%|█▋        | 997/5800 [2:46:18<9:15:49,  6.94s/it]score1 tensor([[0.5508],
-        [0.6367],
-        [0.5703],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5352, 0.5195, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:32:55,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 11:32:55,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.65 | bwd_microstep: 4651.32 | bwd_inner_microstep: 4646.67 | bwd_allreduce_microstep: 4.56 | step_microstep: 43.41
-[2025-01-25 11:32:55,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4651.35 | bwd_inner: 4646.67 | bwd_allreduce: 4.61 | step: 43.42
- 17%|█▋        | 998/5800 [2:46:25<9:15:22,  6.94s/it]                                                      {'loss': 0.0737, 'grad_norm': 8.574216842651367, 'learning_rate': 3.791992558039018e-05, 'epoch': 8.6}
- 17%|█▋        | 998/5800 [2:46:25<9:15:22,  6.94s/it]score1 tensor([[0.5703],
-        [0.4297],
-        [0.5859],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.4043, 0.5430, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:02,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 11:33:02,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.62 | bwd_microstep: 4649.41 | bwd_inner_microstep: 4644.12 | bwd_allreduce_microstep: 5.19 | step_microstep: 46.48
-[2025-01-25 11:33:02,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.58 | bwd: 4649.44 | bwd_inner: 4644.12 | bwd_allreduce: 5.24 | step: 46.49
- 17%|█▋        | 999/5800 [2:46:32<9:15:31,  6.94s/it]                                                      {'loss': 0.0518, 'grad_norm': 8.1514892578125, 'learning_rate': 3.791496345321474e-05, 'epoch': 8.61}
- 17%|█▋        | 999/5800 [2:46:32<9:15:31,  6.94s/it]score1 tensor([[0.5273],
-        [0.6094],
-        [0.5664],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5391, 0.4844, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:09,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 11:33:09,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.29 | bwd_microstep: 4645.87 | bwd_inner_microstep: 4640.83 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.24
-[2025-01-25 11:33:09,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.25 | bwd: 4645.89 | bwd_inner: 4640.83 | bwd_allreduce: 4.98 | step: 43.24
- 17%|█▋        | 1000/5800 [2:46:39<9:15:06,  6.94s/it]                                                       {'loss': 0.0732, 'grad_norm': 8.341894149780273, 'learning_rate': 3.790999573984285e-05, 'epoch': 8.62}
- 17%|█▋        | 1000/5800 [2:46:39<9:15:06,  6.94s/it]score1 tensor([[0.5039],
-        [0.5195],
-        [0.5430],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4355, 0.5273, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:15,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.16 | optimizer_step: 4.36
-[2025-01-25 11:33:15,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.56 | bwd_microstep: 4644.64 | bwd_inner_microstep: 4639.49 | bwd_allreduce_microstep: 5.02 | step_microstep: 42.60
-[2025-01-25 11:33:15,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.53 | bwd: 4644.66 | bwd_inner: 4639.49 | bwd_allreduce: 5.09 | step: 42.61
- 17%|█▋        | 1001/5800 [2:46:45<9:14:41,  6.94s/it]                                                       {'loss': 0.0522, 'grad_norm': 4.07791805267334, 'learning_rate': 3.790502244182352e-05, 'epoch': 8.63}
- 17%|█▋        | 1001/5800 [2:46:45<9:14:41,  6.94s/it]score1 tensor([[0.4629],
-        [0.5898],
-        [0.5508],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.6133, 0.5586, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:22,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 11:33:22,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.86 | bwd_microstep: 4652.46 | bwd_inner_microstep: 4647.19 | bwd_allreduce_microstep: 5.17 | step_microstep: 44.62
-[2025-01-25 11:33:22,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.83 | bwd: 4652.48 | bwd_inner: 4647.19 | bwd_allreduce: 5.22 | step: 44.63
- 17%|█▋        | 1002/5800 [2:46:52<9:14:36,  6.94s/it]                                                       {'loss': 0.0278, 'grad_norm': 4.334601402282715, 'learning_rate': 3.790004356070752e-05, 'epoch': 8.64}
- 17%|█▋        | 1002/5800 [2:46:52<9:14:36,  6.94s/it]score1 tensor([[0.6016],
-        [0.3711],
-        [0.4258],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.3418, 0.4219, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:29,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.36
-[2025-01-25 11:33:29,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.38 | bwd_microstep: 4643.33 | bwd_inner_microstep: 4638.38 | bwd_allreduce_microstep: 4.81 | step_microstep: 46.64
-[2025-01-25 11:33:29,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.34 | bwd: 4643.39 | bwd_inner: 4638.38 | bwd_allreduce: 4.88 | step: 46.64
- 17%|█▋        | 1003/5800 [2:46:59<9:14:34,  6.94s/it]                                                       {'loss': 0.0361, 'grad_norm': 1.3500674962997437, 'learning_rate': 3.7895059098047333e-05, 'epoch': 8.65}
- 17%|█▋        | 1003/5800 [2:46:59<9:14:34,  6.94s/it]score1 tensor([[0.4824],
-        [0.4277],
-        [0.4414],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3750, 0.4727, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:36,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.65 | optimizer_step: 4.73
-[2025-01-25 11:33:36,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.64 | bwd_microstep: 4644.22 | bwd_inner_microstep: 4636.17 | bwd_allreduce_microstep: 7.85 | step_microstep: 73.11
-[2025-01-25 11:33:36,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.54 | bwd: 4644.27 | bwd_inner: 4636.17 | bwd_allreduce: 7.95 | step: 73.13
- 17%|█▋        | 1004/5800 [2:47:06<9:16:12,  6.96s/it]                                                       {'loss': 0.0356, 'grad_norm': 3.7673985958099365, 'learning_rate': 3.789006905539722e-05, 'epoch': 8.66}
- 17%|█▋        | 1004/5800 [2:47:06<9:16:12,  6.96s/it]score1 tensor([[0.5430],
-        [0.5391],
-        [0.4785],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6133, 0.4023, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0767, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:43,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 11:33:43,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.86 | bwd_microstep: 4641.30 | bwd_inner_microstep: 4636.47 | bwd_allreduce_microstep: 4.73 | step_microstep: 44.11
-[2025-01-25 11:33:43,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.82 | bwd: 4641.32 | bwd_inner: 4636.47 | bwd_allreduce: 4.78 | step: 44.12
- 17%|█▋        | 1005/5800 [2:47:13<9:15:50,  6.96s/it]                                                       {'loss': 0.0767, 'grad_norm': 4.258723735809326, 'learning_rate': 3.7885073434313156e-05, 'epoch': 8.66}
- 17%|█▋        | 1005/5800 [2:47:13<9:15:50,  6.96s/it]score1 tensor([[0.4492],
-        [0.5625],
-        [0.5156],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6445, 0.5742, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:50,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 11:33:50,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.14 | bwd_microstep: 4641.45 | bwd_inner_microstep: 4636.43 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.02
-[2025-01-25 11:33:50,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.10 | bwd: 4641.48 | bwd_inner: 4636.43 | bwd_allreduce: 4.98 | step: 44.03
- 17%|█▋        | 1006/5800 [2:47:20<9:15:16,  6.95s/it]                                                       {'loss': 0.0532, 'grad_norm': 7.951792240142822, 'learning_rate': 3.788007223635286e-05, 'epoch': 8.67}
- 17%|█▋        | 1006/5800 [2:47:20<9:15:16,  6.95s/it]score1 tensor([[0.4297],
-        [0.5508],
-        [0.4688],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5977, 0.5664, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0635, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:33:57,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 11:33:57,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.89 | bwd_microstep: 4653.04 | bwd_inner_microstep: 4648.12 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.56
-[2025-01-25 11:33:57,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.86 | bwd: 4653.06 | bwd_inner: 4648.12 | bwd_allreduce: 4.88 | step: 42.57
- 17%|█▋        | 1007/5800 [2:47:27<9:14:52,  6.95s/it]                                                       {'loss': 0.0635, 'grad_norm': 4.17120885848999, 'learning_rate': 3.7875065463075796e-05, 'epoch': 8.68}
- 17%|█▋        | 1007/5800 [2:47:27<9:14:52,  6.95s/it]score1 tensor([[0.4805],
-        [0.4121],
-        [0.4590],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4609, 0.4590, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:04,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:34:04,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.52 | bwd_microstep: 4591.23 | bwd_inner_microstep: 4585.49 | bwd_allreduce_microstep: 5.64 | step_microstep: 53.91
-[2025-01-25 11:34:04,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.47 | bwd: 4591.25 | bwd_inner: 4585.49 | bwd_allreduce: 5.69 | step: 53.92
- 17%|█▋        | 1008/5800 [2:47:34<9:13:41,  6.93s/it]                                                       {'loss': 0.0181, 'grad_norm': 1.8710788488388062, 'learning_rate': 3.787005311604317e-05, 'epoch': 8.69}
- 17%|█▋        | 1008/5800 [2:47:34<9:13:41,  6.93s/it]score1 tensor([[0.3867],
-        [0.4453],
-        [0.5273],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5000, 0.5742, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:11,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.03 | optimizer_step: 4.36
-[2025-01-25 11:34:11,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.67 | bwd_microstep: 4656.04 | bwd_inner_microstep: 4648.91 | bwd_allreduce_microstep: 7.04 | step_microstep: 65.87
-[2025-01-25 11:34:11,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.64 | bwd: 4656.06 | bwd_inner: 4648.91 | bwd_allreduce: 7.08 | step: 65.88
- 17%|█▋        | 1009/5800 [2:47:41<9:14:45,  6.95s/it]                                                       {'loss': 0.0601, 'grad_norm': 7.380353927612305, 'learning_rate': 3.786503519681789e-05, 'epoch': 8.7}
- 17%|█▋        | 1009/5800 [2:47:41<9:14:45,  6.95s/it]score1 tensor([[0.5508],
-        [0.4492],
-        [0.4219],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4160, 0.3945, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:18,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 11:34:18,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.62 | bwd_microstep: 4550.26 | bwd_inner_microstep: 4545.50 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.61
-[2025-01-25 11:34:18,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.56 | bwd: 4550.29 | bwd_inner: 4545.50 | bwd_allreduce: 4.71 | step: 41.62
- 17%|█▋        | 1010/5800 [2:47:48<9:12:36,  6.92s/it]                                                       {'loss': 0.0151, 'grad_norm': 3.6952850818634033, 'learning_rate': 3.786001170696467e-05, 'epoch': 8.71}
- 17%|█▋        | 1010/5800 [2:47:48<9:12:36,  6.92s/it]score1 tensor([[0.3691],
-        [0.3867],
-        [0.3457],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4043, 0.3398, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:25,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 11:34:25,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.18 | bwd_microstep: 4642.47 | bwd_inner_microstep: 4636.54 | bwd_allreduce_microstep: 5.81 | step_microstep: 45.79
-[2025-01-25 11:34:25,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.13 | bwd: 4642.50 | bwd_inner: 4636.54 | bwd_allreduce: 5.88 | step: 45.80
- 17%|█▋        | 1011/5800 [2:47:55<9:12:55,  6.93s/it]                                                       {'loss': 0.0361, 'grad_norm': 0.8133490681648254, 'learning_rate': 3.785498264804989e-05, 'epoch': 8.72}
- 17%|█▋        | 1011/5800 [2:47:55<9:12:55,  6.93s/it]score1 tensor([[0.2461],
-        [0.5117],
-        [0.4570],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.4941, 0.4590, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:32,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 11:34:32,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.30 | bwd_microstep: 4649.42 | bwd_inner_microstep: 4644.90 | bwd_allreduce_microstep: 4.40 | step_microstep: 41.73
-[2025-01-25 11:34:32,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.26 | bwd: 4649.45 | bwd_inner: 4644.90 | bwd_allreduce: 4.48 | step: 41.74
- 17%|█▋        | 1012/5800 [2:48:02<9:12:47,  6.93s/it]                                                       {'loss': 0.0469, 'grad_norm': 3.20847487449646, 'learning_rate': 3.784994802164171e-05, 'epoch': 8.72}
- 17%|█▋        | 1012/5800 [2:48:02<9:12:47,  6.93s/it]score1 tensor([[0.4258],
-        [0.5352],
-        [0.6484],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5703, 0.6445, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:39,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 11:34:39,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.14 | bwd_microstep: 4650.08 | bwd_inner_microstep: 4645.56 | bwd_allreduce_microstep: 4.42 | step_microstep: 42.71
-[2025-01-25 11:34:39,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.11 | bwd: 4650.10 | bwd_inner: 4645.56 | bwd_allreduce: 4.47 | step: 42.74
- 17%|█▋        | 1013/5800 [2:48:09<9:13:11,  6.93s/it]                                                       {'loss': 0.0142, 'grad_norm': 3.9658362865448, 'learning_rate': 3.784490782931002e-05, 'epoch': 8.73}
- 17%|█▋        | 1013/5800 [2:48:09<9:13:11,  6.93s/it]score1 tensor([[0.5547],
-        [0.4473],
-        [0.4668],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4688, 0.4805, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:46,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 11:34:46,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.10 | bwd_microstep: 4645.86 | bwd_inner_microstep: 4640.72 | bwd_allreduce_microstep: 5.05 | step_microstep: 49.73
-[2025-01-25 11:34:46,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.06 | bwd: 4645.88 | bwd_inner: 4640.72 | bwd_allreduce: 5.09 | step: 49.73
- 17%|█▋        | 1014/5800 [2:48:16<9:13:02,  6.93s/it]                                                       {'loss': 0.04, 'grad_norm': 7.778183937072754, 'learning_rate': 3.783986207262643e-05, 'epoch': 8.74}
- 17%|█▋        | 1014/5800 [2:48:16<9:13:02,  6.93s/it]score1 tensor([[0.4902],
-        [0.4453],
-        [0.4004],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4922, 0.4336, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:34:53,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 11:34:53,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.75 | bwd_microstep: 4643.10 | bwd_inner_microstep: 4638.19 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.04
-[2025-01-25 11:34:53,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.71 | bwd: 4643.12 | bwd_inner: 4638.19 | bwd_allreduce: 4.87 | step: 42.05
- 18%|█▊        | 1015/5800 [2:48:23<9:12:43,  6.93s/it]                                                       {'loss': 0.0342, 'grad_norm': 3.6063404083251953, 'learning_rate': 3.783481075316429e-05, 'epoch': 8.75}
- 18%|█▊        | 1015/5800 [2:48:23<9:12:43,  6.93s/it]score1 tensor([[0.4902],
-        [0.6992],
-        [0.6992],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6367, 0.6211, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:00,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 11:35:00,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.62 | bwd_microstep: 4648.50 | bwd_inner_microstep: 4643.41 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.66
-[2025-01-25 11:35:00,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.57 | bwd: 4648.53 | bwd_inner: 4643.41 | bwd_allreduce: 5.05 | step: 45.67
- 18%|█▊        | 1016/5800 [2:48:30<9:12:38,  6.93s/it]                                                       {'loss': 0.0601, 'grad_norm': 9.184124946594238, 'learning_rate': 3.78297538724987e-05, 'epoch': 8.76}
- 18%|█▊        | 1016/5800 [2:48:30<9:12:38,  6.93s/it]score1 tensor([[0.5625],
-        [0.7266],
-        [0.5664],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6094, 0.4863, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:06,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 11:35:06,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.95 | bwd_microstep: 4583.00 | bwd_inner_microstep: 4578.17 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.04
-[2025-01-25 11:35:06,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.92 | bwd: 4583.03 | bwd_inner: 4578.17 | bwd_allreduce: 4.79 | step: 44.05
- 18%|█▊        | 1017/5800 [2:48:36<9:10:54,  6.91s/it]                                                       {'loss': 0.0757, 'grad_norm': 7.0145134925842285, 'learning_rate': 3.782469143220649e-05, 'epoch': 8.77}
- 18%|█▊        | 1017/5800 [2:48:36<9:10:54,  6.91s/it]score1 tensor([[0.5586],
-        [0.4883],
-        [0.6719],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.3730, 0.5781, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:13,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 11:35:13,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.71 | bwd_microstep: 4644.29 | bwd_inner_microstep: 4639.54 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.33
-[2025-01-25 11:35:13,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.67 | bwd: 4644.31 | bwd_inner: 4639.54 | bwd_allreduce: 4.71 | step: 42.33
- 18%|█▊        | 1018/5800 [2:48:43<9:11:06,  6.91s/it]                                                       {'loss': 0.0732, 'grad_norm': 8.372385025024414, 'learning_rate': 3.781962343386619e-05, 'epoch': 8.78}
- 18%|█▊        | 1018/5800 [2:48:43<9:11:06,  6.91s/it]score1 tensor([[0.5742],
-        [0.5273],
-        [0.5117],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.3809, 0.4258, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:20,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.61 | optimizer_step: 5.22
-[2025-01-25 11:35:20,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.29 | bwd_microstep: 4660.49 | bwd_inner_microstep: 4655.42 | bwd_allreduce_microstep: 4.98 | step_microstep: 72.61
-[2025-01-25 11:35:20,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.25 | bwd: 4660.51 | bwd_inner: 4655.42 | bwd_allreduce: 5.02 | step: 72.68
- 18%|█▊        | 1019/5800 [2:48:50<9:13:40,  6.95s/it]                                                       {'loss': 0.1074, 'grad_norm': 8.553028106689453, 'learning_rate': 3.781454987905812e-05, 'epoch': 8.78}
- 18%|█▊        | 1019/5800 [2:48:50<9:13:40,  6.95s/it]score1 tensor([[0.5000],
-        [0.5430],
-        [0.7148],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4961, 0.6328, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:27,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 11:35:27,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.21 | bwd_microstep: 4648.96 | bwd_inner_microstep: 4644.19 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.27
-[2025-01-25 11:35:27,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.18 | bwd: 4648.98 | bwd_inner: 4644.19 | bwd_allreduce: 4.73 | step: 42.28
- 18%|█▊        | 1020/5800 [2:48:57<9:13:19,  6.95s/it]                                                       {'loss': 0.0425, 'grad_norm': 8.35219955444336, 'learning_rate': 3.780947076936428e-05, 'epoch': 8.79}
- 18%|█▊        | 1020/5800 [2:48:57<9:13:19,  6.95s/it]score1 tensor([[0.5391],
-        [0.7227],
-        [0.5430],
-        [0.7305]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6641, 0.4727, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:34,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.45 | optimizer_step: 4.37
-[2025-01-25 11:35:34,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.88 | bwd_microstep: 4647.54 | bwd_inner_microstep: 4642.77 | bwd_allreduce_microstep: 4.69 | step_microstep: 48.64
-[2025-01-25 11:35:34,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.86 | bwd: 4647.56 | bwd_inner: 4642.77 | bwd_allreduce: 4.73 | step: 48.65
- 18%|█▊        | 1021/5800 [2:49:04<9:12:50,  6.94s/it]                                                       {'loss': 0.0498, 'grad_norm': 9.06015396118164, 'learning_rate': 3.7804386106368434e-05, 'epoch': 8.8}
- 18%|█▊        | 1021/5800 [2:49:04<9:12:50,  6.94s/it]score1 tensor([[0.4590],
-        [0.4766],
-        [0.6484],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5039, 0.6172, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:41,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 11:35:41,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2175.21 | bwd_microstep: 4644.42 | bwd_inner_microstep: 4639.23 | bwd_allreduce_microstep: 5.09 | step_microstep: 46.63
-[2025-01-25 11:35:41,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2175.17 | bwd: 4644.45 | bwd_inner: 4639.23 | bwd_allreduce: 5.13 | step: 46.64
- 18%|█▊        | 1022/5800 [2:49:11<9:12:31,  6.94s/it]                                                       {'loss': 0.0337, 'grad_norm': 4.289520263671875, 'learning_rate': 3.779929589165607e-05, 'epoch': 8.81}
- 18%|█▊        | 1022/5800 [2:49:11<9:12:31,  6.94s/it]score1 tensor([[0.4062],
-        [0.4258],
-        [0.4082],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4629, 0.3711, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:48,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.45 | optimizer_step: 4.36
-[2025-01-25 11:35:48,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.38 | bwd_microstep: 4639.27 | bwd_inner_microstep: 4634.60 | bwd_allreduce_microstep: 4.58 | step_microstep: 39.25
-[2025-01-25 11:35:48,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.35 | bwd: 4639.29 | bwd_inner: 4634.60 | bwd_allreduce: 4.63 | step: 39.26
- 18%|█▊        | 1023/5800 [2:49:18<9:11:40,  6.93s/it]                                                       {'loss': 0.0308, 'grad_norm': 0.6303421258926392, 'learning_rate': 3.7794200126814396e-05, 'epoch': 8.82}
- 18%|█▊        | 1023/5800 [2:49:18<9:11:40,  6.93s/it]score1 tensor([[0.4688],
-        [0.4258],
-        [0.4219],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5352, 0.4941, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0928, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:35:55,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 11:35:55,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.75 | bwd_microstep: 4651.07 | bwd_inner_microstep: 4645.53 | bwd_allreduce_microstep: 5.43 | step_microstep: 48.45
-[2025-01-25 11:35:55,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.72 | bwd: 4651.10 | bwd_inner: 4645.53 | bwd_allreduce: 5.49 | step: 48.46
- 18%|█▊        | 1024/5800 [2:49:25<9:11:42,  6.93s/it]                                                       {'loss': 0.0928, 'grad_norm': 7.571022987365723, 'learning_rate': 3.778909881343237e-05, 'epoch': 8.83}
- 18%|█▊        | 1024/5800 [2:49:25<9:11:42,  6.93s/it]score1 tensor([[0.4688],
-        [0.3164],
-        [0.4941],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3477, 0.6484, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0811, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:02,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 11:36:02,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.00 | bwd_microstep: 4642.21 | bwd_inner_microstep: 4637.11 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.45
-[2025-01-25 11:36:02,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.97 | bwd: 4642.23 | bwd_inner: 4637.11 | bwd_allreduce: 5.05 | step: 43.45
- 18%|█▊        | 1025/5800 [2:49:32<9:11:23,  6.93s/it]                                                       {'loss': 0.0811, 'grad_norm': 7.095206260681152, 'learning_rate': 3.778399195310067e-05, 'epoch': 8.84}
- 18%|█▊        | 1025/5800 [2:49:32<9:11:23,  6.93s/it]score1 tensor([[0.3516],
-        [0.4609],
-        [0.3828],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5273, 0.5195, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0894, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:09,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 11:36:09,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.02 | bwd_microstep: 4644.02 | bwd_inner_microstep: 4639.14 | bwd_allreduce_microstep: 4.79 | step_microstep: 49.69
-[2025-01-25 11:36:09,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.98 | bwd: 4644.04 | bwd_inner: 4639.14 | bwd_allreduce: 4.84 | step: 49.70
- 18%|█▊        | 1026/5800 [2:49:39<9:11:27,  6.93s/it]                                                       {'loss': 0.0894, 'grad_norm': 6.625424861907959, 'learning_rate': 3.777887954741169e-05, 'epoch': 8.84}
- 18%|█▊        | 1026/5800 [2:49:39<9:11:27,  6.93s/it]score1 tensor([[0.4824],
-        [0.4688],
-        [0.4512],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6055, 0.5156, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:16,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.20 | optimizer_step: 4.36
-[2025-01-25 11:36:16,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.33 | bwd_microstep: 4648.12 | bwd_inner_microstep: 4642.27 | bwd_allreduce_microstep: 5.73 | step_microstep: 56.01
-[2025-01-25 11:36:16,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.30 | bwd: 4648.14 | bwd_inner: 4642.27 | bwd_allreduce: 5.81 | step: 56.02
- 18%|█▊        | 1027/5800 [2:49:46<9:11:48,  6.94s/it]                                                       {'loss': 0.0957, 'grad_norm': 7.528873920440674, 'learning_rate': 3.777376159795959e-05, 'epoch': 8.85}
- 18%|█▊        | 1027/5800 [2:49:46<9:11:48,  6.94s/it]score1 tensor([[0.3555],
-        [0.4824],
-        [0.4824],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5195, 0.5391, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:23,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 11:36:23,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.71 | bwd_microstep: 4649.17 | bwd_inner_microstep: 4643.77 | bwd_allreduce_microstep: 5.31 | step_microstep: 72.94
-[2025-01-25 11:36:23,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.67 | bwd: 4649.20 | bwd_inner: 4643.77 | bwd_allreduce: 5.36 | step: 72.95
- 18%|█▊        | 1028/5800 [2:49:53<9:13:12,  6.96s/it]                                                       {'loss': 0.0645, 'grad_norm': 7.353283882141113, 'learning_rate': 3.776863810634021e-05, 'epoch': 8.86}
- 18%|█▊        | 1028/5800 [2:49:53<9:13:12,  6.96s/it]score1 tensor([[0.5430],
-        [0.4434],
-        [0.4258],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4883, 0.4922, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:30,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.75 | optimizer_step: 4.37
-[2025-01-25 11:36:30,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.30 | bwd_microstep: 4650.15 | bwd_inner_microstep: 4644.77 | bwd_allreduce_microstep: 5.23 | step_microstep: 63.81
-[2025-01-25 11:36:30,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.26 | bwd: 4650.19 | bwd_inner: 4644.77 | bwd_allreduce: 5.30 | step: 63.81
- 18%|█▊        | 1029/5800 [2:50:00<9:14:23,  6.97s/it]                                                       {'loss': 0.0518, 'grad_norm': 3.93507981300354, 'learning_rate': 3.7763509074151165e-05, 'epoch': 8.87}
- 18%|█▊        | 1029/5800 [2:50:00<9:14:23,  6.97s/it]score1 tensor([[0.4004],
-        [0.5312],
-        [0.3984],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5117, 0.3730, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:37,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.37
-[2025-01-25 11:36:37,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.35 | bwd_microstep: 4644.03 | bwd_inner_microstep: 4636.16 | bwd_allreduce_microstep: 7.69 | step_microstep: 68.58
-[2025-01-25 11:36:37,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.25 | bwd: 4644.10 | bwd_inner: 4636.16 | bwd_allreduce: 7.77 | step: 68.55
- 18%|█▊        | 1030/5800 [2:50:07<9:14:19,  6.97s/it]                                                       {'loss': 0.0498, 'grad_norm': 4.534162521362305, 'learning_rate': 3.775837450299176e-05, 'epoch': 8.88}
- 18%|█▊        | 1030/5800 [2:50:07<9:14:19,  6.97s/it]score1 tensor([[0.4785],
-        [0.6367],
-        [0.4453],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6094, 0.4395, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:44,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 11:36:44,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.42 | bwd_microstep: 4651.42 | bwd_inner_microstep: 4645.98 | bwd_allreduce_microstep: 5.33 | step_microstep: 46.22
-[2025-01-25 11:36:44,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.38 | bwd: 4651.45 | bwd_inner: 4645.98 | bwd_allreduce: 5.38 | step: 46.23
- 18%|█▊        | 1031/5800 [2:50:14<9:13:35,  6.96s/it]                                                       {'loss': 0.0312, 'grad_norm': 8.225068092346191, 'learning_rate': 3.7753234394463057e-05, 'epoch': 8.89}
- 18%|█▊        | 1031/5800 [2:50:14<9:13:35,  6.96s/it]score1 tensor([[0.4746],
-        [0.5078],
-        [0.6328],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4707, 0.5781, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:51,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 11:36:51,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.08 | bwd_microstep: 4646.93 | bwd_inner_microstep: 4642.10 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.62
-[2025-01-25 11:36:51,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.05 | bwd: 4646.95 | bwd_inner: 4642.10 | bwd_allreduce: 4.79 | step: 41.63
- 18%|█▊        | 1032/5800 [2:50:21<9:12:31,  6.95s/it]                                                       {'loss': 0.0405, 'grad_norm': 4.455028533935547, 'learning_rate': 3.7748088750167824e-05, 'epoch': 8.9}
- 18%|█▊        | 1032/5800 [2:50:21<9:12:31,  6.95s/it]score1 tensor([[0.4727],
-        [0.5781],
-        [0.5547],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5273, 0.4727, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:36:58,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.36
-[2025-01-25 11:36:58,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.40 | bwd_microstep: 4641.33 | bwd_inner_microstep: 4636.42 | bwd_allreduce_microstep: 4.82 | step_microstep: 45.51
-[2025-01-25 11:36:58,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.35 | bwd: 4641.35 | bwd_inner: 4636.42 | bwd_allreduce: 4.86 | step: 45.52
- 18%|█▊        | 1033/5800 [2:50:28<9:11:42,  6.94s/it]                                                       {'loss': 0.0557, 'grad_norm': 8.514729499816895, 'learning_rate': 3.7742937571710555e-05, 'epoch': 8.91}
- 18%|█▊        | 1033/5800 [2:50:28<9:11:42,  6.94s/it]score1 tensor([[0.5625],
-        [0.4434],
-        [0.6289],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4688, 0.5625, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:04,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 11:37:05,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.24 | bwd_microstep: 4640.41 | bwd_inner_microstep: 4635.17 | bwd_allreduce_microstep: 5.14 | step_microstep: 43.53
-[2025-01-25 11:37:05,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.20 | bwd: 4640.44 | bwd_inner: 4635.16 | bwd_allreduce: 5.20 | step: 43.53
- 18%|█▊        | 1034/5800 [2:50:34<9:11:10,  6.94s/it]                                                       {'loss': 0.0444, 'grad_norm': 5.529080390930176, 'learning_rate': 3.773778086069749e-05, 'epoch': 8.91}
- 18%|█▊        | 1034/5800 [2:50:34<9:11:10,  6.94s/it]score1 tensor([[0.4707],
-        [0.3984],
-        [0.4922],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3516, 0.4863, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:11,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 11:37:11,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.97 | bwd_microstep: 4645.63 | bwd_inner_microstep: 4640.39 | bwd_allreduce_microstep: 5.13 | step_microstep: 46.00
-[2025-01-25 11:37:11,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.94 | bwd: 4645.66 | bwd_inner: 4640.39 | bwd_allreduce: 5.19 | step: 46.01
- 18%|█▊        | 1035/5800 [2:50:41<9:11:03,  6.94s/it]                                                       {'loss': 0.0288, 'grad_norm': 7.227757453918457, 'learning_rate': 3.7732618618736576e-05, 'epoch': 8.92}
- 18%|█▊        | 1035/5800 [2:50:41<9:11:03,  6.94s/it]score1 tensor([[0.4609],
-        [0.5664],
-        [0.4961],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.6133, 0.6172, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:18,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.37
-[2025-01-25 11:37:18,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.93 | bwd_microstep: 4648.36 | bwd_inner_microstep: 4643.37 | bwd_allreduce_microstep: 4.89 | step_microstep: 54.03
-[2025-01-25 11:37:18,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.90 | bwd: 4648.39 | bwd_inner: 4643.37 | bwd_allreduce: 4.94 | step: 54.07
- 18%|█▊        | 1036/5800 [2:50:48<9:11:14,  6.94s/it]                                                       {'loss': 0.061, 'grad_norm': 2.1852900981903076, 'learning_rate': 3.772745084743749e-05, 'epoch': 8.93}
- 18%|█▊        | 1036/5800 [2:50:48<9:11:14,  6.94s/it]score1 tensor([[0.4785],
-        [0.4492],
-        [0.4258],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5273, 0.4453, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:25,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.37 | optimizer_step: 4.63
-[2025-01-25 11:37:25,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.66 | bwd_microstep: 4650.67 | bwd_inner_microstep: 4642.94 | bwd_allreduce_microstep: 7.54 | step_microstep: 95.34
-[2025-01-25 11:37:25,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.62 | bwd: 4650.72 | bwd_inner: 4642.94 | bwd_allreduce: 7.65 | step: 95.36
- 18%|█▊        | 1037/5800 [2:50:55<9:13:02,  6.97s/it]                                                       {'loss': 0.0703, 'grad_norm': 7.3181867599487305, 'learning_rate': 3.7722277548411624e-05, 'epoch': 8.94}
- 18%|█▊        | 1037/5800 [2:50:55<9:13:02,  6.97s/it]score1 tensor([[0.3926],
-        [0.4727],
-        [0.4688],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5391, 0.5391, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:32,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 11:37:32,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.43 | bwd_microstep: 4650.93 | bwd_inner_microstep: 4645.80 | bwd_allreduce_microstep: 5.02 | step_microstep: 47.01
-[2025-01-25 11:37:32,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.40 | bwd: 4650.95 | bwd_inner: 4645.80 | bwd_allreduce: 5.08 | step: 47.02
- 18%|█▊        | 1038/5800 [2:51:02<9:12:34,  6.96s/it]                                                       {'loss': 0.0542, 'grad_norm': 4.974468231201172, 'learning_rate': 3.7717098723272116e-05, 'epoch': 8.95}
- 18%|█▊        | 1038/5800 [2:51:02<9:12:34,  6.96s/it]score1 tensor([[0.5508],
-        [0.4395],
-        [0.3594],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4941, 0.3340, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:39,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 11:37:39,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.74 | bwd_microstep: 4648.48 | bwd_inner_microstep: 4640.67 | bwd_allreduce_microstep: 7.62 | step_microstep: 72.73
-[2025-01-25 11:37:39,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.70 | bwd: 4648.54 | bwd_inner: 4640.67 | bwd_allreduce: 7.73 | step: 72.71
- 18%|█▊        | 1039/5800 [2:51:09<9:12:24,  6.96s/it]                                                       {'loss': 0.0522, 'grad_norm': 4.58716344833374, 'learning_rate': 3.771191437363381e-05, 'epoch': 8.96}
- 18%|█▊        | 1039/5800 [2:51:09<9:12:24,  6.96s/it]score1 tensor([[0.4258],
-        [0.5078],
-        [0.5664],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5469, 0.6836, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:46,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:37:46,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.84 | bwd_microstep: 4641.15 | bwd_inner_microstep: 4635.91 | bwd_allreduce_microstep: 5.13 | step_microstep: 44.45
-[2025-01-25 11:37:46,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.80 | bwd: 4641.17 | bwd_inner: 4635.91 | bwd_allreduce: 5.18 | step: 44.46
- 18%|█▊        | 1040/5800 [2:51:16<9:11:41,  6.95s/it]                                                       {'loss': 0.0605, 'grad_norm': 8.323959350585938, 'learning_rate': 3.7706724501113265e-05, 'epoch': 8.97}
- 18%|█▊        | 1040/5800 [2:51:16<9:11:41,  6.95s/it]score1 tensor([[0.4961],
-        [0.3594],
-        [0.6211],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3105, 0.6289, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:37:53,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.37
-[2025-01-25 11:37:53,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.88 | bwd_microstep: 4640.89 | bwd_inner_microstep: 4635.87 | bwd_allreduce_microstep: 4.92 | step_microstep: 45.44
-[2025-01-25 11:37:53,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.84 | bwd: 4640.91 | bwd_inner: 4635.87 | bwd_allreduce: 4.97 | step: 45.45
- 18%|█▊        | 1041/5800 [2:51:23<9:10:50,  6.94s/it]                                                       {'loss': 0.0562, 'grad_norm': 5.049170970916748, 'learning_rate': 3.770152910732879e-05, 'epoch': 8.97}
- 18%|█▊        | 1041/5800 [2:51:23<9:10:50,  6.94s/it]score1 tensor([[0.5352],
-        [0.5586],
-        [0.5352],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6211, 0.5664, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:38:00,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 11:38:00,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.08 | bwd_microstep: 4642.57 | bwd_inner_microstep: 4637.63 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.10
-[2025-01-25 11:38:00,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.04 | bwd: 4642.59 | bwd_inner: 4637.63 | bwd_allreduce: 4.89 | step: 44.10
- 18%|█▊        | 1042/5800 [2:51:30<9:10:10,  6.94s/it]                                                       {'loss': 0.0391, 'grad_norm': 0.5807949900627136, 'learning_rate': 3.769632819390039e-05, 'epoch': 8.98}
- 18%|█▊        | 1042/5800 [2:51:30<9:10:10,  6.94s/it]score1 tensor([[0.5977],
-        [0.5820],
-        [0.3633],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5781, 0.4512, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0718, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:38:07,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 11:38:07,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.83 | bwd_microstep: 4642.20 | bwd_inner_microstep: 4637.05 | bwd_allreduce_microstep: 5.06 | step_microstep: 42.38
-[2025-01-25 11:38:07,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.80 | bwd: 4642.22 | bwd_inner: 4637.05 | bwd_allreduce: 5.10 | step: 42.39
- 18%|█▊        | 1043/5800 [2:51:37<9:09:37,  6.93s/it]                                                       {'loss': 0.0718, 'grad_norm': 4.648168563842773, 'learning_rate': 3.7691121762449795e-05, 'epoch': 8.99}
- 18%|█▊        | 1043/5800 [2:51:37<9:09:37,  6.93s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:38:12,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.42 | optimizer_step: 4.36
-[2025-01-25 11:38:12,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 574.18 | bwd_microstep: 1223.19 | bwd_inner_microstep: 1218.83 | bwd_allreduce_microstep: 4.26 | step_microstep: 43.08
-[2025-01-25 11:38:12,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 574.13 | bwd: 1223.21 | bwd_inner: 1218.83 | bwd_allreduce: 4.31 | step: 43.10
- 18%|█▊        | 1044/5800 [2:51:42<8:28:12,  6.41s/it]                                                       {'loss': 0.0762, 'grad_norm': 7.849636554718018, 'learning_rate': 3.768590981460047e-05, 'epoch': 9.0}
- 18%|█▊        | 1044/5800 [2:51:42<8:28:12,  6.41s/it][2025-01-25 11:38:17,519] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 11:38:28,189] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 11:38:40,530] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 11:38:51,292] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5625],
-        [0.5781],
-        [0.5156],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4180, 0.5195, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0713, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:06,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 11:39:06,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.81 | bwd_microstep: 4609.68 | bwd_inner_microstep: 4603.64 | bwd_allreduce_microstep: 5.89 | step_microstep: 52.02
-[2025-01-25 11:39:06,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.73 | bwd: 4609.71 | bwd_inner: 4603.64 | bwd_allreduce: 5.98 | step: 52.03
- 18%|█▊        | 1045/5800 [2:52:36<27:14:29, 20.62s/it]                                                        {'loss': 0.0713, 'grad_norm': 4.3601975440979, 'learning_rate': 3.7680692351977594e-05, 'epoch': 9.01}
- 18%|█▊        | 1045/5800 [2:52:36<27:14:29, 20.62s/it]score1 tensor([[0.5273],
-        [0.7031],
-        [0.5117],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6094, 0.4082, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:13,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 11:39:13,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.04 | bwd_microstep: 4585.38 | bwd_inner_microstep: 4578.24 | bwd_allreduce_microstep: 6.96 | step_microstep: 52.49
-[2025-01-25 11:39:13,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.01 | bwd: 4585.45 | bwd_inner: 4578.24 | bwd_allreduce: 7.03 | step: 52.47
- 18%|█▊        | 1046/5800 [2:52:43<21:47:23, 16.50s/it]                                                        {'loss': 0.0918, 'grad_norm': 8.483760833740234, 'learning_rate': 3.7675469376208054e-05, 'epoch': 9.02}
- 18%|█▊        | 1046/5800 [2:52:43<21:47:23, 16.50s/it]score1 tensor([[0.5938],
-        [0.5391],
-        [0.6328],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5117, 0.5508, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:20,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.66 | optimizer_step: 4.42
-[2025-01-25 11:39:20,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.34 | bwd_microstep: 4600.59 | bwd_inner_microstep: 4594.73 | bwd_allreduce_microstep: 5.74 | step_microstep: 60.32
-[2025-01-25 11:39:20,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.30 | bwd: 4600.62 | bwd_inner: 4594.73 | bwd_allreduce: 5.80 | step: 60.32
- 18%|█▊        | 1047/5800 [2:52:50<17:58:34, 13.62s/it]                                                        {'loss': 0.0557, 'grad_norm': 8.000895500183105, 'learning_rate': 3.767024088892046e-05, 'epoch': 9.03}
- 18%|█▊        | 1047/5800 [2:52:50<17:58:34, 13.62s/it]score1 tensor([[0.4961],
-        [0.4082],
-        [0.4004],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4141, 0.3789, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:27,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 11:39:27,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.98 | bwd_microstep: 4615.33 | bwd_inner_microstep: 4605.12 | bwd_allreduce_microstep: 9.99 | step_microstep: 52.56
-[2025-01-25 11:39:27,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.95 | bwd: 4615.40 | bwd_inner: 4605.12 | bwd_allreduce: 10.11 | step: 52.47
- 18%|█▊        | 1048/5800 [2:52:57<15:18:54, 11.60s/it]                                                        {'loss': 0.0161, 'grad_norm': 0.723540723323822, 'learning_rate': 3.7665006891745156e-05, 'epoch': 9.03}
- 18%|█▊        | 1048/5800 [2:52:57<15:18:54, 11.60s/it]score1 tensor([[0.4414],
-        [0.6367],
-        [0.4824],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6445, 0.5195, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:34,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 11:39:34,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.30 | bwd_microstep: 4610.48 | bwd_inner_microstep: 4605.35 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.23
-[2025-01-25 11:39:34,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.20 | bwd: 4610.50 | bwd_inner: 4605.35 | bwd_allreduce: 5.07 | step: 43.24
- 18%|█▊        | 1049/5800 [2:53:04<13:26:38, 10.19s/it]                                                        {'loss': 0.0552, 'grad_norm': 7.664623260498047, 'learning_rate': 3.765976738631419e-05, 'epoch': 9.04}
- 18%|█▊        | 1049/5800 [2:53:04<13:26:38, 10.19s/it]score1 tensor([[0.4453],
-        [0.5156],
-        [0.3984],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5391, 0.4141, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:40,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 11:39:40,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.37 | bwd_microstep: 4616.04 | bwd_inner_microstep: 4611.04 | bwd_allreduce_microstep: 4.88 | step_microstep: 46.91
-[2025-01-25 11:39:40,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.33 | bwd: 4616.07 | bwd_inner: 4611.04 | bwd_allreduce: 4.94 | step: 46.92
- 18%|█▊        | 1050/5800 [2:53:10<12:08:07,  9.20s/it]                                                        {'loss': 0.042, 'grad_norm': 6.87738561630249, 'learning_rate': 3.765452237426133e-05, 'epoch': 9.05}
- 18%|█▊        | 1050/5800 [2:53:10<12:08:07,  9.20s/it]score1 tensor([[0.4570],
-        [0.6406],
-        [0.5352],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5352, 0.5391, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:47,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 11:39:47,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.30 | bwd_microstep: 4607.67 | bwd_inner_microstep: 4602.77 | bwd_allreduce_microstep: 4.81 | step_microstep: 46.36
-[2025-01-25 11:39:47,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.27 | bwd: 4607.69 | bwd_inner: 4602.77 | bwd_allreduce: 4.85 | step: 46.37
- 18%|█▊        | 1051/5800 [2:53:17<11:13:05,  8.50s/it]                                                        {'loss': 0.0518, 'grad_norm': 3.263223648071289, 'learning_rate': 3.764927185722205e-05, 'epoch': 9.06}
- 18%|█▊        | 1051/5800 [2:53:17<11:13:05,  8.50s/it]score1 tensor([[0.3945],
-        [0.4473],
-        [0.4277],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4551, 0.4395, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:39:54,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 11:39:54,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.92 | bwd_microstep: 4615.15 | bwd_inner_microstep: 4609.42 | bwd_allreduce_microstep: 5.60 | step_microstep: 49.15
-[2025-01-25 11:39:54,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.84 | bwd: 4615.17 | bwd_inner: 4609.42 | bwd_allreduce: 5.67 | step: 49.16
- 18%|█▊        | 1052/5800 [2:53:24<10:34:39,  8.02s/it]                                                        {'loss': 0.0205, 'grad_norm': 6.923631191253662, 'learning_rate': 3.7644015836833575e-05, 'epoch': 9.07}
- 18%|█▊        | 1052/5800 [2:53:24<10:34:39,  8.02s/it]score1 tensor([[0.5703],
-        [0.4336],
-        [0.3770],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5039, 0.3984, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:01,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 11:40:01,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.01 | bwd_microstep: 4610.01 | bwd_inner_microstep: 4605.15 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.37
-[2025-01-25 11:40:01,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.96 | bwd: 4610.03 | bwd_inner: 4605.16 | bwd_allreduce: 4.81 | step: 43.38
- 18%|█▊        | 1053/5800 [2:53:31<10:07:23,  7.68s/it]                                                        {'loss': 0.0464, 'grad_norm': 7.083672046661377, 'learning_rate': 3.763875431473481e-05, 'epoch': 9.08}
- 18%|█▊        | 1053/5800 [2:53:31<10:07:23,  7.68s/it]score1 tensor([[0.4785],
-        [0.4883],
-        [0.4395],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4473, 0.4785, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:08,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 11:40:08,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.83 | bwd_microstep: 4611.20 | bwd_inner_microstep: 4605.70 | bwd_allreduce_microstep: 5.39 | step_microstep: 47.89
-[2025-01-25 11:40:08,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.80 | bwd: 4611.22 | bwd_inner: 4605.70 | bwd_allreduce: 5.45 | step: 47.89
- 18%|█▊        | 1054/5800 [2:53:38<9:48:24,  7.44s/it]                                                        {'loss': 0.0371, 'grad_norm': 3.373934507369995, 'learning_rate': 3.763348729256639e-05, 'epoch': 9.09}
- 18%|█▊        | 1054/5800 [2:53:38<9:48:24,  7.44s/it]score1 tensor([[0.5156],
-        [0.4844],
-        [0.6719],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4961, 0.6484, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:15,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 11:40:15,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.03 | bwd_microstep: 4616.16 | bwd_inner_microstep: 4610.46 | bwd_allreduce_microstep: 5.51 | step_microstep: 52.94
-[2025-01-25 11:40:15,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.96 | bwd: 4616.21 | bwd_inner: 4610.46 | bwd_allreduce: 5.61 | step: 52.94
- 18%|█▊        | 1055/5800 [2:53:45<9:36:06,  7.28s/it]                                                       {'loss': 0.0474, 'grad_norm': 4.122019290924072, 'learning_rate': 3.762821477197066e-05, 'epoch': 9.09}
- 18%|█▊        | 1055/5800 [2:53:45<9:36:06,  7.28s/it]score1 tensor([[0.6055],
-        [0.5781],
-        [0.6445],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5781, 0.5664, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:22,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.40 | optimizer_step: 4.36
-[2025-01-25 11:40:22,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.75 | bwd_microstep: 4572.98 | bwd_inner_microstep: 4564.07 | bwd_allreduce_microstep: 8.70 | step_microstep: 73.66
-[2025-01-25 11:40:22,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.72 | bwd: 4573.04 | bwd_inner: 4564.07 | bwd_allreduce: 8.81 | step: 73.71
- 18%|█▊        | 1056/5800 [2:53:52<9:27:19,  7.18s/it]                                                       {'loss': 0.0371, 'grad_norm': 2.370997428894043, 'learning_rate': 3.7622936754591695e-05, 'epoch': 9.1}
- 18%|█▊        | 1056/5800 [2:53:52<9:27:19,  7.18s/it]score1 tensor([[0.5977],
-        [0.5625],
-        [0.4414],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.6016, 0.4219, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:29,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 11:40:29,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.13 | bwd_microstep: 4630.60 | bwd_inner_microstep: 4624.17 | bwd_allreduce_microstep: 6.27 | step_microstep: 48.22
-[2025-01-25 11:40:29,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.00 | bwd: 4630.63 | bwd_inner: 4624.17 | bwd_allreduce: 6.36 | step: 48.23
- 18%|█▊        | 1057/5800 [2:53:59<9:21:20,  7.10s/it]                                                       {'loss': 0.0508, 'grad_norm': 3.6526267528533936, 'learning_rate': 3.761765324207526e-05, 'epoch': 9.11}
- 18%|█▊        | 1057/5800 [2:53:59<9:21:20,  7.10s/it]score1 tensor([[0.4883],
-        [0.6406],
-        [0.5781],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5664, 0.5625, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:36,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 11:40:36,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.52 | bwd_microstep: 4616.33 | bwd_inner_microstep: 4611.08 | bwd_allreduce_microstep: 5.13 | step_microstep: 55.40
-[2025-01-25 11:40:36,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.46 | bwd: 4616.36 | bwd_inner: 4611.08 | bwd_allreduce: 5.19 | step: 55.44
- 18%|█▊        | 1058/5800 [2:54:06<9:17:07,  7.05s/it]                                                       {'loss': 0.0459, 'grad_norm': 7.898348331451416, 'learning_rate': 3.7612364236068856e-05, 'epoch': 9.12}
- 18%|█▊        | 1058/5800 [2:54:06<9:17:07,  7.05s/it]score1 tensor([[0.5820],
-        [0.5859],
-        [0.5664],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.6406, 0.5273, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0640, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:43,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 11:40:43,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.86 | bwd_microstep: 4621.80 | bwd_inner_microstep: 4616.69 | bwd_allreduce_microstep: 4.96 | step_microstep: 47.75
-[2025-01-25 11:40:43,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.83 | bwd: 4621.82 | bwd_inner: 4616.69 | bwd_allreduce: 5.05 | step: 47.76
- 18%|█▊        | 1059/5800 [2:54:13<9:13:42,  7.01s/it]                                                       {'loss': 0.064, 'grad_norm': 0.5690949559211731, 'learning_rate': 3.760706973822168e-05, 'epoch': 9.13}
- 18%|█▊        | 1059/5800 [2:54:13<9:13:42,  7.01s/it]score1 tensor([[0.4785],
-        [0.4629],
-        [0.5391],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3750, 0.4473, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:49,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 11:40:49,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.18 | bwd_microstep: 4617.77 | bwd_inner_microstep: 4612.88 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.80
-[2025-01-25 11:40:49,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.15 | bwd: 4617.79 | bwd_inner: 4612.88 | bwd_allreduce: 4.85 | step: 43.83
- 18%|█▊        | 1060/5800 [2:54:19<9:10:57,  6.97s/it]                                                       {'loss': 0.0737, 'grad_norm': 7.2570109367370605, 'learning_rate': 3.760176975018465e-05, 'epoch': 9.14}
- 18%|█▊        | 1060/5800 [2:54:19<9:10:57,  6.97s/it]score1 tensor([[0.4688],
-        [0.5898],
-        [0.3906],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5742, 0.3555, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:40:56,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 11:40:56,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.92 | bwd_microstep: 4565.99 | bwd_inner_microstep: 4561.21 | bwd_allreduce_microstep: 4.64 | step_microstep: 46.25
-[2025-01-25 11:40:56,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.89 | bwd: 4566.01 | bwd_inner: 4561.21 | bwd_allreduce: 4.71 | step: 46.26
- 18%|█▊        | 1061/5800 [2:54:26<9:07:58,  6.94s/it]                                                       {'loss': 0.0186, 'grad_norm': 5.692249774932861, 'learning_rate': 3.759646427361039e-05, 'epoch': 9.15}
- 18%|█▊        | 1061/5800 [2:54:26<9:07:58,  6.94s/it]score1 tensor([[0.4414],
-        [0.6016],
-        [0.5430],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6836, 0.5430, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:03,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 11:41:03,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.98 | bwd_microstep: 4580.27 | bwd_inner_microstep: 4575.41 | bwd_allreduce_microstep: 4.79 | step_microstep: 46.05
-[2025-01-25 11:41:03,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.95 | bwd: 4580.30 | bwd_inner: 4575.41 | bwd_allreduce: 4.82 | step: 46.06
- 18%|█▊        | 1062/5800 [2:54:33<9:06:00,  6.91s/it]                                                       {'loss': 0.042, 'grad_norm': 1.8077683448791504, 'learning_rate': 3.7591153310153255e-05, 'epoch': 9.16}
- 18%|█▊        | 1062/5800 [2:54:33<9:06:00,  6.91s/it]score1 tensor([[0.3867],
-        [0.4902],
-        [0.4238],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.5195, 0.4844, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:10,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 11:41:10,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.79 | bwd_microstep: 4625.40 | bwd_inner_microstep: 4620.11 | bwd_allreduce_microstep: 5.18 | step_microstep: 47.15
-[2025-01-25 11:41:10,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.76 | bwd: 4625.42 | bwd_inner: 4620.11 | bwd_allreduce: 5.22 | step: 47.15
- 18%|█▊        | 1063/5800 [2:54:40<9:05:55,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.640005111694336, 'learning_rate': 3.7585836861469276e-05, 'epoch': 9.16}
- 18%|█▊        | 1063/5800 [2:54:40<9:05:55,  6.91s/it]score1 tensor([[0.4590],
-        [0.3750],
-        [0.3535],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.3477, 0.3105, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:17,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 11:41:17,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.79 | bwd_microstep: 4623.99 | bwd_inner_microstep: 4618.35 | bwd_allreduce_microstep: 5.55 | step_microstep: 63.22
-[2025-01-25 11:41:17,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.75 | bwd: 4624.01 | bwd_inner: 4618.35 | bwd_allreduce: 5.59 | step: 63.22
- 18%|█▊        | 1064/5800 [2:54:47<9:06:13,  6.92s/it]                                                       {'loss': 0.0474, 'grad_norm': 1.065568208694458, 'learning_rate': 3.758051492921622e-05, 'epoch': 9.17}
- 18%|█▊        | 1064/5800 [2:54:47<9:06:13,  6.92s/it]score1 tensor([[0.4707],
-        [0.5039],
-        [0.4219],
-        [0.3438]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5273, 0.4766, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:24,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.46 | optimizer_step: 4.43
-[2025-01-25 11:41:24,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4630.67 | bwd_inner_microstep: 4622.91 | bwd_allreduce_microstep: 7.56 | step_microstep: 80.96
-[2025-01-25 11:41:24,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4630.72 | bwd_inner: 4622.91 | bwd_allreduce: 7.68 | step: 80.99
- 18%|█▊        | 1065/5800 [2:54:54<9:07:08,  6.93s/it]                                                       {'loss': 0.0405, 'grad_norm': 6.685210704803467, 'learning_rate': 3.757518751505357e-05, 'epoch': 9.18}
- 18%|█▊        | 1065/5800 [2:54:54<9:07:08,  6.93s/it]score1 tensor([[0.5703],
-        [0.5156],
-        [0.3965],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5430, 0.4473, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:31,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 11:41:31,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.40 | bwd_microstep: 4625.38 | bwd_inner_microstep: 4615.55 | bwd_allreduce_microstep: 9.59 | step_microstep: 62.14
-[2025-01-25 11:41:31,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.35 | bwd: 4625.45 | bwd_inner: 4615.55 | bwd_allreduce: 9.72 | step: 62.11
- 18%|█▊        | 1066/5800 [2:55:01<9:07:24,  6.94s/it]                                                       {'loss': 0.0376, 'grad_norm': 3.873584032058716, 'learning_rate': 3.756985462064249e-05, 'epoch': 9.19}
- 18%|█▊        | 1066/5800 [2:55:01<9:07:24,  6.94s/it]score1 tensor([[0.5391],
-        [0.4922],
-        [0.5781],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4766, 0.5664, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:38,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 11:41:38,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.79 | bwd_microstep: 4618.50 | bwd_inner_microstep: 4613.26 | bwd_allreduce_microstep: 5.13 | step_microstep: 48.52
-[2025-01-25 11:41:38,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.75 | bwd: 4618.53 | bwd_inner: 4613.26 | bwd_allreduce: 5.19 | step: 48.52
- 18%|█▊        | 1067/5800 [2:55:08<9:06:49,  6.93s/it]                                                       {'loss': 0.0181, 'grad_norm': 3.568702220916748, 'learning_rate': 3.7564516247645884e-05, 'epoch': 9.2}
- 18%|█▊        | 1067/5800 [2:55:08<9:06:49,  6.93s/it]score1 tensor([[0.4102],
-        [0.4785],
-        [0.5859],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.3809, 0.5625, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:45,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 11:41:45,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4619.26 | bwd_inner_microstep: 4614.38 | bwd_allreduce_microstep: 4.78 | step_microstep: 46.76
-[2025-01-25 11:41:45,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.98 | bwd: 4619.29 | bwd_inner: 4614.39 | bwd_allreduce: 4.83 | step: 46.76
- 18%|█▊        | 1068/5800 [2:55:15<9:06:08,  6.92s/it]                                                       {'loss': 0.0356, 'grad_norm': 7.446200847625732, 'learning_rate': 3.755917239772833e-05, 'epoch': 9.21}
- 18%|█▊        | 1068/5800 [2:55:15<9:06:08,  6.92s/it]score1 tensor([[0.5195],
-        [0.5195],
-        [0.5000],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4863, 0.4941, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:52,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 11:41:52,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.71 | bwd_microstep: 4627.98 | bwd_inner_microstep: 4622.85 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.58
-[2025-01-25 11:41:52,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.68 | bwd: 4628.03 | bwd_inner: 4622.85 | bwd_allreduce: 5.07 | step: 44.58
- 18%|█▊        | 1069/5800 [2:55:22<9:05:41,  6.92s/it]                                                       {'loss': 0.0337, 'grad_norm': 7.439280033111572, 'learning_rate': 3.7553823072556154e-05, 'epoch': 9.22}
- 18%|█▊        | 1069/5800 [2:55:22<9:05:41,  6.92s/it]score1 tensor([[0.4375],
-        [0.6758],
-        [0.4824],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.6445, 0.5039, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:41:59,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 11:41:59,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.01 | bwd_microstep: 4635.14 | bwd_inner_microstep: 4629.61 | bwd_allreduce_microstep: 5.42 | step_microstep: 47.55
-[2025-01-25 11:41:59,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.98 | bwd: 4635.17 | bwd_inner: 4629.61 | bwd_allreduce: 5.48 | step: 47.55
- 18%|█▊        | 1070/5800 [2:55:29<9:05:37,  6.92s/it]                                                       {'loss': 0.0391, 'grad_norm': 3.5850396156311035, 'learning_rate': 3.7548468273797356e-05, 'epoch': 9.22}
- 18%|█▊        | 1070/5800 [2:55:29<9:05:37,  6.92s/it]score1 tensor([[0.5664],
-        [0.5156],
-        [0.2500],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5000, 0.3516, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:06,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:42:06,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.43 | bwd_microstep: 4622.90 | bwd_inner_microstep: 4617.52 | bwd_allreduce_microstep: 5.23 | step_microstep: 44.88
-[2025-01-25 11:42:06,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.39 | bwd: 4622.92 | bwd_inner: 4617.52 | bwd_allreduce: 5.31 | step: 44.88
- 18%|█▊        | 1071/5800 [2:55:36<9:05:07,  6.92s/it]                                                       {'loss': 0.0518, 'grad_norm': 1.0395268201828003, 'learning_rate': 3.7543108003121653e-05, 'epoch': 9.23}
- 18%|█▊        | 1071/5800 [2:55:36<9:05:07,  6.92s/it]score1 tensor([[0.4609],
-        [0.4688],
-        [0.3477],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4590, 0.3750, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:12,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 11:42:12,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.01 | bwd_microstep: 4631.94 | bwd_inner_microstep: 4626.94 | bwd_allreduce_microstep: 4.90 | step_microstep: 49.73
-[2025-01-25 11:42:12,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.97 | bwd: 4631.96 | bwd_inner: 4626.94 | bwd_allreduce: 4.95 | step: 49.74
- 18%|█▊        | 1072/5800 [2:55:42<9:05:31,  6.92s/it]                                                       {'loss': 0.0483, 'grad_norm': 3.5705771446228027, 'learning_rate': 3.753774226220047e-05, 'epoch': 9.24}
- 18%|█▊        | 1072/5800 [2:55:42<9:05:31,  6.92s/it]score1 tensor([[0.5273],
-        [0.3809],
-        [0.4883],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.3438, 0.4941, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:19,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.64
-[2025-01-25 11:42:19,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.23 | bwd_microstep: 4625.12 | bwd_inner_microstep: 4619.92 | bwd_allreduce_microstep: 5.11 | step_microstep: 57.17
-[2025-01-25 11:42:19,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.21 | bwd: 4625.14 | bwd_inner: 4619.92 | bwd_allreduce: 5.15 | step: 57.17
- 18%|█▊        | 1073/5800 [2:55:49<9:05:29,  6.92s/it]                                                       {'loss': 0.0288, 'grad_norm': 1.1753884553909302, 'learning_rate': 3.753237105270696e-05, 'epoch': 9.25}
- 18%|█▊        | 1073/5800 [2:55:49<9:05:29,  6.92s/it]score1 tensor([[0.4824],
-        [0.5078],
-        [0.5586],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5547, 0.5469, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:26,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 11:42:26,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.46 | bwd_microstep: 4630.90 | bwd_inner_microstep: 4625.73 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.87
-[2025-01-25 11:42:26,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.42 | bwd: 4630.93 | bwd_inner: 4625.73 | bwd_allreduce: 5.12 | step: 44.88
- 19%|█▊        | 1074/5800 [2:55:56<9:05:07,  6.92s/it]                                                       {'loss': 0.0435, 'grad_norm': 3.387134075164795, 'learning_rate': 3.7526994376315934e-05, 'epoch': 9.26}
- 19%|█▊        | 1074/5800 [2:55:56<9:05:07,  6.92s/it]score1 tensor([[0.4980],
-        [0.5039],
-        [0.3965],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5117, 0.4434, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:33,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.48 | optimizer_step: 4.74
-[2025-01-25 11:42:33,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.81 | bwd_microstep: 4627.42 | bwd_inner_microstep: 4618.93 | bwd_allreduce_microstep: 8.32 | step_microstep: 82.30
-[2025-01-25 11:42:33,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.78 | bwd: 4627.48 | bwd_inner: 4618.93 | bwd_allreduce: 8.41 | step: 82.32
- 19%|█▊        | 1075/5800 [2:56:03<9:06:16,  6.94s/it]                                                       {'loss': 0.0308, 'grad_norm': 0.9595419764518738, 'learning_rate': 3.752161223470394e-05, 'epoch': 9.27}
- 19%|█▊        | 1075/5800 [2:56:03<9:06:16,  6.94s/it]score1 tensor([[0.3262],
-        [0.5938],
-        [0.5586],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.5508, 0.5156, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:40,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 11:42:40,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.01 | bwd_microstep: 4621.95 | bwd_inner_microstep: 4617.00 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.77
-[2025-01-25 11:42:40,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.90 | bwd: 4621.98 | bwd_inner: 4617.00 | bwd_allreduce: 4.91 | step: 44.79
- 19%|█▊        | 1076/5800 [2:56:10<9:05:40,  6.93s/it]                                                       {'loss': 0.0454, 'grad_norm': 4.8025054931640625, 'learning_rate': 3.751622462954923e-05, 'epoch': 9.28}
- 19%|█▊        | 1076/5800 [2:56:10<9:05:40,  6.93s/it]score1 tensor([[0.3770],
-        [0.5430],
-        [0.6367],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4473, 0.5938, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:47,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:42:47,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.64 | bwd_microstep: 4626.86 | bwd_inner_microstep: 4621.53 | bwd_allreduce_microstep: 5.25 | step_microstep: 47.29
-[2025-01-25 11:42:47,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.61 | bwd: 4626.88 | bwd_inner: 4621.52 | bwd_allreduce: 5.29 | step: 47.31
- 19%|█▊        | 1077/5800 [2:56:17<9:04:59,  6.92s/it]                                                       {'loss': 0.0435, 'grad_norm': 0.9872585535049438, 'learning_rate': 3.751083156253174e-05, 'epoch': 9.28}
- 19%|█▊        | 1077/5800 [2:56:17<9:04:59,  6.92s/it]score1 tensor([[0.4492],
-        [0.5742],
-        [0.6172],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.6172, 0.5625, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:42:54,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 11:42:54,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.19 | bwd_microstep: 4620.42 | bwd_inner_microstep: 4615.54 | bwd_allreduce_microstep: 4.78 | step_microstep: 45.38
-[2025-01-25 11:42:54,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.14 | bwd: 4620.44 | bwd_inner: 4615.55 | bwd_allreduce: 4.83 | step: 45.39
- 19%|█▊        | 1078/5800 [2:56:24<9:04:40,  6.92s/it]                                                       {'loss': 0.0547, 'grad_norm': 3.990996837615967, 'learning_rate': 3.750543303533313e-05, 'epoch': 9.29}
- 19%|█▊        | 1078/5800 [2:56:24<9:04:40,  6.92s/it]score1 tensor([[0.5312],
-        [0.3262],
-        [0.4434],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4043, 0.4355, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:01,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 11:43:01,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.55 | bwd_microstep: 4628.27 | bwd_inner_microstep: 4623.27 | bwd_allreduce_microstep: 4.91 | step_microstep: 50.23
-[2025-01-25 11:43:01,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.51 | bwd: 4628.30 | bwd_inner: 4623.27 | bwd_allreduce: 4.95 | step: 50.24
- 19%|█▊        | 1079/5800 [2:56:31<9:04:36,  6.92s/it]                                                       {'loss': 0.0542, 'grad_norm': 0.635217010974884, 'learning_rate': 3.7500029049636764e-05, 'epoch': 9.3}
- 19%|█▊        | 1079/5800 [2:56:31<9:04:36,  6.92s/it]score1 tensor([[0.4355],
-        [0.4531],
-        [0.4902],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5039, 0.5156, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:08,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 11:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.01 | bwd_microstep: 4630.22 | bwd_inner_microstep: 4625.62 | bwd_allreduce_microstep: 4.49 | step_microstep: 46.53
-[2025-01-25 11:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.98 | bwd: 4630.24 | bwd_inner: 4625.62 | bwd_allreduce: 4.55 | step: 46.54
- 19%|█▊        | 1080/5800 [2:56:38<9:04:11,  6.92s/it]                                                       {'loss': 0.0352, 'grad_norm': 7.333136558532715, 'learning_rate': 3.749461960712768e-05, 'epoch': 9.31}
- 19%|█▊        | 1080/5800 [2:56:38<9:04:11,  6.92s/it]score1 tensor([[0.4551],
-        [0.4199],
-        [0.6992],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4297, 0.6367, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:15,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 11:43:15,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.27 | bwd_microstep: 4619.00 | bwd_inner_microstep: 4614.12 | bwd_allreduce_microstep: 4.78 | step_microstep: 48.83
-[2025-01-25 11:43:15,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.23 | bwd: 4619.03 | bwd_inner: 4614.12 | bwd_allreduce: 4.83 | step: 48.84
- 19%|█▊        | 1081/5800 [2:56:45<9:03:37,  6.91s/it]                                                       {'loss': 0.0327, 'grad_norm': 1.7396833896636963, 'learning_rate': 3.748920470949265e-05, 'epoch': 9.32}
- 19%|█▊        | 1081/5800 [2:56:45<9:03:37,  6.91s/it]score1 tensor([[0.4355],
-        [0.6172],
-        [0.4277],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.6133, 0.3887, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:22,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.36
-[2025-01-25 11:43:22,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4627.02 | bwd_inner_microstep: 4622.18 | bwd_allreduce_microstep: 4.74 | step_microstep: 49.27
-[2025-01-25 11:43:22,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.27 | bwd: 4627.04 | bwd_inner: 4622.18 | bwd_allreduce: 4.80 | step: 49.27
- 19%|█▊        | 1082/5800 [2:56:52<9:03:36,  6.91s/it]                                                       {'loss': 0.0205, 'grad_norm': 3.5948193073272705, 'learning_rate': 3.7483784358420126e-05, 'epoch': 9.33}
- 19%|█▊        | 1082/5800 [2:56:52<9:03:36,  6.91s/it]score1 tensor([[0.3613],
-        [0.3848],
-        [0.4824],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.2812, 0.5156, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:29,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:43:29,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.35 | bwd_microstep: 4627.21 | bwd_inner_microstep: 4619.40 | bwd_allreduce_microstep: 7.71 | step_microstep: 60.33
-[2025-01-25 11:43:29,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.32 | bwd: 4627.23 | bwd_inner: 4619.40 | bwd_allreduce: 7.76 | step: 60.39
- 19%|█▊        | 1083/5800 [2:56:59<9:05:05,  6.93s/it]                                                       {'loss': 0.0493, 'grad_norm': 3.06461238861084, 'learning_rate': 3.747835855560026e-05, 'epoch': 9.34}
- 19%|█▊        | 1083/5800 [2:56:59<9:05:05,  6.93s/it]score1 tensor([[0.4844],
-        [0.5781],
-        [0.4043],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6094, 0.4570, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:36,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 11:43:36,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4628.57 | bwd_inner_microstep: 4622.62 | bwd_allreduce_microstep: 5.85 | step_microstep: 71.41
-[2025-01-25 11:43:36,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4628.60 | bwd_inner: 4622.62 | bwd_allreduce: 5.90 | step: 71.42
- 19%|█▊        | 1084/5800 [2:57:06<9:05:09,  6.94s/it]                                                       {'loss': 0.0415, 'grad_norm': 7.164894104003906, 'learning_rate': 3.7472927302724926e-05, 'epoch': 9.34}
- 19%|█▊        | 1084/5800 [2:57:06<9:05:09,  6.94s/it]score1 tensor([[0.4531],
-        [0.4824],
-        [0.5195],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.6094, 0.5742, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:43,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 11:43:43,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.42 | bwd_microstep: 4628.69 | bwd_inner_microstep: 4623.57 | bwd_allreduce_microstep: 5.01 | step_microstep: 45.74
-[2025-01-25 11:43:43,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.35 | bwd: 4628.71 | bwd_inner: 4623.57 | bwd_allreduce: 5.07 | step: 45.73
- 19%|█▊        | 1085/5800 [2:57:13<9:04:59,  6.94s/it]                                                       {'loss': 0.063, 'grad_norm': 7.0042877197265625, 'learning_rate': 3.7467490601487684e-05, 'epoch': 9.35}
- 19%|█▊        | 1085/5800 [2:57:13<9:04:59,  6.94s/it]score1 tensor([[0.5430],
-        [0.5742],
-        [0.4453],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6953, 0.4707, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0708, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:49,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:43:49,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.66 | bwd_microstep: 4626.51 | bwd_inner_microstep: 4621.43 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.72
-[2025-01-25 11:43:49,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.62 | bwd: 4626.54 | bwd_inner: 4621.43 | bwd_allreduce: 5.04 | step: 45.72
- 19%|█▊        | 1086/5800 [2:57:19<9:04:16,  6.93s/it]                                                       {'loss': 0.0708, 'grad_norm': 4.029616355895996, 'learning_rate': 3.746204845358378e-05, 'epoch': 9.36}
- 19%|█▊        | 1086/5800 [2:57:19<9:04:16,  6.93s/it]score1 tensor([[0.4141],
-        [0.5078],
-        [0.4727],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5625, 0.5781, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:43:56,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 11:43:56,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.43 | bwd_microstep: 4620.78 | bwd_inner_microstep: 4615.54 | bwd_allreduce_microstep: 5.16 | step_microstep: 50.85
-[2025-01-25 11:43:56,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.40 | bwd: 4620.80 | bwd_inner: 4615.54 | bwd_allreduce: 5.20 | step: 50.86
- 19%|█▊        | 1087/5800 [2:57:26<9:03:35,  6.92s/it]                                                       {'loss': 0.0454, 'grad_norm': 0.6049211621284485, 'learning_rate': 3.7456600860710174e-05, 'epoch': 9.37}
- 19%|█▊        | 1087/5800 [2:57:26<9:03:35,  6.92s/it]score1 tensor([[0.5234],
-        [0.4746],
-        [0.4961],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4727, 0.5039, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:03,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.43 | optimizer_step: 4.36
-[2025-01-25 11:44:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.35 | bwd_microstep: 4617.83 | bwd_inner_microstep: 4613.26 | bwd_allreduce_microstep: 4.49 | step_microstep: 43.24
-[2025-01-25 11:44:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.30 | bwd: 4617.86 | bwd_inner: 4613.26 | bwd_allreduce: 4.53 | step: 43.25
- 19%|█▉        | 1088/5800 [2:57:33<9:02:43,  6.91s/it]                                                       {'loss': 0.0522, 'grad_norm': 3.910649299621582, 'learning_rate': 3.745114782456553e-05, 'epoch': 9.38}
- 19%|█▉        | 1088/5800 [2:57:33<9:02:43,  6.91s/it]score1 tensor([[0.4902],
-        [0.4980],
-        [0.5430],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5000, 0.5586, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:10,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 11:44:10,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.22 | bwd_microstep: 4618.22 | bwd_inner_microstep: 4613.04 | bwd_allreduce_microstep: 5.07 | step_microstep: 50.18
-[2025-01-25 11:44:10,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.19 | bwd: 4618.25 | bwd_inner: 4613.04 | bwd_allreduce: 5.12 | step: 50.19
- 19%|█▉        | 1089/5800 [2:57:40<9:02:19,  6.91s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.4616090655326843, 'learning_rate': 3.7445689346850176e-05, 'epoch': 9.39}
- 19%|█▉        | 1089/5800 [2:57:40<9:02:19,  6.91s/it]score1 tensor([[0.5391],
-        [0.5312],
-        [0.6250],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5117, 0.7031, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:17,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.42 | optimizer_step: 4.36
-[2025-01-25 11:44:17,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.65 | bwd_microstep: 4628.02 | bwd_inner_microstep: 4623.20 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.16
-[2025-01-25 11:44:17,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.61 | bwd: 4628.04 | bwd_inner: 4623.20 | bwd_allreduce: 4.76 | step: 43.16
- 19%|█▉        | 1090/5800 [2:57:47<9:02:05,  6.91s/it]                                                       {'loss': 0.04, 'grad_norm': 3.4773690700531006, 'learning_rate': 3.744022542926618e-05, 'epoch': 9.4}
- 19%|█▉        | 1090/5800 [2:57:47<9:02:05,  6.91s/it]score1 tensor([[0.5391],
-        [0.5195],
-        [0.5273],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4453, 0.3730, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0708, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:24,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 11:44:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4630.36 | bwd_inner_microstep: 4624.03 | bwd_allreduce_microstep: 6.24 | step_microstep: 53.62
-[2025-01-25 11:44:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4630.39 | bwd_inner: 4624.03 | bwd_allreduce: 6.29 | step: 53.66
- 19%|█▉        | 1091/5800 [2:57:54<9:02:20,  6.91s/it]                                                       {'loss': 0.0708, 'grad_norm': 3.9190993309020996, 'learning_rate': 3.7434756073517285e-05, 'epoch': 9.41}
- 19%|█▉        | 1091/5800 [2:57:54<9:02:20,  6.91s/it]score1 tensor([[0.5000],
-        [0.5039],
-        [0.5742],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4551, 0.6055, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:31,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 11:44:31,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.48 | bwd_microstep: 4623.88 | bwd_inner_microstep: 4616.26 | bwd_allreduce_microstep: 7.38 | step_microstep: 53.17
-[2025-01-25 11:44:31,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.46 | bwd: 4623.94 | bwd_inner: 4616.26 | bwd_allreduce: 7.51 | step: 53.15
- 19%|█▉        | 1092/5800 [2:58:01<9:02:30,  6.91s/it]                                                       {'loss': 0.0356, 'grad_norm': 3.5435702800750732, 'learning_rate': 3.742928128130892e-05, 'epoch': 9.41}
- 19%|█▉        | 1092/5800 [2:58:01<9:02:30,  6.91s/it]score1 tensor([[0.5664],
-        [0.4707],
-        [0.5352],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3750, 0.4883, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:38,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.76 | optimizer_step: 4.36
-[2025-01-25 11:44:38,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.46 | bwd_microstep: 4637.89 | bwd_inner_microstep: 4628.96 | bwd_allreduce_microstep: 8.69 | step_microstep: 85.84
-[2025-01-25 11:44:38,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.43 | bwd: 4637.97 | bwd_inner: 4628.95 | bwd_allreduce: 8.82 | step: 85.80
- 19%|█▉        | 1093/5800 [2:58:08<9:03:40,  6.93s/it]                                                       {'loss': 0.0522, 'grad_norm': 3.7464520931243896, 'learning_rate': 3.742380105434824e-05, 'epoch': 9.42}
- 19%|█▉        | 1093/5800 [2:58:08<9:03:40,  6.93s/it]score1 tensor([[0.5547],
-        [0.3672],
-        [0.5547],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3086, 0.6875, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:45,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:44:45,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.50 | bwd_microstep: 4619.92 | bwd_inner_microstep: 4615.29 | bwd_allreduce_microstep: 4.54 | step_microstep: 44.06
-[2025-01-25 11:44:45,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.45 | bwd: 4619.94 | bwd_inner: 4615.29 | bwd_allreduce: 4.58 | step: 44.07
- 19%|█▉        | 1094/5800 [2:58:15<9:02:46,  6.92s/it]                                                       {'loss': 0.0659, 'grad_norm': 4.269558906555176, 'learning_rate': 3.7418315394344044e-05, 'epoch': 9.43}
- 19%|█▉        | 1094/5800 [2:58:15<9:02:46,  6.92s/it]score1 tensor([[0.5312],
-        [0.4395],
-        [0.5469],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4590, 0.6211, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:52,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 11:44:52,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4622.40 | bwd_inner_microstep: 4616.79 | bwd_allreduce_microstep: 5.49 | step_microstep: 46.19
-[2025-01-25 11:44:52,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.80 | bwd: 4622.43 | bwd_inner: 4616.79 | bwd_allreduce: 5.56 | step: 46.20
- 19%|█▉        | 1095/5800 [2:58:22<9:02:22,  6.92s/it]                                                       {'loss': 0.0586, 'grad_norm': 7.604241371154785, 'learning_rate': 3.741282430300689e-05, 'epoch': 9.44}
- 19%|█▉        | 1095/5800 [2:58:22<9:02:22,  6.92s/it]score1 tensor([[0.5547],
-        [0.4590],
-        [0.4609],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5469, 0.4648, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:44:59,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 11:44:59,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.78 | bwd_microstep: 4627.99 | bwd_inner_microstep: 4622.79 | bwd_allreduce_microstep: 5.09 | step_microstep: 49.55
-[2025-01-25 11:44:59,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.75 | bwd: 4628.01 | bwd_inner: 4622.79 | bwd_allreduce: 5.15 | step: 49.56
- 19%|█▉        | 1096/5800 [2:58:29<9:02:08,  6.92s/it]                                                       {'loss': 0.0522, 'grad_norm': 7.503766059875488, 'learning_rate': 3.740732778204897e-05, 'epoch': 9.45}
- 19%|█▉        | 1096/5800 [2:58:29<9:02:08,  6.92s/it]score1 tensor([[0.5117],
-        [0.5039],
-        [0.4707],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5547, 0.4844, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:05,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 11:45:05,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4621.77 | bwd_inner_microstep: 4616.79 | bwd_allreduce_microstep: 4.82 | step_microstep: 46.87
-[2025-01-25 11:45:05,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.16 | bwd: 4621.80 | bwd_inner: 4616.79 | bwd_allreduce: 4.94 | step: 46.87
- 19%|█▉        | 1097/5800 [2:58:35<9:01:45,  6.91s/it]                                                       {'loss': 0.0488, 'grad_norm': 3.724777936935425, 'learning_rate': 3.740182583318421e-05, 'epoch': 9.46}
- 19%|█▉        | 1097/5800 [2:58:35<9:01:45,  6.91s/it]score1 tensor([[0.5938],
-        [0.4785],
-        [0.6094],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.3945, 0.6250, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:12,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:45:12,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4624.87 | bwd_inner_microstep: 4619.55 | bwd_allreduce_microstep: 5.21 | step_microstep: 46.66
-[2025-01-25 11:45:12,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4624.89 | bwd_inner: 4619.55 | bwd_allreduce: 5.26 | step: 46.67
- 19%|█▉        | 1098/5800 [2:58:42<9:01:54,  6.92s/it]                                                       {'loss': 0.041, 'grad_norm': 4.238851070404053, 'learning_rate': 3.73963184581282e-05, 'epoch': 9.47}
- 19%|█▉        | 1098/5800 [2:58:42<9:01:54,  6.92s/it]score1 tensor([[0.4844],
-        [0.4609],
-        [0.5273],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.3457, 0.5547, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:19,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 11:45:19,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4616.80 | bwd_inner_microstep: 4611.64 | bwd_allreduce_microstep: 5.08 | step_microstep: 44.93
-[2025-01-25 11:45:19,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.96 | bwd: 4616.83 | bwd_inner: 4611.64 | bwd_allreduce: 5.12 | step: 44.94
- 19%|█▉        | 1099/5800 [2:58:49<9:01:16,  6.91s/it]                                                       {'loss': 0.061, 'grad_norm': 0.6343216300010681, 'learning_rate': 3.739080565859825e-05, 'epoch': 9.47}
- 19%|█▉        | 1099/5800 [2:58:49<9:01:16,  6.91s/it]score1 tensor([[0.5820],
-        [0.6172],
-        [0.5000],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6602, 0.4023, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:26,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 11:45:26,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.58 | bwd_microstep: 4616.90 | bwd_inner_microstep: 4611.18 | bwd_allreduce_microstep: 5.64 | step_microstep: 55.20
-[2025-01-25 11:45:26,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.54 | bwd: 4616.93 | bwd_inner: 4611.18 | bwd_allreduce: 5.68 | step: 55.21
- 19%|█▉        | 1100/5800 [2:58:56<9:01:16,  6.91s/it]                                                       {'loss': 0.0723, 'grad_norm': 3.8350958824157715, 'learning_rate': 3.738528743631333e-05, 'epoch': 9.48}
- 19%|█▉        | 1100/5800 [2:58:56<9:01:16,  6.91s/it]score1 tensor([[0.5469],
-        [0.6016],
-        [0.5742],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4688, 0.4141, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1011, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:33,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.34 | optimizer_step: 4.36
-[2025-01-25 11:45:33,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4629.44 | bwd_inner_microstep: 4625.03 | bwd_allreduce_microstep: 4.33 | step_microstep: 35.42
-[2025-01-25 11:45:33,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4629.46 | bwd_inner: 4625.03 | bwd_allreduce: 4.37 | step: 35.42
- 19%|█▉        | 1101/5800 [2:59:03<9:00:45,  6.90s/it]                                                       {'loss': 0.1011, 'grad_norm': 8.011373519897461, 'learning_rate': 3.737976379299414e-05, 'epoch': 9.49}
- 19%|█▉        | 1101/5800 [2:59:03<9:00:45,  6.90s/it]score1 tensor([[0.5195],
-        [0.4707],
-        [0.6641],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4434, 0.5195, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1006, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:40,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.21 | optimizer_step: 4.37
-[2025-01-25 11:45:40,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.31 | bwd_microstep: 4630.75 | bwd_inner_microstep: 4625.48 | bwd_allreduce_microstep: 5.17 | step_microstep: 60.52
-[2025-01-25 11:45:40,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.29 | bwd: 4630.77 | bwd_inner: 4625.48 | bwd_allreduce: 5.22 | step: 60.55
- 19%|█▉        | 1102/5800 [2:59:10<9:00:56,  6.91s/it]                                                       {'loss': 0.1006, 'grad_norm': 7.97429895401001, 'learning_rate': 3.737423473036303e-05, 'epoch': 9.5}
- 19%|█▉        | 1102/5800 [2:59:10<9:00:56,  6.91s/it]score1 tensor([[0.5430],
-        [0.5586],
-        [0.6484],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5508, 0.6055, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:47,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.39 | optimizer_step: 4.36
-[2025-01-25 11:45:47,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.05 | bwd_microstep: 4623.38 | bwd_inner_microstep: 4615.79 | bwd_allreduce_microstep: 7.41 | step_microstep: 65.42
-[2025-01-25 11:45:47,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.04 | bwd: 4623.45 | bwd_inner: 4615.79 | bwd_allreduce: 7.50 | step: 65.38
- 19%|█▉        | 1103/5800 [2:59:17<9:01:26,  6.92s/it]                                                       {'loss': 0.0376, 'grad_norm': 8.02379322052002, 'learning_rate': 3.736870025014405e-05, 'epoch': 9.51}
- 19%|█▉        | 1103/5800 [2:59:17<9:01:26,  6.92s/it]score1 tensor([[0.4648],
-        [0.4805],
-        [0.4824],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4824, 0.4551, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:45:54,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 11:45:54,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.84 | bwd_microstep: 4626.43 | bwd_inner_microstep: 4622.38 | bwd_allreduce_microstep: 3.99 | step_microstep: 37.22
-[2025-01-25 11:45:54,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.81 | bwd: 4626.44 | bwd_inner: 4622.38 | bwd_allreduce: 4.02 | step: 37.22
- 19%|█▉        | 1104/5800 [2:59:24<9:00:38,  6.91s/it]                                                       {'loss': 0.0244, 'grad_norm': 4.002115249633789, 'learning_rate': 3.7363160354062976e-05, 'epoch': 9.52}
- 19%|█▉        | 1104/5800 [2:59:24<9:00:38,  6.91s/it]score1 tensor([[0.4648],
-        [0.5859],
-        [0.5938],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5586, 0.5273, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:01,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.25 | optimizer_step: 4.37
-[2025-01-25 11:46:01,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.10 | bwd_microstep: 4617.73 | bwd_inner_microstep: 4613.38 | bwd_allreduce_microstep: 4.27 | step_microstep: 35.19
-[2025-01-25 11:46:01,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.09 | bwd: 4617.75 | bwd_inner: 4613.38 | bwd_allreduce: 4.31 | step: 35.19
- 19%|█▉        | 1105/5800 [2:59:31<8:59:30,  6.89s/it]                                                       {'loss': 0.0498, 'grad_norm': 3.832758903503418, 'learning_rate': 3.7357615043847213e-05, 'epoch': 9.53}
- 19%|█▉        | 1105/5800 [2:59:31<8:59:30,  6.89s/it]score1 tensor([[0.5273],
-        [0.4590],
-        [0.6133],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5664, 0.6133, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:08,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:46:08,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.60 | bwd_microstep: 4572.31 | bwd_inner_microstep: 4567.18 | bwd_allreduce_microstep: 5.04 | step_microstep: 48.73
-[2025-01-25 11:46:08,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.59 | bwd: 4572.33 | bwd_inner: 4567.18 | bwd_allreduce: 5.08 | step: 48.73
- 19%|█▉        | 1106/5800 [2:59:38<8:58:15,  6.88s/it]                                                       {'loss': 0.061, 'grad_norm': 5.8975019454956055, 'learning_rate': 3.73520643212259e-05, 'epoch': 9.53}
- 19%|█▉        | 1106/5800 [2:59:38<8:58:15,  6.88s/it]score1 tensor([[0.5703],
-        [0.5312],
-        [0.3867],
-        [0.3262]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5391, 0.4531, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:14,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.19 | optimizer_step: 4.36
-[2025-01-25 11:46:14,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.40 | bwd_microstep: 4629.94 | bwd_inner_microstep: 4625.59 | bwd_allreduce_microstep: 4.29 | step_microstep: 40.98
-[2025-01-25 11:46:14,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.34 | bwd: 4629.96 | bwd_inner: 4625.59 | bwd_allreduce: 4.32 | step: 40.98
- 19%|█▉        | 1107/5800 [2:59:44<8:58:26,  6.88s/it]                                                       {'loss': 0.0444, 'grad_norm': 3.2046148777008057, 'learning_rate': 3.734650818792985e-05, 'epoch': 9.54}
- 19%|█▉        | 1107/5800 [2:59:44<8:58:26,  6.88s/it]score1 tensor([[0.4160],
-        [0.4512],
-        [0.4238],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4961, 0.4121, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:21,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 11:46:21,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.14 | bwd_microstep: 4621.54 | bwd_inner_microstep: 4616.38 | bwd_allreduce_microstep: 5.05 | step_microstep: 41.18
-[2025-01-25 11:46:21,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.12 | bwd: 4621.57 | bwd_inner: 4616.38 | bwd_allreduce: 5.11 | step: 41.18
- 19%|█▉        | 1108/5800 [2:59:51<8:58:21,  6.88s/it]                                                       {'loss': 0.0571, 'grad_norm': 3.5203769207000732, 'learning_rate': 3.734094664569156e-05, 'epoch': 9.55}
- 19%|█▉        | 1108/5800 [2:59:51<8:58:21,  6.88s/it]score1 tensor([[0.4727],
-        [0.4062],
-        [0.4980],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5156, 0.5273, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:28,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.28 | optimizer_step: 4.37
-[2025-01-25 11:46:28,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.13 | bwd_microstep: 4626.06 | bwd_inner_microstep: 4622.15 | bwd_allreduce_microstep: 3.84 | step_microstep: 35.28
-[2025-01-25 11:46:28,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.10 | bwd: 4626.07 | bwd_inner: 4622.15 | bwd_allreduce: 3.87 | step: 35.29
- 19%|█▉        | 1109/5800 [2:59:58<8:58:28,  6.89s/it]                                                       {'loss': 0.0566, 'grad_norm': 4.169429779052734, 'learning_rate': 3.733537969624522e-05, 'epoch': 9.56}
- 19%|█▉        | 1109/5800 [2:59:58<8:58:28,  6.89s/it]score1 tensor([[0.6055],
-        [0.3945],
-        [0.4160],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4512, 0.5391, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:35,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 11:46:35,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.88 | bwd_microstep: 4620.57 | bwd_inner_microstep: 4615.34 | bwd_allreduce_microstep: 5.12 | step_microstep: 39.06
-[2025-01-25 11:46:35,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.87 | bwd: 4620.59 | bwd_inner: 4615.34 | bwd_allreduce: 5.17 | step: 39.07
- 19%|█▉        | 1110/5800 [3:00:05<8:58:18,  6.89s/it]                                                       {'loss': 0.0762, 'grad_norm': 7.320600986480713, 'learning_rate': 3.732980734132668e-05, 'epoch': 9.57}
- 19%|█▉        | 1110/5800 [3:00:05<8:58:18,  6.89s/it]score1 tensor([[0.4316],
-        [0.5938],
-        [0.4844],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6641, 0.5117, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:42,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 11:46:42,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.64 | bwd_microstep: 4621.45 | bwd_inner_microstep: 4614.03 | bwd_allreduce_microstep: 7.21 | step_microstep: 63.68
-[2025-01-25 11:46:42,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.61 | bwd: 4621.57 | bwd_inner: 4614.03 | bwd_allreduce: 7.31 | step: 63.66
- 19%|█▉        | 1111/5800 [3:00:12<8:58:54,  6.90s/it]                                                       {'loss': 0.0396, 'grad_norm': 7.666648864746094, 'learning_rate': 3.732422958267353e-05, 'epoch': 9.58}
- 19%|█▉        | 1111/5800 [3:00:12<8:58:54,  6.90s/it]score1 tensor([[0.4375],
-        [0.4707],
-        [0.5078],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.4941, 0.4922, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:49,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.40 | optimizer_step: 4.37
-[2025-01-25 11:46:49,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.54 | bwd_microstep: 4617.00 | bwd_inner_microstep: 4611.59 | bwd_allreduce_microstep: 5.31 | step_microstep: 37.22
-[2025-01-25 11:46:49,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.51 | bwd: 4617.03 | bwd_inner: 4611.59 | bwd_allreduce: 5.37 | step: 37.23
- 19%|█▉        | 1112/5800 [3:00:19<8:58:27,  6.89s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.7495497465133667, 'learning_rate': 3.731864642202498e-05, 'epoch': 9.59}
- 19%|█▉        | 1112/5800 [3:00:19<8:58:27,  6.89s/it]score1 tensor([[0.7422],
-        [0.5312],
-        [0.6055],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5000, 0.4375, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:46:56,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.28 | optimizer_step: 4.36
-[2025-01-25 11:46:56,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.62 | bwd_microstep: 4628.47 | bwd_inner_microstep: 4624.30 | bwd_allreduce_microstep: 4.11 | step_microstep: 33.00
-[2025-01-25 11:46:56,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.54 | bwd: 4628.49 | bwd_inner: 4624.30 | bwd_allreduce: 4.14 | step: 33.01
- 19%|█▉        | 1113/5800 [3:00:26<8:58:33,  6.89s/it]                                                       {'loss': 0.0859, 'grad_norm': 8.279431343078613, 'learning_rate': 3.7313057861121974e-05, 'epoch': 9.59}
- 19%|█▉        | 1113/5800 [3:00:26<8:58:33,  6.89s/it]score1 tensor([[0.7383],
-        [0.4395],
-        [0.6797],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.3984, 0.5469, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:03,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 11:47:03,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.85 | bwd_microstep: 4628.62 | bwd_inner_microstep: 4623.42 | bwd_allreduce_microstep: 5.09 | step_microstep: 45.93
-[2025-01-25 11:47:03,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.84 | bwd: 4628.65 | bwd_inner: 4623.42 | bwd_allreduce: 5.15 | step: 45.93
- 19%|█▉        | 1114/5800 [3:00:33<8:58:43,  6.90s/it]                                                       {'loss': 0.0688, 'grad_norm': 8.167510032653809, 'learning_rate': 3.730746390170713e-05, 'epoch': 9.6}
- 19%|█▉        | 1114/5800 [3:00:33<8:58:43,  6.90s/it]score1 tensor([[0.6211],
-        [0.6406],
-        [0.7578],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5469, 0.6211, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:10,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 11:47:10,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.20 | bwd_microstep: 4620.20 | bwd_inner_microstep: 4615.08 | bwd_allreduce_microstep: 5.03 | step_microstep: 44.50
-[2025-01-25 11:47:10,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4620.23 | bwd_inner: 4615.08 | bwd_allreduce: 5.08 | step: 44.50
- 19%|█▉        | 1115/5800 [3:00:40<8:58:26,  6.90s/it]                                                       {'loss': 0.1387, 'grad_norm': 8.673182487487793, 'learning_rate': 3.730186454552472e-05, 'epoch': 9.61}
- 19%|█▉        | 1115/5800 [3:00:40<8:58:26,  6.90s/it]score1 tensor([[0.5977],
-        [0.4668],
-        [0.3984],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4395, 0.3340, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:16,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 11:47:16,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.34 | bwd_microstep: 4626.61 | bwd_inner_microstep: 4622.31 | bwd_allreduce_microstep: 4.22 | step_microstep: 44.74
-[2025-01-25 11:47:16,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.31 | bwd: 4626.63 | bwd_inner: 4622.31 | bwd_allreduce: 4.26 | step: 44.74
- 19%|█▉        | 1116/5800 [3:00:46<8:58:24,  6.90s/it]                                                       {'loss': 0.0396, 'grad_norm': 3.331700086593628, 'learning_rate': 3.729625979432074e-05, 'epoch': 9.62}
- 19%|█▉        | 1116/5800 [3:00:46<8:58:24,  6.90s/it]score1 tensor([[0.6016],
-        [0.5039],
-        [0.4883],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4902, 0.4297, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:23,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 11:47:23,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.15 | bwd_microstep: 4617.56 | bwd_inner_microstep: 4613.03 | bwd_allreduce_microstep: 4.45 | step_microstep: 42.92
-[2025-01-25 11:47:23,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.12 | bwd: 4617.58 | bwd_inner: 4613.03 | bwd_allreduce: 4.49 | step: 42.92
- 19%|█▉        | 1117/5800 [3:00:53<8:58:12,  6.90s/it]                                                       {'loss': 0.0654, 'grad_norm': 7.894193172454834, 'learning_rate': 3.729064964984283e-05, 'epoch': 9.63}
- 19%|█▉        | 1117/5800 [3:00:53<8:58:12,  6.90s/it]score1 tensor([[0.6992],
-        [0.6367],
-        [0.5469],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5078, 0.5547, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:30,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 11:47:30,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.60 | bwd_microstep: 4619.25 | bwd_inner_microstep: 4614.60 | bwd_allreduce_microstep: 4.55 | step_microstep: 41.75
-[2025-01-25 11:47:30,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.57 | bwd: 4619.28 | bwd_inner: 4614.60 | bwd_allreduce: 4.61 | step: 41.76
- 19%|█▉        | 1118/5800 [3:01:00<8:58:20,  6.90s/it]                                                       {'loss': 0.0615, 'grad_norm': 4.716207504272461, 'learning_rate': 3.7285034113840346e-05, 'epoch': 9.64}
- 19%|█▉        | 1118/5800 [3:01:00<8:58:20,  6.90s/it]score1 tensor([[0.4375],
-        [0.4980],
-        [0.6094],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5156, 0.6094, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:37,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.42 | optimizer_step: 4.36
-[2025-01-25 11:47:37,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.97 | bwd_microstep: 4580.27 | bwd_inner_microstep: 4576.09 | bwd_allreduce_microstep: 4.12 | step_microstep: 35.57
-[2025-01-25 11:47:37,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.92 | bwd: 4580.29 | bwd_inner: 4576.09 | bwd_allreduce: 4.15 | step: 35.57
- 19%|█▉        | 1119/5800 [3:01:07<8:56:48,  6.88s/it]                                                       {'loss': 0.0312, 'grad_norm': 2.2087011337280273, 'learning_rate': 3.72794131880643e-05, 'epoch': 9.65}
- 19%|█▉        | 1119/5800 [3:01:07<8:56:48,  6.88s/it]score1 tensor([[0.3281],
-        [0.5312],
-        [0.3457],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5820, 0.4121, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0874, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:44,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.36 | optimizer_step: 4.36
-[2025-01-25 11:47:44,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.93 | bwd_microstep: 4626.54 | bwd_inner_microstep: 4621.76 | bwd_allreduce_microstep: 4.68 | step_microstep: 45.34
-[2025-01-25 11:47:44,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.93 | bwd: 4626.56 | bwd_inner: 4621.76 | bwd_allreduce: 4.73 | step: 45.35
- 19%|█▉        | 1120/5800 [3:01:14<8:57:15,  6.89s/it]                                                       {'loss': 0.0874, 'grad_norm': 7.036182403564453, 'learning_rate': 3.7273786874267405e-05, 'epoch': 9.66}
- 19%|█▉        | 1120/5800 [3:01:14<8:57:15,  6.89s/it]score1 tensor([[0.3398],
-        [0.3770],
-        [0.2354],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4023, 0.3652, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:51,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.41 | optimizer_step: 4.36
-[2025-01-25 11:47:51,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.85 | bwd_microstep: 4626.99 | bwd_inner_microstep: 4621.90 | bwd_allreduce_microstep: 5.02 | step_microstep: 38.18
-[2025-01-25 11:47:51,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.81 | bwd: 4627.01 | bwd_inner: 4621.90 | bwd_allreduce: 5.05 | step: 38.18
- 19%|█▉        | 1121/5800 [3:01:21<8:57:28,  6.89s/it]                                                       {'loss': 0.0752, 'grad_norm': 2.8223209381103516, 'learning_rate': 3.726815517420403e-05, 'epoch': 9.66}
- 19%|█▉        | 1121/5800 [3:01:21<8:57:28,  6.89s/it]score1 tensor([[0.4375],
-        [0.5586],
-        [0.4277],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5977, 0.5664, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:47:58,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 11.85 | optimizer_step: 4.92
-[2025-01-25 11:47:58,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.35 | bwd_microstep: 4644.82 | bwd_inner_microstep: 4634.99 | bwd_allreduce_microstep: 9.48 | step_microstep: 124.05
-[2025-01-25 11:47:58,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.31 | bwd: 4644.90 | bwd_inner: 4634.98 | bwd_allreduce: 9.66 | step: 124.02
- 19%|█▉        | 1122/5800 [3:01:28<9:00:59,  6.94s/it]                                                       {'loss': 0.0552, 'grad_norm': 7.528820991516113, 'learning_rate': 3.726251808963024e-05, 'epoch': 9.67}
- 19%|█▉        | 1122/5800 [3:01:28<9:00:59,  6.94s/it]score1 tensor([[0.4238],
-        [0.2754],
-        [0.4297],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4512, 0.4727, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:05,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.24 | optimizer_step: 4.36
-[2025-01-25 11:48:05,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4615.42 | bwd_inner_microstep: 4609.64 | bwd_allreduce_microstep: 5.69 | step_microstep: 34.64
-[2025-01-25 11:48:05,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.23 | bwd: 4615.45 | bwd_inner: 4609.64 | bwd_allreduce: 5.74 | step: 34.63
- 19%|█▉        | 1123/5800 [3:01:35<8:59:26,  6.92s/it]                                                       {'loss': 0.084, 'grad_norm': 6.458930969238281, 'learning_rate': 3.725687562230379e-05, 'epoch': 9.68}
- 19%|█▉        | 1123/5800 [3:01:35<8:59:26,  6.92s/it]score1 tensor([[0.4473],
-        [0.4883],
-        [0.5391],
-        [0.3164]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4941, 0.5508, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:12,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 11:48:12,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.08 | bwd_microstep: 4625.55 | bwd_inner_microstep: 4620.20 | bwd_allreduce_microstep: 5.18 | step_microstep: 47.50
-[2025-01-25 11:48:12,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.06 | bwd: 4625.58 | bwd_inner: 4620.20 | bwd_allreduce: 5.29 | step: 47.51
- 19%|█▉        | 1124/5800 [3:01:42<8:58:45,  6.91s/it]                                                       {'loss': 0.0381, 'grad_norm': 7.023523807525635, 'learning_rate': 3.725122777398408e-05, 'epoch': 9.69}
- 19%|█▉        | 1124/5800 [3:01:42<8:58:45,  6.91s/it]score1 tensor([[0.4883],
-        [0.5234],
-        [0.4551],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4668, 0.4844, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:19,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.46 | optimizer_step: 4.36
-[2025-01-25 11:48:19,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.66 | bwd_microstep: 4618.44 | bwd_inner_microstep: 4613.80 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.97
-[2025-01-25 11:48:19,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4618.46 | bwd_inner: 4613.80 | bwd_allreduce: 4.60 | step: 41.97
- 19%|█▉        | 1125/5800 [3:01:49<8:58:09,  6.91s/it]                                                       {'loss': 0.0376, 'grad_norm': 1.3042181730270386, 'learning_rate': 3.724557454643222e-05, 'epoch': 9.7}
- 19%|█���        | 1125/5800 [3:01:49<8:58:09,  6.91s/it]score1 tensor([[0.4961],
-        [0.5625],
-        [0.3750],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4648, 0.4805, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:26,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 11:48:26,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4628.74 | bwd_inner_microstep: 4623.98 | bwd_allreduce_microstep: 4.67 | step_microstep: 45.47
-[2025-01-25 11:48:26,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4628.76 | bwd_inner: 4623.98 | bwd_allreduce: 4.71 | step: 45.48
- 19%|█▉        | 1126/5800 [3:01:56<8:58:10,  6.91s/it]                                                       {'loss': 0.0552, 'grad_norm': 3.3302195072174072, 'learning_rate': 3.723991594141098e-05, 'epoch': 9.71}
- 19%|█▉        | 1126/5800 [3:01:56<8:58:10,  6.91s/it]score1 tensor([[0.5469],
-        [0.5898],
-        [0.4805],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.5391, 0.5312, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:32,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 11:48:32,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.29 | bwd_microstep: 4628.96 | bwd_inner_microstep: 4624.09 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.47
-[2025-01-25 11:48:32,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.25 | bwd: 4628.98 | bwd_inner: 4624.09 | bwd_allreduce: 4.81 | step: 44.48
- 19%|█▉        | 1127/5800 [3:02:02<8:58:08,  6.91s/it]                                                       {'loss': 0.04, 'grad_norm': 0.7571731209754944, 'learning_rate': 3.723425196068481e-05, 'epoch': 9.72}
- 19%|█▉        | 1127/5800 [3:02:02<8:58:08,  6.91s/it]score1 tensor([[0.6602],
-        [0.4375],
-        [0.5000],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4160, 0.4961, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:39,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 11:48:39,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.41 | bwd_microstep: 4618.52 | bwd_inner_microstep: 4613.44 | bwd_allreduce_microstep: 4.97 | step_microstep: 46.76
-[2025-01-25 11:48:39,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.38 | bwd: 4618.55 | bwd_inner: 4613.44 | bwd_allreduce: 5.03 | step: 46.77
- 19%|█▉        | 1128/5800 [3:02:09<8:57:44,  6.91s/it]                                                       {'loss': 0.0645, 'grad_norm': 7.684942722320557, 'learning_rate': 3.7228582606019834e-05, 'epoch': 9.72}
- 19%|█▉        | 1128/5800 [3:02:09<8:57:44,  6.91s/it]score1 tensor([[0.5977],
-        [0.5391],
-        [0.5586],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4727, 0.4746, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0718, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:46,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 11:48:46,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4616.92 | bwd_inner_microstep: 4611.91 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.61
-[2025-01-25 11:48:46,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4616.95 | bwd_inner: 4611.91 | bwd_allreduce: 4.97 | step: 45.61
- 19%|█▉        | 1129/5800 [3:02:16<8:57:12,  6.90s/it]                                                       {'loss': 0.0718, 'grad_norm': 7.857066631317139, 'learning_rate': 3.722290787918386e-05, 'epoch': 9.73}
- 19%|█▉        | 1129/5800 [3:02:16<8:57:12,  6.90s/it]score1 tensor([[0.4492],
-        [0.6484],
-        [0.6367],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3613, 0.6055, 0.5430, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:48:53,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 11:48:53,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.63 | bwd_microstep: 4620.13 | bwd_inner_microstep: 4615.07 | bwd_allreduce_microstep: 4.94 | step_microstep: 50.36
-[2025-01-25 11:48:53,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.57 | bwd: 4620.17 | bwd_inner: 4615.07 | bwd_allreduce: 4.99 | step: 50.36
- 19%|█▉        | 1130/5800 [3:02:23<8:57:00,  6.90s/it]                                                       {'loss': 0.0688, 'grad_norm': 8.096781730651855, 'learning_rate': 3.721722778194637e-05, 'epoch': 9.74}
- 19%|█▉        | 1130/5800 [3:02:23<8:57:00,  6.90s/it]score1 tensor([[0.3965],
-        [0.0000],
-        [0.5625],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.1787, 0.4453, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:00,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.78 | optimizer_step: 4.37
-[2025-01-25 11:49:00,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.22 | bwd_microstep: 4571.60 | bwd_inner_microstep: 4565.56 | bwd_allreduce_microstep: 5.94 | step_microstep: 56.88
-[2025-01-25 11:49:00,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.08 | bwd: 4571.62 | bwd_inner: 4565.56 | bwd_allreduce: 5.99 | step: 56.92
- 20%|█▉        | 1131/5800 [3:02:30<8:56:57,  6.90s/it]                                                       {'loss': 0.0977, 'grad_norm': 5.542466640472412, 'learning_rate': 3.7211542316078506e-05, 'epoch': 9.75}
- 20%|█▉        | 1131/5800 [3:02:30<8:56:57,  6.90s/it]score1 tensor([[0.6523],
-        [0.5312],
-        [0.5312],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4551, 0.4629, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:07,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 11:49:07,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.25 | bwd_microstep: 4623.60 | bwd_inner_microstep: 4617.19 | bwd_allreduce_microstep: 6.22 | step_microstep: 62.77
-[2025-01-25 11:49:07,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.19 | bwd: 4623.64 | bwd_inner: 4617.19 | bwd_allreduce: 6.32 | step: 62.73
- 20%|█▉        | 1132/5800 [3:02:37<8:58:14,  6.92s/it]                                                       {'loss': 0.0781, 'grad_norm': 7.8043012619018555, 'learning_rate': 3.7205851483353105e-05, 'epoch': 9.76}
- 20%|█▉        | 1132/5800 [3:02:37<8:58:14,  6.92s/it]score1 tensor([[0.5391],
-        [0.4883],
-        [0.6172],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4629, 0.6797, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:14,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 11:49:14,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.79 | bwd_microstep: 4624.23 | bwd_inner_microstep: 4619.02 | bwd_allreduce_microstep: 5.10 | step_microstep: 46.43
-[2025-01-25 11:49:14,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.72 | bwd: 4624.26 | bwd_inner: 4619.02 | bwd_allreduce: 5.16 | step: 46.44
- 20%|█▉        | 1133/5800 [3:02:44<8:58:05,  6.92s/it]                                                       {'loss': 0.0381, 'grad_norm': 3.8599853515625, 'learning_rate': 3.7200155285544666e-05, 'epoch': 9.77}
- 20%|█▉        | 1133/5800 [3:02:44<8:58:05,  6.92s/it]score1 tensor([[0.5195],
-        [0.4980],
-        [0.5273],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5430, 0.4785, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:21,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 11:49:21,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.14 | bwd_microstep: 4630.45 | bwd_inner_microstep: 4624.77 | bwd_allreduce_microstep: 5.58 | step_microstep: 43.93
-[2025-01-25 11:49:21,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.10 | bwd: 4630.47 | bwd_inner: 4624.77 | bwd_allreduce: 5.64 | step: 43.93
- 20%|█▉        | 1134/5800 [3:02:51<8:57:54,  6.92s/it]                                                       {'loss': 0.0347, 'grad_norm': 3.7602858543395996, 'learning_rate': 3.7194453724429356e-05, 'epoch': 9.78}
- 20%|█▉        | 1134/5800 [3:02:51<8:57:54,  6.92s/it]score1 tensor([[0.5781],
-        [0.4551],
-        [0.5156],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4688, 0.4941, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:28,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 11:49:28,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.39 | bwd_microstep: 4625.27 | bwd_inner_microstep: 4620.60 | bwd_allreduce_microstep: 4.58 | step_microstep: 43.79
-[2025-01-25 11:49:28,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.34 | bwd: 4625.30 | bwd_inner: 4620.60 | bwd_allreduce: 4.62 | step: 43.82
- 20%|█▉        | 1135/5800 [3:02:58<8:57:35,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.451547384262085, 'learning_rate': 3.7188746801785034e-05, 'epoch': 9.78}
- 20%|█▉        | 1135/5800 [3:02:58<8:57:35,  6.91s/it]score1 tensor([[0.4629],
-        [0.5000],
-        [0.5352],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4805, 0.5664, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:35,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 11:49:35,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4620.26 | bwd_inner_microstep: 4615.32 | bwd_allreduce_microstep: 4.82 | step_microstep: 46.54
-[2025-01-25 11:49:35,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4620.29 | bwd_inner: 4615.32 | bwd_allreduce: 4.89 | step: 46.55
- 20%|█▉        | 1136/5800 [3:03:05<8:57:04,  6.91s/it]                                                       {'loss': 0.0391, 'grad_norm': 3.2802743911743164, 'learning_rate': 3.7183034519391204e-05, 'epoch': 9.79}
- 20%|█▉        | 1136/5800 [3:03:05<8:57:04,  6.91s/it]score1 tensor([[0.4863],
-        [0.4336],
-        [0.4707],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5039, 0.4824, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:42,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 11:49:42,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4623.56 | bwd_inner_microstep: 4618.46 | bwd_allreduce_microstep: 5.01 | step_microstep: 46.23
-[2025-01-25 11:49:42,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.29 | bwd: 4623.59 | bwd_inner: 4618.46 | bwd_allreduce: 5.06 | step: 46.24
- 20%|█▉        | 1137/5800 [3:03:12<8:56:47,  6.91s/it]                                                       {'loss': 0.062, 'grad_norm': 6.801357746124268, 'learning_rate': 3.717731687902905e-05, 'epoch': 9.8}
- 20%|█▉        | 1137/5800 [3:03:12<8:56:47,  6.91s/it]score1 tensor([[0.3691],
-        [0.4863],
-        [0.4492],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4512, 0.4844, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:48,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 11:49:48,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4622.44 | bwd_inner_microstep: 4617.45 | bwd_allreduce_microstep: 4.86 | step_microstep: 44.34
-[2025-01-25 11:49:48,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4622.47 | bwd_inner: 4617.45 | bwd_allreduce: 4.93 | step: 44.35
- 20%|█▉        | 1138/5800 [3:03:18<8:56:31,  6.91s/it]                                                       {'loss': 0.0571, 'grad_norm': 3.0824666023254395, 'learning_rate': 3.7171593882481455e-05, 'epoch': 9.81}
- 20%|█▉        | 1138/5800 [3:03:18<8:56:31,  6.91s/it]score1 tensor([[0.4824],
-        [0.4844],
-        [0.4805],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4551, 0.5156, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:49:55,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 11:49:55,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.78 | bwd_microstep: 4620.80 | bwd_inner_microstep: 4615.71 | bwd_allreduce_microstep: 4.93 | step_microstep: 45.17
-[2025-01-25 11:49:55,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.74 | bwd: 4620.82 | bwd_inner: 4615.71 | bwd_allreduce: 5.03 | step: 45.18
- 20%|█▉        | 1139/5800 [3:03:25<8:56:13,  6.90s/it]                                                       {'loss': 0.0693, 'grad_norm': 3.6454732418060303, 'learning_rate': 3.716586553153293e-05, 'epoch': 9.82}
- 20%|█▉        | 1139/5800 [3:03:25<8:56:13,  6.90s/it]score1 tensor([[0.4883],
-        [0.4414],
-        [0.4863],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4648, 0.5352, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:02,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 11:50:02,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4621.85 | bwd_inner_microstep: 4616.49 | bwd_allreduce_microstep: 5.14 | step_microstep: 45.06
-[2025-01-25 11:50:02,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4621.91 | bwd_inner: 4616.49 | bwd_allreduce: 5.25 | step: 45.04
- 20%|█▉        | 1140/5800 [3:03:32<8:56:17,  6.91s/it]                                                       {'loss': 0.0361, 'grad_norm': 0.5259912014007568, 'learning_rate': 3.716013182796967e-05, 'epoch': 9.83}
- 20%|█▉        | 1140/5800 [3:03:32<8:56:17,  6.91s/it]score1 tensor([[0.4668],
-        [0.3848],
-        [0.5039],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4258, 0.5430, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:09,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.13 | optimizer_step: 4.37
-[2025-01-25 11:50:09,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.43 | bwd_microstep: 4621.26 | bwd_inner_microstep: 4615.97 | bwd_allreduce_microstep: 5.19 | step_microstep: 66.90
-[2025-01-25 11:50:09,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.39 | bwd: 4621.28 | bwd_inner: 4615.97 | bwd_allreduce: 5.24 | step: 67.01
- 20%|█▉        | 1141/5800 [3:03:39<8:57:25,  6.92s/it]                                                       {'loss': 0.0454, 'grad_norm': 3.225099563598633, 'learning_rate': 3.715439277357956e-05, 'epoch': 9.84}
- 20%|█▉        | 1141/5800 [3:03:39<8:57:25,  6.92s/it]score1 tensor([[0.4785],
-        [0.5625],
-        [0.4883],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.5664, 0.5039, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:16,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.06 | optimizer_step: 4.37
-[2025-01-25 11:50:16,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.49 | bwd_microstep: 4623.57 | bwd_inner_microstep: 4617.42 | bwd_allreduce_microstep: 6.03 | step_microstep: 63.07
-[2025-01-25 11:50:16,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.46 | bwd: 4623.59 | bwd_inner: 4617.42 | bwd_allreduce: 6.10 | step: 63.08
- 20%|█▉        | 1142/5800 [3:03:46<8:58:17,  6.93s/it]                                                       {'loss': 0.0396, 'grad_norm': 1.1438307762145996, 'learning_rate': 3.7148648370152134e-05, 'epoch': 9.84}
- 20%|█▉        | 1142/5800 [3:03:46<8:58:17,  6.93s/it]score1 tensor([[0.4688],
-        [0.4727],
-        [0.5469],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.3652, 0.5391, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:23,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 11:50:23,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.87 | bwd_microstep: 4633.69 | bwd_inner_microstep: 4628.61 | bwd_allreduce_microstep: 4.99 | step_microstep: 45.90
-[2025-01-25 11:50:23,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.84 | bwd: 4633.72 | bwd_inner: 4628.61 | bwd_allreduce: 5.03 | step: 45.91
- 20%|█▉        | 1143/5800 [3:03:53<8:58:00,  6.93s/it]                                                       {'loss': 0.0659, 'grad_norm': 6.9123454093933105, 'learning_rate': 3.7142898619478584e-05, 'epoch': 9.85}
- 20%|█▉        | 1143/5800 [3:03:53<8:58:00,  6.93s/it]score1 tensor([[0.5039],
-        [0.4980],
-        [0.5195],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4609, 0.4863, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:30,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 11:50:30,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.79 | bwd_microstep: 4621.25 | bwd_inner_microstep: 4616.40 | bwd_allreduce_microstep: 4.77 | step_microstep: 44.84
-[2025-01-25 11:50:30,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.75 | bwd: 4621.27 | bwd_inner: 4616.40 | bwd_allreduce: 4.81 | step: 44.85
- 20%|█▉        | 1144/5800 [3:04:00<8:57:09,  6.92s/it]                                                       {'loss': 0.0366, 'grad_norm': 6.936183452606201, 'learning_rate': 3.7137143523351787e-05, 'epoch': 9.86}
- 20%|█▉        | 1144/5800 [3:04:00<8:57:09,  6.92s/it]score1 tensor([[0.5469],
-        [0.5625],
-        [0.5977],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4707, 0.6016, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:37,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 11:50:37,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.78 | bwd_microstep: 4629.13 | bwd_inner_microstep: 4624.43 | bwd_allreduce_microstep: 4.60 | step_microstep: 43.15
-[2025-01-25 11:50:37,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.75 | bwd: 4629.16 | bwd_inner: 4624.43 | bwd_allreduce: 4.65 | step: 43.17
- 20%|█▉        | 1145/5800 [3:04:07<8:56:40,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 0.526807427406311, 'learning_rate': 3.713138308356629e-05, 'epoch': 9.87}
- 20%|█▉        | 1145/5800 [3:04:07<8:56:40,  6.92s/it]score1 tensor([[0.5039],
-        [0.4492],
-        [0.5391],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4160, 0.5078, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:44,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 11:50:44,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.49 | bwd_microstep: 4648.60 | bwd_inner_microstep: 4643.64 | bwd_allreduce_microstep: 4.87 | step_microstep: 45.24
-[2025-01-25 11:50:44,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.47 | bwd: 4648.62 | bwd_inner: 4643.64 | bwd_allreduce: 4.92 | step: 45.25
- 20%|█▉        | 1146/5800 [3:04:14<8:56:45,  6.92s/it]                                                       {'loss': 0.0342, 'grad_norm': 6.9365034103393555, 'learning_rate': 3.712561730191829e-05, 'epoch': 9.88}
- 20%|█▉        | 1146/5800 [3:04:14<8:56:45,  6.92s/it]score1 tensor([[0.5547],
-        [0.5273],
-        [0.4785],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5430, 0.4180, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:51,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 11:50:51,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.52 | bwd_microstep: 4647.10 | bwd_inner_microstep: 4641.65 | bwd_allreduce_microstep: 5.34 | step_microstep: 46.88
-[2025-01-25 11:50:51,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.48 | bwd: 4647.12 | bwd_inner: 4641.65 | bwd_allreduce: 5.39 | step: 46.90
- 20%|█▉        | 1147/5800 [3:04:21<8:56:58,  6.92s/it]                                                       {'loss': 0.0347, 'grad_norm': 3.734989643096924, 'learning_rate': 3.711984618020566e-05, 'epoch': 9.89}
- 20%|█▉        | 1147/5800 [3:04:21<8:56:58,  6.92s/it]score1 tensor([[0.4902],
-        [0.4941],
-        [0.4746],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4570, 0.4453, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:50:58,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.36
-[2025-01-25 11:50:58,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.29 | bwd_microstep: 4640.05 | bwd_inner_microstep: 4635.21 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.67
-[2025-01-25 11:50:58,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.23 | bwd: 4640.08 | bwd_inner: 4635.21 | bwd_allreduce: 4.81 | step: 43.69
- 20%|█▉        | 1148/5800 [3:04:28<8:56:36,  6.92s/it]                                                       {'loss': 0.02, 'grad_norm': 3.18876314163208, 'learning_rate': 3.7114069720227934e-05, 'epoch': 9.9}
- 20%|█▉        | 1148/5800 [3:04:28<8:56:36,  6.92s/it]score1 tensor([[0.5000],
-        [0.5430],
-        [0.3301],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4492, 0.3418, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:51:05,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 11:51:05,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.09 | bwd_microstep: 4649.46 | bwd_inner_microstep: 4644.92 | bwd_allreduce_microstep: 4.46 | step_microstep: 43.03
-[2025-01-25 11:51:05,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.04 | bwd: 4649.48 | bwd_inner: 4644.92 | bwd_allreduce: 4.50 | step: 43.05
- 20%|█▉        | 1149/5800 [3:04:35<8:56:37,  6.92s/it]                                                       {'loss': 0.04, 'grad_norm': 0.7415252327919006, 'learning_rate': 3.710828792378631e-05, 'epoch': 9.91}
- 20%|█▉        | 1149/5800 [3:04:35<8:56:37,  6.92s/it]score1 tensor([[0.6055],
-        [0.4766],
-        [0.5234],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.4531, 0.5195, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 11:51:12,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 11:51:12,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.32 | bwd_microstep: 4643.50 | bwd_inner_microstep: 4637.18 | bwd_allreduce_microstep: 6.11 | step_microstep: 56.77
-[2025-01-25 11:51:12,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.29 | bwd: 4643.55 | bwd_inner: 4637.18 | bwd_allreduce: 6.22 | step: 56.76
- 20%|█▉        | 1150/5800 [3:04:42<8:56:51,  6.93s/it]                                                       {'loss': 0.0386, 'grad_norm': 3.1945228576660156, 'learning_rate': 3.710250079268367e-05, 'epoch': 9.91}
- 20%|█▉        | 1150/5800 [3:04:42<8:56:51,  6.93s/it]evaluate!
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3965]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2129, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3613]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3848]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2422, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3730]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.1240]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2969, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3867]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3730]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1660, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1543, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1562, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3770]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1582, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3711]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6171805400254877
-PLCC_score: 0.5668766171035841
-KRCC_score: 0.44145287710862735
-SRCC_level: 0.6171805400254877
-PLCC_level: 0.5668766171035841
-KRCC_level: 0.44145287710862735
-New best SRCC_score: 0.6171805400254877. Saving model...
-[INFO|trainer.py:3705] 2025-01-25 12:01:44,866 >> Saving model checkpoint to /DATA/env/wjr/newtrain/stage2/mos3
-[INFO|configuration_utils.py:410] 2025-01-25 12:01:44,873 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/config.json
-[INFO|configuration_utils.py:868] 2025-01-25 12:01:44,874 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/generation_config.json
-[INFO|modeling_utils.py:2844] 2025-01-25 12:03:24,825 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /DATA/env/wjr/newtrain/stage2/mos3/model.safetensors.index.json.
-[INFO|tokenization_utils_base.py:2641] 2025-01-25 12:03:24,830 >> tokenizer config file saved in /DATA/env/wjr/newtrain/stage2/mos3/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2650] 2025-01-25 12:03:24,830 >> Special tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/special_tokens_map.json
-[INFO|tokenization_utils_base.py:2701] 2025-01-25 12:03:24,831 >> added tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/added_tokens.json
-01/25/2025 12:03:35 - INFO - __main__ - Saved LoRA weights to /DATA/env/wjr/newtrain/stage2/mos3/lora_weights.pth
-score1 tensor([[0.4043],
-        [0.4766],
-        [0.4980],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5078, 0.4922, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:03:42,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 12:03:42,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.40 | bwd_microstep: 4578.36 | bwd_inner_microstep: 4569.28 | bwd_allreduce_microstep: 8.82 | step_microstep: 52.91
-[2025-01-25 12:03:42,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.34 | bwd: 4578.43 | bwd_inner: 4569.28 | bwd_allreduce: 8.96 | step: 52.89
- 20%|█▉        | 1151/5800 [3:17:12<297:01:36, 230.01s/it]                                                          {'loss': 0.0259, 'grad_norm': 3.3545780181884766, 'learning_rate': 3.709670832872451e-05, 'epoch': 9.92}
- 20%|█▉        | 1151/5800 [3:17:12<297:01:36, 230.01s/it]score1 tensor([[0.4062],
-        [0.4395],
-        [0.4199],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5430, 0.4160, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0830, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:03:49,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 12:03:49,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.09 | bwd_microstep: 4596.91 | bwd_inner_microstep: 4591.58 | bwd_allreduce_microstep: 5.24 | step_microstep: 44.68
-[2025-01-25 12:03:49,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.06 | bwd: 4596.93 | bwd_inner: 4591.58 | bwd_allreduce: 5.28 | step: 44.69
- 20%|█▉        | 1152/5800 [3:17:19<210:31:46, 163.06s/it]                                                          {'loss': 0.083, 'grad_norm': 3.305673837661743, 'learning_rate': 3.7090910533715055e-05, 'epoch': 9.93}
- 20%|█▉        | 1152/5800 [3:17:19<210:31:46, 163.06s/it]score1 tensor([[0.4512],
-        [0.4102],
-        [0.4766],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4941, 0.5508, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:03:56,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 12:03:56,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.67 | bwd_microstep: 4607.84 | bwd_inner_microstep: 4602.84 | bwd_allreduce_microstep: 4.90 | step_microstep: 44.75
-[2025-01-25 12:03:56,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.63 | bwd: 4607.87 | bwd_inner: 4602.83 | bwd_allreduce: 4.95 | step: 44.76
- 20%|█▉        | 1153/5800 [3:17:26<150:00:00, 116.20s/it]                                                          {'loss': 0.0444, 'grad_norm': 3.3409082889556885, 'learning_rate': 3.708510740946314e-05, 'epoch': 9.94}
- 20%|█▉        | 1153/5800 [3:17:26<150:00:00, 116.20s/it]score1 tensor([[0.4727],
-        [0.4355],
-        [0.4375],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5352, 0.5352, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:04:03,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 12:04:03,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.09 | bwd_microstep: 4596.25 | bwd_inner_microstep: 4591.63 | bwd_allreduce_microstep: 4.51 | step_microstep: 42.61
-[2025-01-25 12:04:03,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.05 | bwd: 4596.28 | bwd_inner: 4591.63 | bwd_allreduce: 4.56 | step: 42.61
- 20%|█▉        | 1154/5800 [3:17:33<107:38:20, 83.41s/it]                                                          {'loss': 0.0757, 'grad_norm': 6.893413066864014, 'learning_rate': 3.7079298957778274e-05, 'epoch': 9.95}
- 20%|█▉        | 1154/5800 [3:17:33<107:38:20, 83.41s/it]score1 tensor([[0.4531],
-        [0.4492],
-        [0.4980],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4551, 0.5820, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:04:10,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 12:04:10,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.30 | bwd_microstep: 4589.20 | bwd_inner_microstep: 4582.89 | bwd_allreduce_microstep: 6.20 | step_microstep: 46.71
-[2025-01-25 12:04:10,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.26 | bwd: 4589.22 | bwd_inner: 4582.89 | bwd_allreduce: 6.26 | step: 46.72
- 20%|█▉        | 1155/5800 [3:17:39<77:58:49, 60.44s/it]                                                         {'loss': 0.061, 'grad_norm': 3.3845055103302, 'learning_rate': 3.7073485180471654e-05, 'epoch': 9.96}
- 20%|█▉        | 1155/5800 [3:17:40<77:58:49, 60.44s/it]score1 tensor([[0.4570],
-        [0.4316],
-        [0.5312],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4492, 0.5508, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:04:16,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 12:04:16,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.19 | bwd_microstep: 4604.69 | bwd_inner_microstep: 4600.15 | bwd_allreduce_microstep: 4.45 | step_microstep: 42.16
-[2025-01-25 12:04:16,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.15 | bwd: 4604.72 | bwd_inner: 4600.15 | bwd_allreduce: 4.50 | step: 42.16
- 20%|█▉        | 1156/5800 [3:17:46<57:13:44, 44.36s/it]                                                        {'loss': 0.0381, 'grad_norm': 0.6172711253166199, 'learning_rate': 3.7067666079356096e-05, 'epoch': 9.97}
- 20%|█▉        | 1156/5800 [3:17:46<57:13:44, 44.36s/it]score1 tensor([[0.4434],
-        [0.5977],
-        [0.5898],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5352, 0.4844, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:04:23,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:04:23,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.45 | bwd_microstep: 4595.92 | bwd_inner_microstep: 4591.65 | bwd_allreduce_microstep: 4.19 | step_microstep: 43.10
-[2025-01-25 12:04:23,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.43 | bwd: 4595.94 | bwd_inner: 4591.65 | bwd_allreduce: 4.23 | step: 43.15
- 20%|█▉        | 1157/5800 [3:17:53<42:42:43, 33.12s/it]                                                        {'loss': 0.0781, 'grad_norm': 7.284182548522949, 'learning_rate': 3.7061841656246105e-05, 'epoch': 9.97}
- 20%|█▉        | 1157/5800 [3:17:53<42:42:43, 33.12s/it]score1 tensor([[0.5391],
-        [0.5352],
-        [0.7227],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4219, 0.6562, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:04:30,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 12:04:30,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.87 | bwd_microstep: 4604.88 | bwd_inner_microstep: 4600.34 | bwd_allreduce_microstep: 4.44 | step_microstep: 48.50
-[2025-01-25 12:04:30,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.79 | bwd: 4604.90 | bwd_inner: 4600.34 | bwd_allreduce: 4.49 | step: 48.51
- 20%|█▉        | 1158/5800 [3:18:00<32:33:21, 25.25s/it]                                                        {'loss': 0.0796, 'grad_norm': 8.482718467712402, 'learning_rate': 3.7056011912957836e-05, 'epoch': 9.98}
- 20%|█▉        | 1158/5800 [3:18:00<32:33:21, 25.25s/it]score1 tensor([[0.5352],
-        [0.5859],
-        [0.5977],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5898, 0.5234, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:04:37,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 12:04:37,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.67 | bwd_microstep: 4611.35 | bwd_inner_microstep: 4605.12 | bwd_allreduce_microstep: 6.16 | step_microstep: 42.36
-[2025-01-25 12:04:37,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.64 | bwd: 4611.37 | bwd_inner: 4605.12 | bwd_allreduce: 6.20 | step: 42.37
- 20%|█▉        | 1159/5800 [3:18:07<25:26:37, 19.74s/it]                                                        {'loss': 0.0356, 'grad_norm': 4.2587571144104, 'learning_rate': 3.70501768513091e-05, 'epoch': 9.99}
- 20%|█▉        | 1159/5800 [3:18:07<25:26:37, 19.74s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:04:42,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.29 | optimizer_step: 4.37
-[2025-01-25 12:04:42,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 575.79 | bwd_microstep: 1219.74 | bwd_inner_microstep: 1213.08 | bwd_allreduce_microstep: 6.46 | step_microstep: 38.63
-[2025-01-25 12:04:42,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 575.72 | bwd: 1219.79 | bwd_inner: 1213.08 | bwd_allreduce: 6.57 | step: 38.61
- 20%|██        | 1160/5800 [3:18:12<19:44:25, 15.32s/it]                                                        {'loss': 0.0723, 'grad_norm': 8.657249450683594, 'learning_rate': 3.7044336473119386e-05, 'epoch': 10.0}
- 20%|██        | 1160/5800 [3:18:12<19:44:25, 15.32s/it][2025-01-25 12:04:47,259] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:04:57,490] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:05:07,650] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:05:17,398] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5664],
-        [0.6055],
-        [0.5352],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5625, 0.4375, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:05:32,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 12:05:32,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.91 | bwd_microstep: 4607.85 | bwd_inner_microstep: 4601.97 | bwd_allreduce_microstep: 5.76 | step_microstep: 50.14
-[2025-01-25 12:05:32,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.85 | bwd: 4607.87 | bwd_inner: 4601.97 | bwd_allreduce: 5.83 | step: 50.14
- 20%|██        | 1161/5800 [3:19:02<32:58:50, 25.59s/it]                                                        {'loss': 0.0654, 'grad_norm': 8.223969459533691, 'learning_rate': 3.7038490780209814e-05, 'epoch': 10.01}
- 20%|██        | 1161/5800 [3:19:02<32:58:50, 25.59s/it]score1 tensor([[0.4805],
-        [0.5742],
-        [0.5234],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6484, 0.5508, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:05:38,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 12:05:38,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.60 | bwd_microstep: 4587.18 | bwd_inner_microstep: 4581.89 | bwd_allreduce_microstep: 5.19 | step_microstep: 47.30
-[2025-01-25 12:05:38,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.56 | bwd: 4587.21 | bwd_inner: 4581.89 | bwd_allreduce: 5.25 | step: 47.30
- 20%|██        | 1162/5800 [3:19:08<25:44:01, 19.97s/it]                                                        {'loss': 0.0303, 'grad_norm': 8.536867141723633, 'learning_rate': 3.7032639774403174e-05, 'epoch': 10.02}
- 20%|██        | 1162/5800 [3:19:08<25:44:01, 19.97s/it]score1 tensor([[0.5195],
-        [0.5664],
-        [0.3887],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5664, 0.4922, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:05:45,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 12:05:45,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.74 | bwd_microstep: 4544.11 | bwd_inner_microstep: 4539.73 | bwd_allreduce_microstep: 4.29 | step_microstep: 43.17
-[2025-01-25 12:05:45,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.72 | bwd: 4544.13 | bwd_inner: 4539.73 | bwd_allreduce: 4.33 | step: 43.18
- 20%|██        | 1163/5800 [3:19:15<20:38:21, 16.02s/it]                                                        {'loss': 0.0615, 'grad_norm': 1.8374052047729492, 'learning_rate': 3.70267834575239e-05, 'epoch': 10.03}
- 20%|██        | 1163/5800 [3:19:15<20:38:21, 16.02s/it]score1 tensor([[0.4297],
-        [0.3965],
-        [0.4316],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4980, 0.4551, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:05:52,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 12:05:52,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.34 | bwd_microstep: 4600.85 | bwd_inner_microstep: 4593.58 | bwd_allreduce_microstep: 7.03 | step_microstep: 57.85
-[2025-01-25 12:05:52,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.30 | bwd: 4600.92 | bwd_inner: 4593.58 | bwd_allreduce: 7.15 | step: 57.87
- 20%|██        | 1164/5800 [3:19:22<17:05:53, 13.28s/it]                                                        {'loss': 0.0508, 'grad_norm': 8.359138488769531, 'learning_rate': 3.702092183139811e-05, 'epoch': 10.03}
- 20%|██        | 1164/5800 [3:19:22<17:05:53, 13.28s/it]score1 tensor([[0.5625],
-        [0.4883],
-        [0.4316],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5352, 0.4629, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:05:59,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 12:05:59,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.92 | bwd_microstep: 4609.50 | bwd_inner_microstep: 4604.79 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.60
-[2025-01-25 12:05:59,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.89 | bwd: 4609.53 | bwd_inner: 4604.79 | bwd_allreduce: 4.67 | step: 42.60
- 20%|██        | 1165/5800 [3:19:29<14:37:35, 11.36s/it]                                                        {'loss': 0.0322, 'grad_norm': 4.053049087524414, 'learning_rate': 3.701505489785356e-05, 'epoch': 10.04}
- 20%|██        | 1165/5800 [3:19:29<14:37:35, 11.36s/it]score1 tensor([[0.4590],
-        [0.4766],
-        [0.5859],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4785, 0.5547, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:06,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.66 | optimizer_step: 4.36
-[2025-01-25 12:06:06,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.34 | bwd_microstep: 4616.81 | bwd_inner_microstep: 4610.64 | bwd_allreduce_microstep: 5.98 | step_microstep: 72.28
-[2025-01-25 12:06:06,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.29 | bwd: 4616.86 | bwd_inner: 4610.64 | bwd_allreduce: 6.07 | step: 72.31
- 20%|██        | 1166/5800 [3:19:36<12:55:12, 10.04s/it]                                                        {'loss': 0.0176, 'grad_norm': 0.7991170287132263, 'learning_rate': 3.700918265871964e-05, 'epoch': 10.05}
- 20%|██        | 1166/5800 [3:19:36<12:55:12, 10.04s/it]score1 tensor([[0.6016],
-        [0.5898],
-        [0.5547],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5469, 0.6055, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:13,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 12:06:13,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.62 | bwd_microstep: 4629.43 | bwd_inner_microstep: 4624.72 | bwd_allreduce_microstep: 4.58 | step_microstep: 58.70
-[2025-01-25 12:06:13,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.58 | bwd: 4629.45 | bwd_inner: 4624.72 | bwd_allreduce: 4.65 | step: 58.77
- 20%|██        | 1167/5800 [3:19:43<11:43:13,  9.11s/it]                                                        {'loss': 0.0591, 'grad_norm': 4.033764839172363, 'learning_rate': 3.7003305115827445e-05, 'epoch': 10.06}
- 20%|██        | 1167/5800 [3:19:43<11:43:13,  9.11s/it]score1 tensor([[0.6016],
-        [0.5352],
-        [0.5430],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4355, 0.5391, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:20,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 12:06:20,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.58 | bwd_microstep: 4615.72 | bwd_inner_microstep: 4610.75 | bwd_allreduce_microstep: 4.88 | step_microstep: 41.70
-[2025-01-25 12:06:20,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.51 | bwd: 4615.75 | bwd_inner: 4610.75 | bwd_allreduce: 4.93 | step: 41.71
- 20%|██        | 1168/5800 [3:19:50<10:52:00,  8.45s/it]                                                        {'loss': 0.04, 'grad_norm': 4.118961334228516, 'learning_rate': 3.699742227100968e-05, 'epoch': 10.07}
- 20%|██        | 1168/5800 [3:19:50<10:52:00,  8.45s/it]score1 tensor([[0.4277],
-        [0.4668],
-        [0.4316],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.4004, 0.4004, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:27,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 12:06:27,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.84 | bwd_microstep: 4633.44 | bwd_inner_microstep: 4625.75 | bwd_allreduce_microstep: 7.46 | step_microstep: 48.19
-[2025-01-25 12:06:27,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.81 | bwd: 4633.51 | bwd_inner: 4625.75 | bwd_allreduce: 7.58 | step: 48.16
- 20%|██        | 1169/5800 [3:19:57<10:16:23,  7.99s/it]                                                        {'loss': 0.084, 'grad_norm': 6.77643346786499, 'learning_rate': 3.699153412610072e-05, 'epoch': 10.08}
- 20%|██        | 1169/5800 [3:19:57<10:16:23,  7.99s/it]score1 tensor([[0.4746],
-        [0.5078],
-        [0.4766],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4805, 0.4297, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:34,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:06:34,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.07 | bwd_microstep: 4620.77 | bwd_inner_microstep: 4615.78 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.51
-[2025-01-25 12:06:34,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.04 | bwd: 4620.79 | bwd_inner: 4615.78 | bwd_allreduce: 4.94 | step: 43.53
- 20%|██        | 1170/5800 [3:20:04<9:50:52,  7.66s/it]                                                        {'loss': 0.0615, 'grad_norm': 3.4362151622772217, 'learning_rate': 3.6985640682936594e-05, 'epoch': 10.09}
- 20%|██        | 1170/5800 [3:20:04<9:50:52,  7.66s/it]score1 tensor([[0.5586],
-        [0.4746],
-        [0.5977],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4766, 0.6133, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:41,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 12:06:41,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.67 | bwd_microstep: 4614.96 | bwd_inner_microstep: 4609.39 | bwd_allreduce_microstep: 5.44 | step_microstep: 44.39
-[2025-01-25 12:06:41,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.63 | bwd: 4614.99 | bwd_inner: 4609.39 | bwd_allreduce: 5.50 | step: 44.39
- 20%|██        | 1171/5800 [3:20:10<9:32:47,  7.42s/it]                                                       {'loss': 0.0093, 'grad_norm': 0.7338117361068726, 'learning_rate': 3.6979741943354966e-05, 'epoch': 10.09}
- 20%|██        | 1171/5800 [3:20:10<9:32:47,  7.42s/it]score1 tensor([[0.4785],
-        [0.5039],
-        [0.4062],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4609, 0.3457, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:47,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:06:47,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.74 | bwd_microstep: 4626.85 | bwd_inner_microstep: 4622.11 | bwd_allreduce_microstep: 4.60 | step_microstep: 48.63
-[2025-01-25 12:06:47,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.70 | bwd: 4626.88 | bwd_inner: 4622.11 | bwd_allreduce: 4.68 | step: 48.63
- 20%|██        | 1172/5800 [3:20:17<9:20:44,  7.27s/it]                                                       {'loss': 0.0439, 'grad_norm': 7.598629951477051, 'learning_rate': 3.697383790919519e-05, 'epoch': 10.1}
- 20%|██        | 1172/5800 [3:20:17<9:20:44,  7.27s/it]score1 tensor([[0.5664],
-        [0.4043],
-        [0.4336],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.4883, 0.4473, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:06:54,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 12:06:54,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.66 | bwd_microstep: 4619.70 | bwd_inner_microstep: 4615.11 | bwd_allreduce_microstep: 4.49 | step_microstep: 44.35
-[2025-01-25 12:06:54,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.62 | bwd: 4619.72 | bwd_inner: 4615.11 | bwd_allreduce: 4.54 | step: 44.35
- 20%|██        | 1173/5800 [3:20:24<9:12:00,  7.16s/it]                                                       {'loss': 0.082, 'grad_norm': 7.541567325592041, 'learning_rate': 3.696792858229822e-05, 'epoch': 10.11}
- 20%|██        | 1173/5800 [3:20:24<9:12:00,  7.16s/it]score1 tensor([[0.4277],
-        [0.4609],
-        [0.4688],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5195, 0.4883, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:01,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.36
-[2025-01-25 12:07:01,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.04 | bwd_microstep: 4619.99 | bwd_inner_microstep: 4615.55 | bwd_allreduce_microstep: 4.36 | step_microstep: 36.72
-[2025-01-25 12:07:01,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.01 | bwd: 4620.00 | bwd_inner: 4615.55 | bwd_allreduce: 4.40 | step: 36.72
- 20%|██        | 1174/5800 [3:20:31<9:05:27,  7.07s/it]                                                       {'loss': 0.0649, 'grad_norm': 7.943951606750488, 'learning_rate': 3.6962013964506705e-05, 'epoch': 10.12}
- 20%|██        | 1174/5800 [3:20:31<9:05:27,  7.07s/it]score1 tensor([[0.5391],
-        [0.4629],
-        [0.4258],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.5156, 0.4512, 0.3711], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:08,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 12:07:08,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.97 | bwd_microstep: 4628.47 | bwd_inner_microstep: 4624.00 | bwd_allreduce_microstep: 4.38 | step_microstep: 53.00
-[2025-01-25 12:07:08,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.94 | bwd: 4628.49 | bwd_inner: 4624.00 | bwd_allreduce: 4.42 | step: 53.01
- 20%|██        | 1175/5800 [3:20:38<9:01:56,  7.03s/it]                                                       {'loss': 0.0386, 'grad_norm': 7.368718147277832, 'learning_rate': 3.695609405766491e-05, 'epoch': 10.13}
- 20%|██        | 1175/5800 [3:20:38<9:01:56,  7.03s/it]score1 tensor([[0.5117],
-        [0.4863],
-        [0.4004],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5625, 0.4570, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:15,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 12:07:15,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.73 | bwd_microstep: 4641.32 | bwd_inner_microstep: 4634.17 | bwd_allreduce_microstep: 6.87 | step_microstep: 50.66
-[2025-01-25 12:07:15,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.69 | bwd: 4641.46 | bwd_inner: 4634.17 | bwd_allreduce: 7.01 | step: 50.68
- 20%|██        | 1176/5800 [3:20:45<9:00:35,  7.01s/it]                                                       {'loss': 0.0513, 'grad_norm': 3.2722108364105225, 'learning_rate': 3.6950168863618784e-05, 'epoch': 10.14}
- 20%|██        | 1176/5800 [3:20:45<9:00:35,  7.01s/it]score1 tensor([[0.6406],
-        [0.3965],
-        [0.5703],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.4082, 0.6289, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:22,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.37
-[2025-01-25 12:07:22,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.81 | bwd_microstep: 4639.94 | bwd_inner_microstep: 4634.50 | bwd_allreduce_microstep: 5.36 | step_microstep: 46.20
-[2025-01-25 12:07:22,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.68 | bwd: 4639.97 | bwd_inner: 4634.50 | bwd_allreduce: 5.40 | step: 46.20
- 20%|██        | 1177/5800 [3:20:52<8:58:59,  7.00s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.726600646972656, 'learning_rate': 3.6944238384215895e-05, 'epoch': 10.15}
- 20%|██        | 1177/5800 [3:20:52<8:58:59,  7.00s/it]score1 tensor([[0.4473],
-        [0.5000],
-        [0.4297],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.5078, 0.5391, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:29,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 12:07:29,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.33 | bwd_microstep: 4626.64 | bwd_inner_microstep: 4617.65 | bwd_allreduce_microstep: 8.78 | step_microstep: 65.56
-[2025-01-25 12:07:29,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.29 | bwd: 4626.70 | bwd_inner: 4617.65 | bwd_allreduce: 8.89 | step: 65.53
- 20%|██        | 1178/5800 [3:20:59<8:57:47,  6.98s/it]                                                       {'loss': 0.042, 'grad_norm': 3.439448595046997, 'learning_rate': 3.6938302621305474e-05, 'epoch': 10.16}
- 20%|██        | 1178/5800 [3:20:59<8:57:47,  6.98s/it]score1 tensor([[0.4512],
-        [0.4688],
-        [0.5586],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4668, 0.4004, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:36,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 12:07:36,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.92 | bwd_microstep: 4631.11 | bwd_inner_microstep: 4626.47 | bwd_allreduce_microstep: 4.51 | step_microstep: 43.19
-[2025-01-25 12:07:36,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.88 | bwd: 4631.14 | bwd_inner: 4626.47 | bwd_allreduce: 4.59 | step: 43.19
- 20%|██        | 1179/5800 [3:21:06<8:56:35,  6.97s/it]                                                       {'loss': 0.0796, 'grad_norm': 3.5793986320495605, 'learning_rate': 3.693236157673838e-05, 'epoch': 10.16}
- 20%|██        | 1179/5800 [3:21:06<8:56:35,  6.97s/it]score1 tensor([[0.4980],
-        [0.5898],
-        [0.4668],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.4941, 0.4922, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:43,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.29 | optimizer_step: 4.37
-[2025-01-25 12:07:43,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.09 | bwd_microstep: 4619.62 | bwd_inner_microstep: 4615.67 | bwd_allreduce_microstep: 3.88 | step_microstep: 34.63
-[2025-01-25 12:07:43,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.05 | bwd: 4619.63 | bwd_inner: 4615.67 | bwd_allreduce: 3.91 | step: 34.64
- 20%|██        | 1180/5800 [3:21:13<8:54:32,  6.94s/it]                                                       {'loss': 0.0845, 'grad_norm': 3.9650893211364746, 'learning_rate': 3.692641525236715e-05, 'epoch': 10.17}
- 20%|██        | 1180/5800 [3:21:13<8:54:32,  6.94s/it]score1 tensor([[0.5508],
-        [0.4570],
-        [0.5977],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4277, 0.4844, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:50,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 12:07:50,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.37 | bwd_microstep: 4622.90 | bwd_inner_microstep: 4617.27 | bwd_allreduce_microstep: 5.53 | step_microstep: 47.51
-[2025-01-25 12:07:50,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.35 | bwd: 4622.93 | bwd_inner: 4617.27 | bwd_allreduce: 5.58 | step: 47.52
- 20%|██        | 1181/5800 [3:21:20<8:53:34,  6.93s/it]                                                       {'loss': 0.0581, 'grad_norm': 7.498727798461914, 'learning_rate': 3.692046365004594e-05, 'epoch': 10.18}
- 20%|██        | 1181/5800 [3:21:20<8:53:34,  6.93s/it]score1 tensor([[0.6172],
-        [0.4492],
-        [0.5547],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4453, 0.5156, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:07:57,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 12:07:57,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.43 | bwd_microstep: 4624.06 | bwd_inner_microstep: 4618.94 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.78
-[2025-01-25 12:07:57,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.36 | bwd: 4624.08 | bwd_inner: 4618.94 | bwd_allreduce: 5.08 | step: 45.78
- 20%|██        | 1182/5800 [3:21:27<8:53:13,  6.93s/it]                                                       {'loss': 0.0288, 'grad_norm': 3.499589681625366, 'learning_rate': 3.691450677163057e-05, 'epoch': 10.19}
- 20%|██        | 1182/5800 [3:21:27<8:53:13,  6.93s/it]score1 tensor([[0.5742],
-        [0.5352],
-        [0.4824],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5117, 0.4824, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:03,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 12:08:04,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.21 | bwd_microstep: 4588.97 | bwd_inner_microstep: 4584.22 | bwd_allreduce_microstep: 4.67 | step_microstep: 44.37
-[2025-01-25 12:08:04,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.18 | bwd: 4588.99 | bwd_inner: 4584.22 | bwd_allreduce: 4.71 | step: 44.38
- 20%|██        | 1183/5800 [3:21:33<8:51:38,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 5.745931625366211, 'learning_rate': 3.69085446189785e-05, 'epoch': 10.2}
- 20%|██        | 1183/5800 [3:21:33<8:51:38,  6.91s/it]score1 tensor([[0.4668],
-        [0.6758],
-        [0.5820],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6523, 0.6289, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:10,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.66 | optimizer_step: 4.36
-[2025-01-25 12:08:10,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.26 | bwd_microstep: 4627.17 | bwd_inner_microstep: 4620.68 | bwd_allreduce_microstep: 6.41 | step_microstep: 67.64
-[2025-01-25 12:08:10,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.23 | bwd: 4627.20 | bwd_inner: 4620.68 | bwd_allreduce: 6.45 | step: 67.69
- 20%|██        | 1184/5800 [3:21:40<8:52:10,  6.92s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.165625810623169, 'learning_rate': 3.690257719394883e-05, 'epoch': 10.21}
- 20%|██        | 1184/5800 [3:21:40<8:52:10,  6.92s/it]score1 tensor([[0.3789],
-        [0.4043],
-        [0.6250],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4316, 0.6094, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:17,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.87 | optimizer_step: 4.36
-[2025-01-25 12:08:17,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.92 | bwd_microstep: 4625.16 | bwd_inner_microstep: 4618.06 | bwd_allreduce_microstep: 6.94 | step_microstep: 65.12
-[2025-01-25 12:08:17,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.88 | bwd: 4625.21 | bwd_inner: 4618.06 | bwd_allreduce: 7.02 | step: 65.10
- 20%|██        | 1185/5800 [3:21:47<8:52:55,  6.93s/it]                                                       {'loss': 0.0327, 'grad_norm': 3.263784170150757, 'learning_rate': 3.6896604498402303e-05, 'epoch': 10.22}
- 20%|██        | 1185/5800 [3:21:47<8:52:55,  6.93s/it]score1 tensor([[0.4863],
-        [0.4316],
-        [0.4336],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4473, 0.4023, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:24,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:08:24,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.61 | bwd_microstep: 4622.21 | bwd_inner_microstep: 4617.24 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.81
-[2025-01-25 12:08:24,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.56 | bwd: 4622.24 | bwd_inner: 4617.24 | bwd_allreduce: 4.92 | step: 43.81
- 20%|██        | 1186/5800 [3:21:54<8:52:41,  6.93s/it]                                                       {'loss': 0.0259, 'grad_norm': 0.6245892643928528, 'learning_rate': 3.689062653420132e-05, 'epoch': 10.22}
- 20%|██        | 1186/5800 [3:21:54<8:52:41,  6.93s/it]score1 tensor([[0.4453],
-        [0.5664],
-        [0.5898],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5664, 0.5273, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:31,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 12:08:31,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.11 | bwd_microstep: 4575.97 | bwd_inner_microstep: 4570.99 | bwd_allreduce_microstep: 4.89 | step_microstep: 45.51
-[2025-01-25 12:08:31,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4575.99 | bwd_inner: 4570.99 | bwd_allreduce: 4.93 | step: 45.52
- 20%|██        | 1187/5800 [3:22:01<8:50:59,  6.91s/it]                                                       {'loss': 0.0327, 'grad_norm': 2.3834314346313477, 'learning_rate': 3.688464330320989e-05, 'epoch': 10.23}
- 20%|██        | 1187/5800 [3:22:01<8:50:59,  6.91s/it]score1 tensor([[0.4980],
-        [0.4629],
-        [0.4219],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5117, 0.4473, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:38,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 12:08:38,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.59 | bwd_microstep: 4625.57 | bwd_inner_microstep: 4619.88 | bwd_allreduce_microstep: 5.56 | step_microstep: 49.45
-[2025-01-25 12:08:38,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.54 | bwd: 4625.58 | bwd_inner: 4619.88 | bwd_allreduce: 5.64 | step: 49.46
- 20%|██        | 1188/5800 [3:22:08<8:51:11,  6.91s/it]                                                       {'loss': 0.0405, 'grad_norm': 7.096670627593994, 'learning_rate': 3.687865480729371e-05, 'epoch': 10.24}
- 20%|██        | 1188/5800 [3:22:08<8:51:11,  6.91s/it]score1 tensor([[0.5664],
-        [0.4336],
-        [0.4941],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4961, 0.4961, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:45,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 12:08:45,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.89 | bwd_microstep: 4643.67 | bwd_inner_microstep: 4638.49 | bwd_allreduce_microstep: 5.05 | step_microstep: 45.26
-[2025-01-25 12:08:45,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.85 | bwd: 4643.69 | bwd_inner: 4638.49 | bwd_allreduce: 5.12 | step: 45.27
- 20%|██        | 1189/5800 [3:22:15<8:51:45,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.027939319610596, 'learning_rate': 3.6872661048320096e-05, 'epoch': 10.25}
- 20%|██        | 1189/5800 [3:22:15<8:51:45,  6.92s/it]score1 tensor([[0.4844],
-        [0.4375],
-        [0.4844],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4180, 0.4648, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:52,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 12:08:52,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.97 | bwd_microstep: 4654.74 | bwd_inner_microstep: 4649.57 | bwd_allreduce_microstep: 5.05 | step_microstep: 43.39
-[2025-01-25 12:08:52,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.92 | bwd: 4654.76 | bwd_inner: 4649.57 | bwd_allreduce: 5.11 | step: 43.39
- 21%|██        | 1190/5800 [3:22:22<8:52:13,  6.93s/it]                                                       {'loss': 0.0127, 'grad_norm': 0.8172531723976135, 'learning_rate': 3.6866662028158e-05, 'epoch': 10.26}
- 21%|██        | 1190/5800 [3:22:22<8:52:13,  6.93s/it]score1 tensor([[0.4766],
-        [0.4707],
-        [0.3945],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5000, 0.3613, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:08:59,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 12:08:59,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.05 | bwd_microstep: 4647.03 | bwd_inner_microstep: 4642.23 | bwd_allreduce_microstep: 4.72 | step_microstep: 50.73
-[2025-01-25 12:08:59,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.01 | bwd: 4647.05 | bwd_inner: 4642.23 | bwd_allreduce: 4.76 | step: 50.74
- 21%|██        | 1191/5800 [3:22:29<8:52:24,  6.93s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.6171064972877502, 'learning_rate': 3.6860657748678016e-05, 'epoch': 10.27}
- 21%|██        | 1191/5800 [3:22:29<8:52:24,  6.93s/it]score1 tensor([[0.5508],
-        [0.4414],
-        [0.4629],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4082, 0.4844, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:06,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 12:09:06,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.89 | bwd_microstep: 4650.16 | bwd_inner_microstep: 4644.29 | bwd_allreduce_microstep: 5.77 | step_microstep: 55.19
-[2025-01-25 12:09:06,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.86 | bwd: 4650.18 | bwd_inner: 4644.29 | bwd_allreduce: 5.82 | step: 55.20
- 21%|██        | 1192/5800 [3:22:36<8:52:39,  6.94s/it]                                                       {'loss': 0.0386, 'grad_norm': 3.9380502700805664, 'learning_rate': 3.685464821175239e-05, 'epoch': 10.28}
- 21%|██        | 1192/5800 [3:22:36<8:52:39,  6.94s/it]score1 tensor([[0.4648],
-        [0.4863],
-        [0.5078],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4141, 0.5078, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:13,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.64 | optimizer_step: 4.37
-[2025-01-25 12:09:13,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.15 | bwd_microstep: 4599.57 | bwd_inner_microstep: 4594.07 | bwd_allreduce_microstep: 5.30 | step_microstep: 56.23
-[2025-01-25 12:09:13,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.12 | bwd: 4599.59 | bwd_inner: 4594.07 | bwd_allreduce: 5.43 | step: 56.28
- 21%|██        | 1193/5800 [3:22:43<8:52:17,  6.93s/it]                                                       {'loss': 0.0278, 'grad_norm': 1.966986894607544, 'learning_rate': 3.6848633419255e-05, 'epoch': 10.28}
- 21%|██        | 1193/5800 [3:22:43<8:52:17,  6.93s/it]score1 tensor([[0.4805],
-        [0.4805],
-        [0.5195],
-        [0.3125]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4512, 0.5977, 0.3086], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:20,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 12:09:20,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.04 | bwd_microstep: 4652.55 | bwd_inner_microstep: 4644.33 | bwd_allreduce_microstep: 8.02 | step_microstep: 48.72
-[2025-01-25 12:09:20,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.90 | bwd: 4652.60 | bwd_inner: 4644.33 | bwd_allreduce: 8.14 | step: 48.70
- 21%|██        | 1194/5800 [3:22:50<8:52:38,  6.94s/it]                                                       {'loss': 0.0312, 'grad_norm': 0.9278971552848816, 'learning_rate': 3.6842613373061365e-05, 'epoch': 10.29}
- 21%|██        | 1194/5800 [3:22:50<8:52:38,  6.94s/it]score1 tensor([[0.5195],
-        [0.4570],
-        [0.5000],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4941, 0.5430, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:27,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 12:09:27,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.61 | bwd_microstep: 4592.93 | bwd_inner_microstep: 4585.43 | bwd_allreduce_microstep: 7.25 | step_microstep: 69.98
-[2025-01-25 12:09:27,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.54 | bwd: 4592.99 | bwd_inner: 4585.42 | bwd_allreduce: 7.38 | step: 70.00
- 21%|██        | 1195/5800 [3:22:57<8:52:43,  6.94s/it]                                                       {'loss': 0.0215, 'grad_norm': 5.334711074829102, 'learning_rate': 3.683658807504863e-05, 'epoch': 10.3}
- 21%|██        | 1195/5800 [3:22:57<8:52:43,  6.94s/it]score1 tensor([[0.5312],
-        [0.4609],
-        [0.4492],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4551, 0.4570, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:34,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 12:09:34,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.76 | bwd_microstep: 4646.70 | bwd_inner_microstep: 4642.21 | bwd_allreduce_microstep: 4.42 | step_microstep: 40.71
-[2025-01-25 12:09:34,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.73 | bwd: 4646.72 | bwd_inner: 4642.21 | bwd_allreduce: 4.45 | step: 40.72
- 21%|██        | 1196/5800 [3:23:04<8:52:25,  6.94s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.983029365539551, 'learning_rate': 3.683055752709559e-05, 'epoch': 10.31}
- 21%|██        | 1196/5800 [3:23:04<8:52:25,  6.94s/it]score1 tensor([[0.5195],
-        [0.4414],
-        [0.4297],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4336, 0.3945, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:41,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.37
-[2025-01-25 12:09:41,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.88 | bwd_microstep: 4648.42 | bwd_inner_microstep: 4643.78 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.11
-[2025-01-25 12:09:41,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.84 | bwd: 4648.44 | bwd_inner: 4643.78 | bwd_allreduce: 4.60 | step: 42.12
- 21%|██        | 1197/5800 [3:23:11<8:51:58,  6.93s/it]                                                       {'loss': 0.0303, 'grad_norm': 0.6933881044387817, 'learning_rate': 3.682452173108268e-05, 'epoch': 10.32}
- 21%|██        | 1197/5800 [3:23:11<8:51:58,  6.93s/it]score1 tensor([[0.6328],
-        [0.6406],
-        [0.4746],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6172, 0.4785, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:47,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 12:09:47,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.22 | bwd_microstep: 4650.49 | bwd_inner_microstep: 4646.08 | bwd_allreduce_microstep: 4.34 | step_microstep: 38.99
-[2025-01-25 12:09:47,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.17 | bwd: 4650.51 | bwd_inner: 4646.08 | bwd_allreduce: 4.37 | step: 39.00
- 21%|██        | 1198/5800 [3:23:17<8:51:41,  6.93s/it]                                                       {'loss': 0.0234, 'grad_norm': 1.4720319509506226, 'learning_rate': 3.681848068889196e-05, 'epoch': 10.33}
- 21%|██        | 1198/5800 [3:23:17<8:51:41,  6.93s/it]score1 tensor([[0.6328],
-        [0.4551],
-        [0.4902],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4121, 0.4961, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:09:54,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 12:09:54,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.55 | bwd_microstep: 4646.69 | bwd_inner_microstep: 4641.40 | bwd_allreduce_microstep: 5.21 | step_microstep: 57.90
-[2025-01-25 12:09:54,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.52 | bwd: 4646.72 | bwd_inner: 4641.40 | bwd_allreduce: 5.25 | step: 57.91
- 21%|██        | 1199/5800 [3:23:24<8:51:56,  6.94s/it]                                                       {'loss': 0.0244, 'grad_norm': 0.8175918459892273, 'learning_rate': 3.6812434402407145e-05, 'epoch': 10.34}
- 21%|██        | 1199/5800 [3:23:24<8:51:56,  6.94s/it]score1 tensor([[0.5039],
-        [0.5820],
-        [0.4688],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5820, 0.4707, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:01,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 12:10:01,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.36 | bwd_microstep: 4601.56 | bwd_inner_microstep: 4594.97 | bwd_allreduce_microstep: 6.50 | step_microstep: 47.26
-[2025-01-25 12:10:01,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.33 | bwd: 4601.58 | bwd_inner: 4594.97 | bwd_allreduce: 6.55 | step: 47.26
- 21%|██        | 1200/5800 [3:23:31<8:50:51,  6.92s/it]                                                       {'loss': 0.0181, 'grad_norm': 5.741109848022461, 'learning_rate': 3.680638287351355e-05, 'epoch': 10.34}
- 21%|██        | 1200/5800 [3:23:31<8:50:51,  6.92s/it]score1 tensor([[0.5234],
-        [0.5195],
-        [0.6641],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.5039, 0.6445, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:08,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 12:10:08,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.30 | bwd_microstep: 4657.40 | bwd_inner_microstep: 4652.91 | bwd_allreduce_microstep: 4.39 | step_microstep: 40.99
-[2025-01-25 12:10:08,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.28 | bwd: 4657.42 | bwd_inner: 4652.91 | bwd_allreduce: 4.44 | step: 41.00
- 21%|██        | 1201/5800 [3:23:38<8:51:10,  6.93s/it]                                                       {'loss': 0.0537, 'grad_norm': 8.182930946350098, 'learning_rate': 3.680032610409817e-05, 'epoch': 10.35}
- 21%|██        | 1201/5800 [3:23:38<8:51:10,  6.93s/it]score1 tensor([[0.4277],
-        [0.6133],
-        [0.4902],
-        [0.3027]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.5430, 0.3984, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:15,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 12:10:15,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.36 | bwd_microstep: 4647.82 | bwd_inner_microstep: 4643.21 | bwd_allreduce_microstep: 4.52 | step_microstep: 46.94
-[2025-01-25 12:10:15,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.30 | bwd: 4647.84 | bwd_inner: 4643.21 | bwd_allreduce: 4.57 | step: 46.95
- 21%|██        | 1202/5800 [3:23:45<8:51:07,  6.93s/it]                                                       {'loss': 0.0444, 'grad_norm': 4.491299152374268, 'learning_rate': 3.67942640960496e-05, 'epoch': 10.36}
- 21%|██        | 1202/5800 [3:23:45<8:51:07,  6.93s/it]score1 tensor([[0.5039],
-        [0.5898],
-        [0.5469],
-        [0.3008]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5664, 0.5195, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:22,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 12:10:22,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.38 | bwd_microstep: 4650.01 | bwd_inner_microstep: 4644.81 | bwd_allreduce_microstep: 5.10 | step_microstep: 58.31
-[2025-01-25 12:10:22,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.34 | bwd: 4650.03 | bwd_inner: 4644.81 | bwd_allreduce: 5.15 | step: 58.33
- 21%|██        | 1203/5800 [3:23:52<8:51:30,  6.94s/it]                                                       {'loss': 0.0327, 'grad_norm': 4.67891788482666, 'learning_rate': 3.678819685125807e-05, 'epoch': 10.37}
- 21%|██        | 1203/5800 [3:23:52<8:51:30,  6.94s/it]score1 tensor([[0.5742],
-        [0.5000],
-        [0.5352],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4043, 0.5508, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:29,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.47 | optimizer_step: 4.45
-[2025-01-25 12:10:29,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.60 | bwd_microstep: 4648.38 | bwd_inner_microstep: 4641.06 | bwd_allreduce_microstep: 7.11 | step_microstep: 61.89
-[2025-01-25 12:10:29,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.51 | bwd: 4648.43 | bwd_inner: 4641.06 | bwd_allreduce: 7.22 | step: 61.88
- 21%|██        | 1204/5800 [3:23:59<8:51:54,  6.94s/it]                                                       {'loss': 0.0332, 'grad_norm': 0.7655556201934814, 'learning_rate': 3.6782124371615465e-05, 'epoch': 10.38}
- 21%|██        | 1204/5800 [3:23:59<8:51:54,  6.94s/it]score1 tensor([[0.4922],
-        [0.4648],
-        [0.5781],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4863, 0.6211, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:36,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 12:10:36,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.97 | bwd_microstep: 4661.77 | bwd_inner_microstep: 4656.33 | bwd_allreduce_microstep: 5.33 | step_microstep: 48.93
-[2025-01-25 12:10:36,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.93 | bwd: 4661.79 | bwd_inner: 4656.33 | bwd_allreduce: 5.39 | step: 48.93
- 21%|██        | 1205/5800 [3:24:06<8:52:46,  6.96s/it]                                                       {'loss': 0.0552, 'grad_norm': 0.6812517046928406, 'learning_rate': 3.677604665901529e-05, 'epoch': 10.39}
- 21%|██        | 1205/5800 [3:24:06<8:52:46,  6.96s/it]score1 tensor([[0.5469],
-        [0.4414],
-        [0.5039],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4453, 0.5352, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:43,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 12:10:43,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.29 | bwd_microstep: 4651.62 | bwd_inner_microstep: 4646.27 | bwd_allreduce_microstep: 5.27 | step_microstep: 44.31
-[2025-01-25 12:10:43,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.18 | bwd: 4651.64 | bwd_inner: 4646.27 | bwd_allreduce: 5.30 | step: 44.32
- 21%|██        | 1206/5800 [3:24:13<8:52:59,  6.96s/it]                                                       {'loss': 0.0308, 'grad_norm': 3.668241500854492, 'learning_rate': 3.676996371535268e-05, 'epoch': 10.4}
- 21%|██        | 1206/5800 [3:24:13<8:52:59,  6.96s/it]score1 tensor([[0.3555],
-        [0.2617],
-        [0.4727],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.3457, 0.5039, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:50,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 12:10:50,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.54 | bwd_microstep: 4648.48 | bwd_inner_microstep: 4643.81 | bwd_allreduce_microstep: 4.57 | step_microstep: 44.67
-[2025-01-25 12:10:50,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.51 | bwd: 4648.51 | bwd_inner: 4643.81 | bwd_allreduce: 4.61 | step: 44.67
- 21%|██        | 1207/5800 [3:24:20<8:52:23,  6.95s/it]                                                       {'loss': 0.0479, 'grad_norm': 7.2841572761535645, 'learning_rate': 3.676387554252439e-05, 'epoch': 10.41}
- 21%|██        | 1207/5800 [3:24:20<8:52:23,  6.95s/it]score1 tensor([[0.5156],
-        [0.3340],
-        [0.5117],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.3652, 0.5352, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:10:57,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 12:10:57,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.38 | bwd_microstep: 4648.62 | bwd_inner_microstep: 4643.56 | bwd_allreduce_microstep: 4.98 | step_microstep: 46.98
-[2025-01-25 12:10:57,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.35 | bwd: 4648.64 | bwd_inner: 4643.56 | bwd_allreduce: 5.02 | step: 46.98
- 21%|██        | 1208/5800 [3:24:27<8:52:07,  6.95s/it]                                                       {'loss': 0.043, 'grad_norm': 3.8089146614074707, 'learning_rate': 3.675778214242883e-05, 'epoch': 10.41}
- 21%|██        | 1208/5800 [3:24:27<8:52:07,  6.95s/it]score1 tensor([[0.5117],
-        [0.4473],
-        [0.5156],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4395, 0.4180, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:04,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 12:11:04,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.63 | bwd_microstep: 4651.55 | bwd_inner_microstep: 4646.68 | bwd_allreduce_microstep: 4.78 | step_microstep: 48.20
-[2025-01-25 12:11:04,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.59 | bwd: 4651.58 | bwd_inner: 4646.68 | bwd_allreduce: 4.83 | step: 48.20
- 21%|██        | 1209/5800 [3:24:34<8:51:48,  6.95s/it]                                                       {'loss': 0.0352, 'grad_norm': 0.654802143573761, 'learning_rate': 3.675168351696602e-05, 'epoch': 10.42}
- 21%|██        | 1209/5800 [3:24:34<8:51:48,  6.95s/it]score1 tensor([[0.5078],
-        [0.6250],
-        [0.6133],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.6875, 0.5742, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:11,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:11:11,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.96 | bwd_microstep: 4649.46 | bwd_inner_microstep: 4644.46 | bwd_allreduce_microstep: 4.90 | step_microstep: 44.93
-[2025-01-25 12:11:11,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.91 | bwd: 4649.49 | bwd_inner: 4644.46 | bwd_allreduce: 4.96 | step: 44.93
- 21%|██        | 1210/5800 [3:24:41<8:51:15,  6.94s/it]                                                       {'loss': 0.0674, 'grad_norm': 4.009251117706299, 'learning_rate': 3.6745579668037625e-05, 'epoch': 10.43}
- 21%|██        | 1210/5800 [3:24:41<8:51:15,  6.94s/it]score1 tensor([[0.5938],
-        [0.4941],
-        [0.5039],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4551, 0.4512, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:18,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 12:11:18,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.84 | bwd_microstep: 4650.66 | bwd_inner_microstep: 4645.16 | bwd_allreduce_microstep: 5.39 | step_microstep: 51.18
-[2025-01-25 12:11:18,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.81 | bwd: 4650.69 | bwd_inner: 4645.16 | bwd_allreduce: 5.45 | step: 51.20
- 21%|██        | 1211/5800 [3:24:48<8:51:10,  6.94s/it]                                                       {'loss': 0.0396, 'grad_norm': 4.012576103210449, 'learning_rate': 3.6739470597546935e-05, 'epoch': 10.44}
- 21%|██        | 1211/5800 [3:24:48<8:51:10,  6.94s/it]score1 tensor([[0.5664],
-        [0.5859],
-        [0.5898],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5508, 0.5273, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:25,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 12:11:25,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.20 | bwd_microstep: 4651.26 | bwd_inner_microstep: 4645.97 | bwd_allreduce_microstep: 5.18 | step_microstep: 47.33
-[2025-01-25 12:11:25,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.15 | bwd: 4651.29 | bwd_inner: 4645.97 | bwd_allreduce: 5.23 | step: 47.33
- 21%|██        | 1212/5800 [3:24:55<8:51:18,  6.95s/it]                                                       {'loss': 0.0425, 'grad_norm': 8.570745468139648, 'learning_rate': 3.673335630739885e-05, 'epoch': 10.45}
- 21%|██        | 1212/5800 [3:24:55<8:51:18,  6.95s/it]score1 tensor([[0.6133],
-        [0.5117],
-        [0.5039],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4668, 0.5078, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:32,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 12:11:32,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.66 | bwd_microstep: 4643.56 | bwd_inner_microstep: 4636.94 | bwd_allreduce_microstep: 6.48 | step_microstep: 50.54
-[2025-01-25 12:11:32,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.63 | bwd: 4643.58 | bwd_inner: 4636.94 | bwd_allreduce: 6.56 | step: 50.55
- 21%|██        | 1213/5800 [3:25:02<8:50:50,  6.94s/it]                                                       {'loss': 0.0259, 'grad_norm': 4.467371940612793, 'learning_rate': 3.672723679949993e-05, 'epoch': 10.46}
- 21%|██        | 1213/5800 [3:25:02<8:50:50,  6.94s/it]score1 tensor([[0.4766],
-        [0.5117],
-        [0.5430],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4805, 0.5078, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:39,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.81 | optimizer_step: 4.37
-[2025-01-25 12:11:39,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.77 | bwd_microstep: 4643.07 | bwd_inner_microstep: 4638.20 | bwd_allreduce_microstep: 4.79 | step_microstep: 57.43
-[2025-01-25 12:11:39,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.72 | bwd: 4643.10 | bwd_inner: 4638.20 | bwd_allreduce: 4.83 | step: 57.44
- 21%|██        | 1214/5800 [3:25:09<8:51:21,  6.95s/it]                                                       {'loss': 0.0293, 'grad_norm': 4.04142951965332, 'learning_rate': 3.6721112075758325e-05, 'epoch': 10.47}
- 21%|██        | 1214/5800 [3:25:09<8:51:21,  6.95s/it]score1 tensor([[0.3418],
-        [0.3926],
-        [0.5156],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.4043, 0.5938, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:46,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 12:11:46,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.00 | bwd_microstep: 4651.17 | bwd_inner_microstep: 4642.43 | bwd_allreduce_microstep: 8.46 | step_microstep: 52.51
-[2025-01-25 12:11:46,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.89 | bwd: 4651.22 | bwd_inner: 4642.43 | bwd_allreduce: 8.63 | step: 52.50
- 21%|██        | 1215/5800 [3:25:16<8:51:42,  6.96s/it]                                                       {'loss': 0.0493, 'grad_norm': 3.7442715167999268, 'learning_rate': 3.671498213808384e-05, 'epoch': 10.47}
- 21%|██        | 1215/5800 [3:25:16<8:51:42,  6.96s/it]score1 tensor([[0.4219],
-        [0.4707],
-        [0.5625],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5391, 0.6133, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0835, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:53,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 12:11:53,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.48 | bwd_microstep: 4644.50 | bwd_inner_microstep: 4638.34 | bwd_allreduce_microstep: 6.03 | step_microstep: 47.47
-[2025-01-25 12:11:53,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.43 | bwd: 4644.53 | bwd_inner: 4638.34 | bwd_allreduce: 6.10 | step: 47.48
- 21%|██        | 1216/5800 [3:25:23<8:51:27,  6.96s/it]                                                       {'loss': 0.0835, 'grad_norm': 8.220194816589355, 'learning_rate': 3.670884698838789e-05, 'epoch': 10.48}
- 21%|██        | 1216/5800 [3:25:23<8:51:27,  6.96s/it]score1 tensor([[0.4258],
-        [0.5273],
-        [0.3906],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6641, 0.4648, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0791, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:11:59,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:11:59,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.21 | bwd_microstep: 4607.22 | bwd_inner_microstep: 4602.62 | bwd_allreduce_microstep: 4.51 | step_microstep: 44.96
-[2025-01-25 12:11:59,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.17 | bwd: 4607.24 | bwd_inner: 4602.62 | bwd_allreduce: 4.55 | step: 44.96
- 21%|██        | 1217/5800 [3:25:29<8:50:06,  6.94s/it]                                                       {'loss': 0.0791, 'grad_norm': 5.851864814758301, 'learning_rate': 3.670270662858353e-05, 'epoch': 10.49}
- 21%|██        | 1217/5800 [3:25:29<8:50:06,  6.94s/it]score1 tensor([[0.4688],
-        [0.4375],
-        [0.5078],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4531, 0.5352, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:06,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 12:12:06,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.27 | bwd_microstep: 4652.30 | bwd_inner_microstep: 4647.35 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.54
-[2025-01-25 12:12:06,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.22 | bwd: 4652.32 | bwd_inner: 4647.35 | bwd_allreduce: 4.90 | step: 44.54
- 21%|██        | 1218/5800 [3:25:36<8:50:13,  6.94s/it]                                                       {'loss': 0.0474, 'grad_norm': 7.638889312744141, 'learning_rate': 3.6696561060585424e-05, 'epoch': 10.5}
- 21%|██        | 1218/5800 [3:25:36<8:50:13,  6.94s/it]score1 tensor([[0.4824],
-        [0.4316],
-        [0.4746],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5234, 0.5586, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0835, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:13,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 12:12:13,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.42 | bwd_microstep: 4638.02 | bwd_inner_microstep: 4632.40 | bwd_allreduce_microstep: 5.43 | step_microstep: 47.67
-[2025-01-25 12:12:13,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.37 | bwd: 4638.05 | bwd_inner: 4632.40 | bwd_allreduce: 5.55 | step: 47.69
- 21%|██        | 1219/5800 [3:25:43<8:49:43,  6.94s/it]                                                       {'loss': 0.0835, 'grad_norm': 7.869019985198975, 'learning_rate': 3.669041028630987e-05, 'epoch': 10.51}
- 21%|██        | 1219/5800 [3:25:43<8:49:43,  6.94s/it]score1 tensor([[0.4629],
-        [0.3945],
-        [0.3789],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4258, 0.3906, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:20,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 12:12:20,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2174.38 | bwd_microstep: 4641.27 | bwd_inner_microstep: 4636.07 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.94
-[2025-01-25 12:12:20,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2174.35 | bwd: 4641.29 | bwd_inner: 4636.07 | bwd_allreduce: 5.15 | step: 43.94
- 21%|██        | 1220/5800 [3:25:50<8:49:22,  6.94s/it]                                                       {'loss': 0.0278, 'grad_norm': 7.60185432434082, 'learning_rate': 3.668425430767479e-05, 'epoch': 10.52}
- 21%|██        | 1220/5800 [3:25:50<8:49:22,  6.94s/it]score1 tensor([[0.5586],
-        [0.5703],
-        [0.5117],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5430, 0.4277, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:27,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 12:12:27,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.44 | bwd_microstep: 4643.05 | bwd_inner_microstep: 4638.30 | bwd_allreduce_microstep: 4.65 | step_microstep: 43.87
-[2025-01-25 12:12:27,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.38 | bwd: 4643.09 | bwd_inner: 4638.30 | bwd_allreduce: 4.70 | step: 43.88
- 21%|██        | 1221/5800 [3:25:57<8:48:56,  6.93s/it]                                                       {'loss': 0.0405, 'grad_norm': 4.273210048675537, 'learning_rate': 3.667809312659971e-05, 'epoch': 10.53}
- 21%|██        | 1221/5800 [3:25:57<8:48:56,  6.93s/it]score1 tensor([[0.5742],
-        [0.5977],
-        [0.5078],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5781, 0.4492, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:34,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 12:12:34,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.22 | bwd_microstep: 4642.35 | bwd_inner_microstep: 4636.98 | bwd_allreduce_microstep: 5.24 | step_microstep: 44.86
-[2025-01-25 12:12:34,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.18 | bwd: 4642.37 | bwd_inner: 4636.98 | bwd_allreduce: 5.31 | step: 44.88
- 21%|██        | 1222/5800 [3:26:04<8:48:51,  6.93s/it]                                                       {'loss': 0.041, 'grad_norm': 8.207637786865234, 'learning_rate': 3.667192674500581e-05, 'epoch': 10.53}
- 21%|██        | 1222/5800 [3:26:04<8:48:51,  6.93s/it]score1 tensor([[0.4180],
-        [0.6641],
-        [0.5625],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.6250, 0.5312, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:41,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 12:12:41,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.53 | bwd_microstep: 4642.05 | bwd_inner_microstep: 4636.23 | bwd_allreduce_microstep: 5.67 | step_microstep: 50.93
-[2025-01-25 12:12:41,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.50 | bwd: 4642.07 | bwd_inner: 4636.23 | bwd_allreduce: 5.76 | step: 50.94
- 21%|██        | 1223/5800 [3:26:11<8:48:47,  6.93s/it]                                                       {'loss': 0.0415, 'grad_norm': 4.094811916351318, 'learning_rate': 3.666575516481587e-05, 'epoch': 10.54}
- 21%|██        | 1223/5800 [3:26:11<8:48:47,  6.93s/it]score1 tensor([[0.0000],
-        [0.4648],
-        [0.5547],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.1787, 0.4180, 0.5781, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0850, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:48,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 12:12:48,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.29 | bwd_microstep: 4594.44 | bwd_inner_microstep: 4586.28 | bwd_allreduce_microstep: 7.99 | step_microstep: 57.52
-[2025-01-25 12:12:48,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.26 | bwd: 4594.50 | bwd_inner: 4586.28 | bwd_allreduce: 8.08 | step: 57.51
- 21%|██        | 1224/5800 [3:26:18<8:48:05,  6.92s/it]                                                       {'loss': 0.085, 'grad_norm': 1.989526391029358, 'learning_rate': 3.665957838795429e-05, 'epoch': 10.55}
- 21%|██        | 1224/5800 [3:26:18<8:48:05,  6.92s/it]score1 tensor([[0.5039],
-        [0.5625],
-        [0.4922],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4844, 0.4551, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0708, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:12:55,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.34 | optimizer_step: 4.36
-[2025-01-25 12:12:55,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.40 | bwd_microstep: 4647.37 | bwd_inner_microstep: 4640.09 | bwd_allreduce_microstep: 7.20 | step_microstep: 77.17
-[2025-01-25 12:12:55,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.32 | bwd: 4647.41 | bwd_inner: 4640.09 | bwd_allreduce: 7.24 | step: 77.17
- 21%|██        | 1225/5800 [3:26:25<8:49:46,  6.95s/it]                                                       {'loss': 0.0708, 'grad_norm': 8.02910327911377, 'learning_rate': 3.66533964163471e-05, 'epoch': 10.56}
- 21%|██        | 1225/5800 [3:26:25<8:49:46,  6.95s/it]score1 tensor([[0.5742],
-        [0.4277],
-        [0.5391],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4180, 0.4766, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:02,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 12:13:02,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.20 | bwd_microstep: 4646.46 | bwd_inner_microstep: 4641.49 | bwd_allreduce_microstep: 4.89 | step_microstep: 48.54
-[2025-01-25 12:13:02,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.16 | bwd: 4646.48 | bwd_inner: 4641.49 | bwd_allreduce: 4.93 | step: 48.55
- 21%|██        | 1226/5800 [3:26:32<8:49:49,  6.95s/it]                                                       {'loss': 0.0356, 'grad_norm': 8.186454772949219, 'learning_rate': 3.664720925192193e-05, 'epoch': 10.57}
- 21%|██        | 1226/5800 [3:26:32<8:49:49,  6.95s/it]score1 tensor([[0.4121],
-        [0.5273],
-        [0.4395],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5547, 0.4336, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:09,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 12:13:09,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.23 | bwd_microstep: 4640.30 | bwd_inner_microstep: 4634.89 | bwd_allreduce_microstep: 5.29 | step_microstep: 46.55
-[2025-01-25 12:13:09,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.19 | bwd: 4640.33 | bwd_inner: 4634.89 | bwd_allreduce: 5.35 | step: 46.55
- 21%|██        | 1227/5800 [3:26:39<8:49:22,  6.95s/it]                                                       {'loss': 0.0117, 'grad_norm': 0.7843502759933472, 'learning_rate': 3.6641016896608086e-05, 'epoch': 10.58}
- 21%|██        | 1227/5800 [3:26:39<8:49:22,  6.95s/it]score1 tensor([[0.5859],
-        [0.5000],
-        [0.5039],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5703, 0.5195, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:16,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 11.73 | optimizer_step: 4.36
-[2025-01-25 12:13:16,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.78 | bwd_microstep: 4650.68 | bwd_inner_microstep: 4645.19 | bwd_allreduce_microstep: 5.40 | step_microstep: 55.27
-[2025-01-25 12:13:16,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.73 | bwd: 4650.71 | bwd_inner: 4645.19 | bwd_allreduce: 5.44 | step: 55.28
- 21%|██        | 1228/5800 [3:26:46<8:49:17,  6.95s/it]                                                       {'loss': 0.0376, 'grad_norm': 4.190552234649658, 'learning_rate': 3.663481935233641e-05, 'epoch': 10.59}
- 21%|██        | 1228/5800 [3:26:46<8:49:17,  6.95s/it]score1 tensor([[0.4551],
-        [0.4844],
-        [0.4805],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5391, 0.5586, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1147, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:23,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 12:13:23,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.61 | bwd_microstep: 4650.50 | bwd_inner_microstep: 4645.46 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.26
-[2025-01-25 12:13:23,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.58 | bwd: 4650.53 | bwd_inner: 4645.46 | bwd_allreduce: 5.00 | step: 43.26
- 21%|██        | 1229/5800 [3:26:53<8:49:00,  6.94s/it]                                                       {'loss': 0.1147, 'grad_norm': 7.825051784515381, 'learning_rate': 3.662861662103943e-05, 'epoch': 10.59}
- 21%|██        | 1229/5800 [3:26:53<8:49:00,  6.94s/it]score1 tensor([[0.4316],
-        [0.4219],
-        [0.5156],
-        [0.3594]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4863, 0.5742, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0576, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:30,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.32 | optimizer_step: 4.37
-[2025-01-25 12:13:30,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.29 | bwd_microstep: 4653.02 | bwd_inner_microstep: 4647.80 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.66
-[2025-01-25 12:13:30,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.25 | bwd: 4653.04 | bwd_inner: 4647.80 | bwd_allreduce: 5.16 | step: 45.66
- 21%|██        | 1230/5800 [3:27:00<8:48:35,  6.94s/it]                                                       {'loss': 0.0576, 'grad_norm': 7.51120662689209, 'learning_rate': 3.6622408704651254e-05, 'epoch': 10.6}
- 21%|██        | 1230/5800 [3:27:00<8:48:35,  6.94s/it]score1 tensor([[0.5000],
-        [0.2773],
-        [0.4707],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.2812, 0.5820, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:37,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:13:37,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.62 | bwd_microstep: 4653.22 | bwd_inner_microstep: 4647.50 | bwd_allreduce_microstep: 5.61 | step_microstep: 46.88
-[2025-01-25 12:13:37,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.57 | bwd: 4653.25 | bwd_inner: 4647.50 | bwd_allreduce: 5.67 | step: 46.89
- 21%|██        | 1231/5800 [3:27:07<8:48:17,  6.94s/it]                                                       {'loss': 0.0776, 'grad_norm': 7.665003299713135, 'learning_rate': 3.6616195605107615e-05, 'epoch': 10.61}
- 21%|██        | 1231/5800 [3:27:07<8:48:17,  6.94s/it]score1 tensor([[0.5430],
-        [0.5664],
-        [0.5234],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6094, 0.4648, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:44,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 12:13:44,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.19 | bwd_microstep: 4654.29 | bwd_inner_microstep: 4646.91 | bwd_allreduce_microstep: 7.17 | step_microstep: 64.61
-[2025-01-25 12:13:44,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.15 | bwd: 4654.35 | bwd_inner: 4646.91 | bwd_allreduce: 7.29 | step: 64.57
- 21%|██        | 1232/5800 [3:27:14<8:48:46,  6.95s/it]                                                       {'loss': 0.0684, 'grad_norm': 3.953032970428467, 'learning_rate': 3.660997732434588e-05, 'epoch': 10.62}
- 21%|██        | 1232/5800 [3:27:14<8:48:46,  6.95s/it]score1 tensor([[0.5898],
-        [0.4219],
-        [0.3613],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4629, 0.3867, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:51,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 12:13:51,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2176.53 | bwd_microstep: 4645.98 | bwd_inner_microstep: 4641.14 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.60
-[2025-01-25 12:13:51,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2176.49 | bwd: 4646.00 | bwd_inner: 4641.14 | bwd_allreduce: 4.80 | step: 44.59
- 21%|██▏       | 1233/5800 [3:27:20<8:48:48,  6.95s/it]                                                       {'loss': 0.0283, 'grad_norm': 7.829960823059082, 'learning_rate': 3.6603753864305e-05, 'epoch': 10.63}
- 21%|██▏       | 1233/5800 [3:27:20<8:48:48,  6.95s/it]score1 tensor([[0.6523],
-        [0.6680],
-        [0.5430],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.6445, 0.4023, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:13:57,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 12:13:57,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.09 | bwd_microstep: 4644.50 | bwd_inner_microstep: 4639.55 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.88
-[2025-01-25 12:13:57,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.99 | bwd: 4644.53 | bwd_inner: 4639.55 | bwd_allreduce: 4.90 | step: 44.88
- 21%|██▏       | 1234/5800 [3:27:27<8:49:11,  6.95s/it]                                                       {'loss': 0.0557, 'grad_norm': 0.5715789794921875, 'learning_rate': 3.6597525226925566e-05, 'epoch': 10.64}
- 21%|██▏       | 1234/5800 [3:27:27<8:49:11,  6.95s/it]score1 tensor([[0.4492],
-        [0.7031],
-        [0.5391],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3887, 0.6406, 0.4648, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:04,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:14:04,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.19 | bwd_microstep: 4653.20 | bwd_inner_microstep: 4648.03 | bwd_allreduce_microstep: 5.06 | step_microstep: 47.82
-[2025-01-25 12:14:04,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.15 | bwd: 4653.23 | bwd_inner: 4648.03 | bwd_allreduce: 5.11 | step: 47.83
- 21%|██▏       | 1235/5800 [3:27:34<8:49:03,  6.95s/it]                                                       {'loss': 0.0591, 'grad_norm': 8.63907527923584, 'learning_rate': 3.659129141414977e-05, 'epoch': 10.65}
- 21%|██▏       | 1235/5800 [3:27:34<8:49:03,  6.95s/it]score1 tensor([[0.5469],
-        [0.6914],
-        [0.4531],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5703, 0.4043, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0889, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:11,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 12:14:11,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.69 | bwd_microstep: 4646.40 | bwd_inner_microstep: 4641.40 | bwd_allreduce_microstep: 4.90 | step_microstep: 48.44
-[2025-01-25 12:14:11,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.63 | bwd: 4646.42 | bwd_inner: 4641.40 | bwd_allreduce: 4.95 | step: 48.44
- 21%|██▏       | 1236/5800 [3:27:41<8:49:08,  6.96s/it]                                                       {'loss': 0.0889, 'grad_norm': 8.488553047180176, 'learning_rate': 3.6585052427921436e-05, 'epoch': 10.66}
- 21%|██▏       | 1236/5800 [3:27:41<8:49:08,  6.96s/it]score1 tensor([[0.5195],
-        [0.4688],
-        [0.7695],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4219, 0.6133, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1006, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:18,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 12:14:18,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.14 | bwd_microstep: 4651.70 | bwd_inner_microstep: 4646.23 | bwd_allreduce_microstep: 5.36 | step_microstep: 45.08
-[2025-01-25 12:14:18,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.09 | bwd: 4651.73 | bwd_inner: 4646.23 | bwd_allreduce: 5.42 | step: 45.09
- 21%|██▏       | 1237/5800 [3:27:48<8:48:36,  6.95s/it]                                                       {'loss': 0.1006, 'grad_norm': 8.166180610656738, 'learning_rate': 3.657880827018598e-05, 'epoch': 10.66}
- 21%|██▏       | 1237/5800 [3:27:48<8:48:36,  6.95s/it]score1 tensor([[0.6289],
-        [0.4727],
-        [0.6133],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4922, 0.5156, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:25,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 12:14:25,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.77 | bwd_microstep: 4651.64 | bwd_inner_microstep: 4646.30 | bwd_allreduce_microstep: 5.23 | step_microstep: 44.04
-[2025-01-25 12:14:25,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.73 | bwd: 4651.67 | bwd_inner: 4646.30 | bwd_allreduce: 5.29 | step: 44.04
- 21%|██▏       | 1238/5800 [3:27:55<8:48:16,  6.95s/it]                                                       {'loss': 0.0762, 'grad_norm': 4.764855861663818, 'learning_rate': 3.657255894289043e-05, 'epoch': 10.67}
- 21%|██▏       | 1238/5800 [3:27:55<8:48:16,  6.95s/it]score1 tensor([[0.6875],
-        [0.6211],
-        [0.6602],
-        [0.7148]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5273, 0.6055, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:32,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 12:14:32,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.67 | bwd_microstep: 4641.62 | bwd_inner_microstep: 4636.44 | bwd_allreduce_microstep: 5.06 | step_microstep: 48.73
-[2025-01-25 12:14:32,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.63 | bwd: 4641.65 | bwd_inner: 4636.44 | bwd_allreduce: 5.13 | step: 48.74
- 21%|██▏       | 1239/5800 [3:28:02<8:47:38,  6.94s/it]                                                       {'loss': 0.0918, 'grad_norm': 9.180195808410645, 'learning_rate': 3.656630444798345e-05, 'epoch': 10.68}
- 21%|██▏       | 1239/5800 [3:28:02<8:47:38,  6.94s/it]score1 tensor([[0.5234],
-        [0.6406],
-        [0.5000],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.6094, 0.4688, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:39,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 12:14:39,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.90 | bwd_microstep: 4651.05 | bwd_inner_microstep: 4646.03 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.29
-[2025-01-25 12:14:39,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.85 | bwd: 4651.08 | bwd_inner: 4646.03 | bwd_allreduce: 4.98 | step: 45.30
- 21%|██▏       | 1240/5800 [3:28:09<8:47:26,  6.94s/it]                                                       {'loss': 0.0464, 'grad_norm': 8.064788818359375, 'learning_rate': 3.656004478741528e-05, 'epoch': 10.69}
- 21%|██▏       | 1240/5800 [3:28:09<8:47:26,  6.94s/it]score1 tensor([[0.6797],
-        [0.4805],
-        [0.4004],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.4121, 0.3809, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:46,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 12:14:46,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.40 | bwd_microstep: 4652.88 | bwd_inner_microstep: 4647.93 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.32
-[2025-01-25 12:14:46,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.36 | bwd: 4652.90 | bwd_inner: 4647.93 | bwd_allreduce: 4.90 | step: 44.33
- 21%|██▏       | 1241/5800 [3:28:16<8:47:20,  6.94s/it]                                                       {'loss': 0.0317, 'grad_norm': 7.78867244720459, 'learning_rate': 3.655377996313782e-05, 'epoch': 10.7}
- 21%|██▏       | 1241/5800 [3:28:16<8:47:20,  6.94s/it]score1 tensor([[0.3496],
-        [0.4785],
-        [0.3594],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.5078, 0.3750, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:14:53,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.11 | optimizer_step: 4.36
-[2025-01-25 12:14:53,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.19 | bwd_microstep: 4645.90 | bwd_inner_microstep: 4640.38 | bwd_allreduce_microstep: 5.44 | step_microstep: 71.09
-[2025-01-25 12:14:53,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.15 | bwd: 4645.92 | bwd_inner: 4640.38 | bwd_allreduce: 5.48 | step: 71.11
- 21%|██▏       | 1242/5800 [3:28:23<8:47:30,  6.94s/it]                                                       {'loss': 0.0137, 'grad_norm': 7.392763614654541, 'learning_rate': 3.6547509977104526e-05, 'epoch': 10.71}
- 21%|██▏       | 1242/5800 [3:28:23<8:47:30,  6.94s/it]score1 tensor([[0.4668],
-        [0.5391],
-        [0.4785],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6055, 0.5664, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:00,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.43 | optimizer_step: 4.45
-[2025-01-25 12:15:00,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2173.06 | bwd_microstep: 4652.27 | bwd_inner_microstep: 4644.26 | bwd_allreduce_microstep: 7.81 | step_microstep: 85.00
-[2025-01-25 12:15:00,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2173.02 | bwd: 4652.33 | bwd_inner: 4644.26 | bwd_allreduce: 7.90 | step: 85.00
- 21%|██▏       | 1243/5800 [3:28:30<8:48:52,  6.96s/it]                                                       {'loss': 0.0879, 'grad_norm': 11.472989082336426, 'learning_rate': 3.654123483127049e-05, 'epoch': 10.72}
- 21%|██▏       | 1243/5800 [3:28:30<8:48:52,  6.96s/it]score1 tensor([[0.4668],
-        [0.4219],
-        [0.3398],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4551, 0.4629, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:07,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.30 | optimizer_step: 4.36
-[2025-01-25 12:15:07,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.78 | bwd_microstep: 4647.61 | bwd_inner_microstep: 4640.14 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.54
-[2025-01-25 12:15:07,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.75 | bwd: 4647.64 | bwd_inner: 4640.14 | bwd_allreduce: 5.01 | step: 46.54
- 21%|██▏       | 1244/5800 [3:28:37<8:48:27,  6.96s/it]                                                       {'loss': 0.0522, 'grad_norm': 3.4944868087768555, 'learning_rate': 3.653495452759243e-05, 'epoch': 10.72}
- 21%|██▏       | 1244/5800 [3:28:37<8:48:27,  6.96s/it]score1 tensor([[0.5547],
-        [0.4180],
-        [0.3477],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.4473, 0.3945, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:14,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 12:15:14,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.09 | bwd_microstep: 4652.94 | bwd_inner_microstep: 4645.64 | bwd_allreduce_microstep: 7.13 | step_microstep: 50.63
-[2025-01-25 12:15:14,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.05 | bwd: 4652.99 | bwd_inner: 4645.64 | bwd_allreduce: 7.21 | step: 50.63
- 21%|██▏       | 1245/5800 [3:28:44<8:48:09,  6.96s/it]                                                       {'loss': 0.0552, 'grad_norm': 7.4584784507751465, 'learning_rate': 3.6528669068028635e-05, 'epoch': 10.73}
- 21%|██▏       | 1245/5800 [3:28:44<8:48:09,  6.96s/it]score1 tensor([[0.4219],
-        [0.3574],
-        [0.5898],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.3750, 0.5938, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:21,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 12:15:21,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.40 | bwd_microstep: 4641.18 | bwd_inner_microstep: 4636.29 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.76
-[2025-01-25 12:15:21,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.37 | bwd: 4641.20 | bwd_inner: 4636.29 | bwd_allreduce: 4.85 | step: 43.77
- 21%|██▏       | 1246/5800 [3:28:51<8:47:16,  6.95s/it]                                                       {'loss': 0.0078, 'grad_norm': 0.7687323093414307, 'learning_rate': 3.652237845453903e-05, 'epoch': 10.74}
- 21%|██▏       | 1246/5800 [3:28:51<8:47:16,  6.95s/it]score1 tensor([[0.4336],
-        [0.4492],
-        [0.4629],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4844, 0.5195, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:28,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 12:15:28,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.75 | bwd_microstep: 4647.25 | bwd_inner_microstep: 4641.96 | bwd_allreduce_microstep: 5.18 | step_microstep: 45.23
-[2025-01-25 12:15:28,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.71 | bwd: 4647.28 | bwd_inner: 4641.96 | bwd_allreduce: 5.24 | step: 45.24
- 22%|██▏       | 1247/5800 [3:28:58<8:46:53,  6.94s/it]                                                       {'loss': 0.0581, 'grad_norm': 7.587847709655762, 'learning_rate': 3.651608268908513e-05, 'epoch': 10.75}
- 22%|██▏       | 1247/5800 [3:28:58<8:46:53,  6.94s/it]score1 tensor([[0.4551],
-        [0.4531],
-        [0.3711],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5039, 0.3789, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:35,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 12:15:35,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.03 | bwd_microstep: 4645.21 | bwd_inner_microstep: 4640.25 | bwd_allreduce_microstep: 4.85 | step_microstep: 46.45
-[2025-01-25 12:15:35,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.98 | bwd: 4645.25 | bwd_inner: 4640.25 | bwd_allreduce: 4.89 | step: 46.47
- 22%|██▏       | 1248/5800 [3:29:05<8:46:31,  6.94s/it]                                                       {'loss': 0.0464, 'grad_norm': 3.260430335998535, 'learning_rate': 3.650978177363008e-05, 'epoch': 10.76}
- 22%|██▏       | 1248/5800 [3:29:05<8:46:31,  6.94s/it]score1 tensor([[0.5039],
-        [0.4219],
-        [0.5664],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4844, 0.5117, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:42,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 12:15:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.34 | bwd_microstep: 4648.61 | bwd_inner_microstep: 4643.47 | bwd_allreduce_microstep: 5.03 | step_microstep: 46.86
-[2025-01-25 12:15:42,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.31 | bwd: 4648.64 | bwd_inner: 4643.47 | bwd_allreduce: 5.09 | step: 46.86
- 22%|██▏       | 1249/5800 [3:29:12<8:46:12,  6.94s/it]                                                       {'loss': 0.0439, 'grad_norm': 3.9509620666503906, 'learning_rate': 3.650347571013859e-05, 'epoch': 10.77}
- 22%|██▏       | 1249/5800 [3:29:12<8:46:12,  6.94s/it]score1 tensor([[0.4766],
-        [0.6328],
-        [0.4863],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5508, 0.5430, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:49,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:15:49,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.89 | bwd_microstep: 4641.23 | bwd_inner_microstep: 4635.83 | bwd_allreduce_microstep: 5.30 | step_microstep: 47.31
-[2025-01-25 12:15:49,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.85 | bwd: 4641.26 | bwd_inner: 4635.83 | bwd_allreduce: 5.36 | step: 47.32
- 22%|██▏       | 1250/5800 [3:29:19<8:45:55,  6.94s/it]                                                       {'loss': 0.061, 'grad_norm': 1.009061336517334, 'learning_rate': 3.6497164500577026e-05, 'epoch': 10.78}
- 22%|██▏       | 1250/5800 [3:29:19<8:45:55,  6.94s/it]score1 tensor([[0.4961],
-        [0.5781],
-        [0.5977],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5469, 0.5625, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:15:56,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 12:15:56,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.72 | bwd_microstep: 4648.06 | bwd_inner_microstep: 4642.50 | bwd_allreduce_microstep: 5.48 | step_microstep: 59.36
-[2025-01-25 12:15:56,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.68 | bwd: 4648.09 | bwd_inner: 4642.50 | bwd_allreduce: 5.52 | step: 59.34
- 22%|██▏       | 1251/5800 [3:29:26<8:46:11,  6.94s/it]                                                       {'loss': 0.0439, 'grad_norm': 7.98270320892334, 'learning_rate': 3.649084814691331e-05, 'epoch': 10.78}
- 22%|██▏       | 1251/5800 [3:29:26<8:46:11,  6.94s/it]score1 tensor([[0.5781],
-        [0.5156],
-        [0.4980],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4805, 0.4512, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:02,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 12:16:02,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.77 | bwd_microstep: 4636.67 | bwd_inner_microstep: 4626.88 | bwd_allreduce_microstep: 9.50 | step_microstep: 57.18
-[2025-01-25 12:16:02,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.73 | bwd: 4636.73 | bwd_inner: 4626.88 | bwd_allreduce: 9.64 | step: 57.07
- 22%|██▏       | 1252/5800 [3:29:32<8:46:08,  6.94s/it]                                                       {'loss': 0.04, 'grad_norm': 7.940670967102051, 'learning_rate': 3.6484526651117e-05, 'epoch': 10.79}
- 22%|██▏       | 1252/5800 [3:29:32<8:46:08,  6.94s/it]score1 tensor([[0.4629],
-        [0.5703],
-        [0.5547],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5000, 0.5273, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:09,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 12:16:09,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.47 | bwd_microstep: 4622.30 | bwd_inner_microstep: 4617.05 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.75
-[2025-01-25 12:16:09,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.33 | bwd: 4622.33 | bwd_inner: 4617.05 | bwd_allreduce: 5.20 | step: 45.75
- 22%|██▏       | 1253/5800 [3:29:39<8:45:58,  6.94s/it]                                                       {'loss': 0.0454, 'grad_norm': 7.773318290710449, 'learning_rate': 3.647820001515926e-05, 'epoch': 10.8}
- 22%|██▏       | 1253/5800 [3:29:39<8:45:58,  6.94s/it]score1 tensor([[0.5469],
-        [0.4629],
-        [0.5820],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.3750, 0.6328, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:16,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 12:16:16,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.52 | bwd_microstep: 4617.41 | bwd_inner_microstep: 4612.32 | bwd_allreduce_microstep: 4.97 | step_microstep: 46.05
-[2025-01-25 12:16:16,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.49 | bwd: 4617.43 | bwd_inner: 4612.31 | bwd_allreduce: 5.03 | step: 46.06
- 22%|██▏       | 1254/5800 [3:29:46<8:45:33,  6.94s/it]                                                       {'loss': 0.0396, 'grad_norm': 3.637190818786621, 'learning_rate': 3.647186824101282e-05, 'epoch': 10.81}
- 22%|██▏       | 1254/5800 [3:29:46<8:45:33,  6.94s/it]score1 tensor([[0.4688],
-        [0.5000],
-        [0.4785],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4863, 0.4160, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:23,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 12:16:23,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.08 | bwd_microstep: 4619.46 | bwd_inner_microstep: 4613.68 | bwd_allreduce_microstep: 5.70 | step_microstep: 42.27
-[2025-01-25 12:16:23,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.04 | bwd: 4619.48 | bwd_inner: 4613.68 | bwd_allreduce: 5.74 | step: 42.27
- 22%|██▏       | 1255/5800 [3:29:53<8:44:38,  6.93s/it]                                                       {'loss': 0.0396, 'grad_norm': 0.34262028336524963, 'learning_rate': 3.646553133065206e-05, 'epoch': 10.82}
- 22%|██▏       | 1255/5800 [3:29:53<8:44:38,  6.93s/it]score1 tensor([[0.5234],
-        [0.4883],
-        [0.4512],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4590, 0.4258, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:30,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 12:16:30,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.53 | bwd_microstep: 4618.76 | bwd_inner_microstep: 4613.51 | bwd_allreduce_microstep: 5.17 | step_microstep: 44.50
-[2025-01-25 12:16:30,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.49 | bwd: 4618.79 | bwd_inner: 4613.51 | bwd_allreduce: 5.21 | step: 44.51
- 22%|██▏       | 1256/5800 [3:30:00<8:43:52,  6.92s/it]                                                       {'loss': 0.0391, 'grad_norm': 3.434023141860962, 'learning_rate': 3.645918928605293e-05, 'epoch': 10.83}
- 22%|██▏       | 1256/5800 [3:30:00<8:43:52,  6.92s/it]score1 tensor([[0.5547],
-        [0.4160],
-        [0.4121],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4707, 0.4395, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:37,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 12:16:37,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.38 | bwd_microstep: 4628.38 | bwd_inner_microstep: 4623.40 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.02
-[2025-01-25 12:16:37,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.33 | bwd: 4628.40 | bwd_inner: 4623.40 | bwd_allreduce: 4.91 | step: 44.03
- 22%|██▏       | 1257/5800 [3:30:07<8:43:30,  6.91s/it]                                                       {'loss': 0.0859, 'grad_norm': 6.989355564117432, 'learning_rate': 3.645284210919299e-05, 'epoch': 10.84}
- 22%|██▏       | 1257/5800 [3:30:07<8:43:30,  6.91s/it]score1 tensor([[0.4395],
-        [0.4180],
-        [0.4219],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4219, 0.5430, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:44,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.76 | optimizer_step: 4.75
-[2025-01-25 12:16:44,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.32 | bwd_microstep: 4617.99 | bwd_inner_microstep: 4613.02 | bwd_allreduce_microstep: 4.88 | step_microstep: 51.60
-[2025-01-25 12:16:44,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.29 | bwd: 4618.02 | bwd_inner: 4613.02 | bwd_allreduce: 4.93 | step: 51.61
- 22%|██▏       | 1258/5800 [3:30:14<8:43:07,  6.91s/it]                                                       {'loss': 0.0625, 'grad_norm': 6.6791510581970215, 'learning_rate': 3.6446489802051385e-05, 'epoch': 10.84}
- 22%|██▏       | 1258/5800 [3:30:14<8:43:07,  6.91s/it]score1 tensor([[0.4336],
-        [0.3887],
-        [0.4082],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4473, 0.5195, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1089, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:51,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 12:16:51,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.30 | bwd_microstep: 4621.86 | bwd_inner_microstep: 4616.56 | bwd_allreduce_microstep: 5.21 | step_microstep: 45.56
-[2025-01-25 12:16:51,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.27 | bwd: 4621.89 | bwd_inner: 4616.56 | bwd_allreduce: 5.26 | step: 45.57
- 22%|██▏       | 1259/5800 [3:30:21<8:42:46,  6.91s/it]                                                       {'loss': 0.1089, 'grad_norm': 6.50346565246582, 'learning_rate': 3.6440132366608895e-05, 'epoch': 10.85}
- 22%|██▏       | 1259/5800 [3:30:21<8:42:46,  6.91s/it]score1 tensor([[0.3965],
-        [0.4277],
-        [0.4434],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.4473, 0.4863, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:16:58,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 12:16:58,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.09 | bwd_microstep: 4622.26 | bwd_inner_microstep: 4616.56 | bwd_allreduce_microstep: 5.62 | step_microstep: 54.35
-[2025-01-25 12:16:58,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.03 | bwd: 4622.29 | bwd_inner: 4616.56 | bwd_allreduce: 5.66 | step: 54.37
- 22%|██▏       | 1260/5800 [3:30:28<8:42:50,  6.91s/it]                                                       {'loss': 0.0537, 'grad_norm': 3.4350438117980957, 'learning_rate': 3.643376980484788e-05, 'epoch': 10.86}
- 22%|██▏       | 1260/5800 [3:30:28<8:42:50,  6.91s/it]score1 tensor([[0.4355],
-        [0.4766],
-        [0.4570],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5586, 0.4941, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:05,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.30 | optimizer_step: 4.37
-[2025-01-25 12:17:05,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.37 | bwd_microstep: 4625.85 | bwd_inner_microstep: 4619.54 | bwd_allreduce_microstep: 6.11 | step_microstep: 78.52
-[2025-01-25 12:17:05,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.34 | bwd: 4625.90 | bwd_inner: 4619.54 | bwd_allreduce: 6.21 | step: 78.47
- 22%|██▏       | 1261/5800 [3:30:35<8:44:06,  6.93s/it]                                                       {'loss': 0.0688, 'grad_norm': 7.080763339996338, 'learning_rate': 3.642740211875228e-05, 'epoch': 10.87}
- 22%|██▏       | 1261/5800 [3:30:35<8:44:06,  6.93s/it]score1 tensor([[0.5508],
-        [0.5430],
-        [0.4766],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6836, 0.5312, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:12,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.19 | optimizer_step: 4.37
-[2025-01-25 12:17:12,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.57 | bwd_microstep: 4623.71 | bwd_inner_microstep: 4617.91 | bwd_allreduce_microstep: 5.65 | step_microstep: 51.71
-[2025-01-25 12:17:12,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.48 | bwd: 4623.73 | bwd_inner: 4617.91 | bwd_allreduce: 5.74 | step: 51.72
- 22%|██▏       | 1262/5800 [3:30:42<8:44:07,  6.93s/it]                                                       {'loss': 0.0757, 'grad_norm': 7.241316318511963, 'learning_rate': 3.642102931030766e-05, 'epoch': 10.88}
- 22%|██▏       | 1262/5800 [3:30:42<8:44:07,  6.93s/it]score1 tensor([[0.4805],
-        [0.5156],
-        [0.4902],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6211, 0.4746, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:19,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 12:17:19,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.57 | bwd_microstep: 4622.13 | bwd_inner_microstep: 4617.07 | bwd_allreduce_microstep: 4.97 | step_microstep: 45.33
-[2025-01-25 12:17:19,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.51 | bwd: 4622.15 | bwd_inner: 4617.07 | bwd_allreduce: 5.00 | step: 45.34
- 22%|██▏       | 1263/5800 [3:30:49<8:43:47,  6.93s/it]                                                       {'loss': 0.0562, 'grad_norm': 0.3947470486164093, 'learning_rate': 3.6414651381501174e-05, 'epoch': 10.89}
- 22%|██▏       | 1263/5800 [3:30:49<8:43:47,  6.93s/it]score1 tensor([[0.5078],
-        [0.5312],
-        [0.5117],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.5625, 0.4336, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:26,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 12:17:26,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.37 | bwd_microstep: 4619.82 | bwd_inner_microstep: 4613.80 | bwd_allreduce_microstep: 5.92 | step_microstep: 48.15
-[2025-01-25 12:17:26,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.34 | bwd: 4619.84 | bwd_inner: 4613.80 | bwd_allreduce: 5.97 | step: 48.16
- 22%|██▏       | 1264/5800 [3:30:55<8:43:32,  6.93s/it]                                                       {'loss': 0.0518, 'grad_norm': 0.557636559009552, 'learning_rate': 3.640826833432157e-05, 'epoch': 10.9}
- 22%|██▏       | 1264/5800 [3:30:56<8:43:32,  6.93s/it]score1 tensor([[0.6016],
-        [0.5625],
-        [0.5977],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5039, 0.6797, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:32,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.03 | optimizer_step: 4.36
-[2025-01-25 12:17:32,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4620.95 | bwd_inner_microstep: 4615.79 | bwd_allreduce_microstep: 5.07 | step_microstep: 47.89
-[2025-01-25 12:17:32,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4620.98 | bwd_inner: 4615.79 | bwd_allreduce: 5.12 | step: 47.90
- 22%|██▏       | 1265/5800 [3:31:02<8:42:47,  6.92s/it]                                                       {'loss': 0.062, 'grad_norm': 0.4982696771621704, 'learning_rate': 3.640188017075919e-05, 'epoch': 10.91}
- 22%|██▏       | 1265/5800 [3:31:02<8:42:47,  6.92s/it]score1 tensor([[0.5781],
-        [0.5547],
-        [0.5742],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5273, 0.5430, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:39,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 12:17:39,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4620.73 | bwd_inner_microstep: 4615.66 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.04
-[2025-01-25 12:17:39,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.78 | bwd: 4620.75 | bwd_inner: 4615.66 | bwd_allreduce: 5.02 | step: 45.04
- 22%|██▏       | 1266/5800 [3:31:09<8:42:13,  6.91s/it]                                                       {'loss': 0.0562, 'grad_norm': 7.766552925109863, 'learning_rate': 3.639548689280598e-05, 'epoch': 10.91}
- 22%|██▏       | 1266/5800 [3:31:09<8:42:13,  6.91s/it]score1 tensor([[0.6523],
-        [0.5820],
-        [0.5156],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4844, 0.3340, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:46,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 12:17:46,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.36 | bwd_microstep: 4623.83 | bwd_inner_microstep: 4618.44 | bwd_allreduce_microstep: 5.28 | step_microstep: 47.56
-[2025-01-25 12:17:46,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.33 | bwd: 4623.85 | bwd_inner: 4618.44 | bwd_allreduce: 5.33 | step: 47.57
- 22%|██▏       | 1267/5800 [3:31:16<8:41:50,  6.91s/it]                                                       {'loss': 0.127, 'grad_norm': 7.786996364593506, 'learning_rate': 3.638908850245546e-05, 'epoch': 10.92}
- 22%|██▏       | 1267/5800 [3:31:16<8:41:50,  6.91s/it]score1 tensor([[0.5586],
-        [0.6250],
-        [0.5430],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4453, 0.4512, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:17:53,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 12:17:53,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.14 | bwd_microstep: 4620.38 | bwd_inner_microstep: 4615.31 | bwd_allreduce_microstep: 4.96 | step_microstep: 48.36
-[2025-01-25 12:17:53,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.08 | bwd: 4620.40 | bwd_inner: 4615.31 | bwd_allreduce: 5.00 | step: 48.37
- 22%|██▏       | 1268/5800 [3:31:23<8:41:39,  6.91s/it]                                                       {'loss': 0.1289, 'grad_norm': 7.653419494628906, 'learning_rate': 3.638268500170277e-05, 'epoch': 10.93}
- 22%|██▏       | 1268/5800 [3:31:23<8:41:39,  6.91s/it]score1 tensor([[0.6289],
-        [0.5312],
-        [0.5117],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4629, 0.3730, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:00,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 12:18:00,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.89 | bwd_microstep: 4620.40 | bwd_inner_microstep: 4614.02 | bwd_allreduce_microstep: 6.18 | step_microstep: 56.51
-[2025-01-25 12:18:00,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.86 | bwd: 4620.46 | bwd_inner: 4614.02 | bwd_allreduce: 6.27 | step: 56.49
- 22%|██▏       | 1269/5800 [3:31:30<8:41:33,  6.91s/it]                                                       {'loss': 0.082, 'grad_norm': 7.824914932250977, 'learning_rate': 3.637627639254464e-05, 'epoch': 10.94}
- 22%|██▏       | 1269/5800 [3:31:30<8:41:33,  6.91s/it]score1 tensor([[0.5938],
-        [0.5898],
-        [0.5039],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5391, 0.4785, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:07,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.86 | optimizer_step: 4.66
-[2025-01-25 12:18:07,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.47 | bwd_microstep: 4620.39 | bwd_inner_microstep: 4615.48 | bwd_allreduce_microstep: 4.83 | step_microstep: 98.94
-[2025-01-25 12:18:07,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.44 | bwd: 4620.41 | bwd_inner: 4615.48 | bwd_allreduce: 4.87 | step: 98.96
- 22%|██▏       | 1270/5800 [3:31:37<8:42:56,  6.93s/it]                                                       {'loss': 0.0288, 'grad_norm': 3.62014102935791, 'learning_rate': 3.6369862676979375e-05, 'epoch': 10.95}
- 22%|██▏       | 1270/5800 [3:31:37<8:42:56,  6.93s/it]score1 tensor([[0.5117],
-        [0.5000],
-        [0.5430],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4062, 0.5898, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:14,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 12:18:14,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.61 | bwd_microstep: 4626.45 | bwd_inner_microstep: 4621.50 | bwd_allreduce_microstep: 4.87 | step_microstep: 48.56
-[2025-01-25 12:18:14,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.49 | bwd: 4626.47 | bwd_inner: 4621.50 | bwd_allreduce: 4.91 | step: 48.57
- 22%|██▏       | 1271/5800 [3:31:44<8:42:46,  6.93s/it]                                                       {'loss': 0.0698, 'grad_norm': 0.3500367999076843, 'learning_rate': 3.636344385700689e-05, 'epoch': 10.96}
- 22%|██▏       | 1271/5800 [3:31:44<8:42:46,  6.93s/it]score1 tensor([[0.6094],
-        [0.5859],
-        [0.5391],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5352, 0.5391, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:21,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 12:18:21,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.35 | bwd_microstep: 4586.74 | bwd_inner_microstep: 4581.54 | bwd_allreduce_microstep: 5.09 | step_microstep: 46.44
-[2025-01-25 12:18:21,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.31 | bwd: 4586.77 | bwd_inner: 4581.54 | bwd_allreduce: 5.15 | step: 46.44
- 22%|██▏       | 1272/5800 [3:31:51<8:41:43,  6.91s/it]                                                       {'loss': 0.0239, 'grad_norm': 1.7531980276107788, 'learning_rate': 3.635701993462867e-05, 'epoch': 10.97}
- 22%|██▏       | 1272/5800 [3:31:51<8:41:43,  6.91s/it]score1 tensor([[0.4238],
-        [0.4590],
-        [0.5039],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5156, 0.4492, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:28,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 12:18:28,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4620.84 | bwd_inner_microstep: 4616.11 | bwd_allreduce_microstep: 4.62 | step_microstep: 44.85
-[2025-01-25 12:18:28,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4620.87 | bwd_inner: 4616.11 | bwd_allreduce: 4.67 | step: 44.86
- 22%|██▏       | 1273/5800 [3:31:58<8:41:28,  6.91s/it]                                                       {'loss': 0.0415, 'grad_norm': 3.214268684387207, 'learning_rate': 3.6350590911847824e-05, 'epoch': 10.97}
- 22%|██▏       | 1273/5800 [3:31:58<8:41:28,  6.91s/it]score1 tensor([[0.4395],
-        [0.5156],
-        [0.4922],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5352, 0.6211, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:35,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 12:18:35,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4631.41 | bwd_inner_microstep: 4622.38 | bwd_allreduce_microstep: 8.91 | step_microstep: 47.26
-[2025-01-25 12:18:35,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4631.44 | bwd_inner: 4622.38 | bwd_allreduce: 8.98 | step: 47.27
- 22%|██▏       | 1274/5800 [3:32:05<8:41:12,  6.91s/it]                                                       {'loss': 0.0596, 'grad_norm': 7.488287448883057, 'learning_rate': 3.634415679066902e-05, 'epoch': 10.98}
- 22%|██▏       | 1274/5800 [3:32:05<8:41:12,  6.91s/it]score1 tensor([[0.4473],
-        [0.4395],
-        [0.4824],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4141, 0.5117, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:42,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 12:18:42,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.11 | bwd_microstep: 4621.85 | bwd_inner_microstep: 4616.68 | bwd_allreduce_microstep: 5.08 | step_microstep: 49.12
-[2025-01-25 12:18:42,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4621.88 | bwd_inner: 4616.68 | bwd_allreduce: 5.13 | step: 49.12
- 22%|██▏       | 1275/5800 [3:32:11<8:40:57,  6.91s/it]                                                       {'loss': 0.0381, 'grad_norm': 3.449174404144287, 'learning_rate': 3.633771757309853e-05, 'epoch': 10.99}
- 22%|██▏       | 1275/5800 [3:32:12<8:40:57,  6.91s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:18:47,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 12:18:47,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 571.55 | bwd_microstep: 1217.63 | bwd_inner_microstep: 1213.21 | bwd_allreduce_microstep: 4.33 | step_microstep: 45.69
-[2025-01-25 12:18:47,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.52 | bwd: 1217.66 | bwd_inner: 1213.21 | bwd_allreduce: 4.38 | step: 45.71
- 22%|██▏       | 1276/5800 [3:32:17<8:06:57,  6.46s/it]                                                       {'loss': 0.0762, 'grad_norm': 6.7330732345581055, 'learning_rate': 3.633127326114422e-05, 'epoch': 11.0}
- 22%|██▏       | 1276/5800 [3:32:17<8:06:57,  6.46s/it][2025-01-25 12:18:52,329] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:19:03,704] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:19:13,787] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:19:23,769] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4785],
-        [0.4824],
-        [0.4980],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5508, 0.5625, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:19:40,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.08 | optimizer_step: 4.37
-[2025-01-25 12:19:40,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.31 | bwd_microstep: 4553.60 | bwd_inner_microstep: 4548.25 | bwd_allreduce_microstep: 5.21 | step_microstep: 50.75
-[2025-01-25 12:19:40,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.28 | bwd: 4553.63 | bwd_inner: 4548.25 | bwd_allreduce: 5.29 | step: 50.76
- 22%|██▏       | 1277/5800 [3:33:10<25:48:51, 20.55s/it]                                                        {'loss': 0.0674, 'grad_norm': 5.47248649597168, 'learning_rate': 3.632482385681554e-05, 'epoch': 11.01}
- 22%|██▏       | 1277/5800 [3:33:10<25:48:51, 20.55s/it]score1 tensor([[0.5586],
-        [0.4629],
-        [0.5391],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4941, 0.5508, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:19:47,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.40
-[2025-01-25 12:19:47,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.71 | bwd_microstep: 4585.50 | bwd_inner_microstep: 4579.75 | bwd_allreduce_microstep: 5.66 | step_microstep: 48.60
-[2025-01-25 12:19:47,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.68 | bwd: 4585.52 | bwd_inner: 4579.75 | bwd_allreduce: 5.71 | step: 48.62
- 22%|██▏       | 1278/5800 [3:33:17<20:38:56, 16.44s/it]                                                        {'loss': 0.02, 'grad_norm': 3.696808338165283, 'learning_rate': 3.6318369362123515e-05, 'epoch': 11.02}
- 22%|██▏       | 1278/5800 [3:33:17<20:38:56, 16.44s/it]score1 tensor([[0.4922],
-        [0.5391],
-        [0.5781],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5039, 0.4883, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0708, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:19:54,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 12:19:54,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.16 | bwd_microstep: 4592.57 | bwd_inner_microstep: 4587.08 | bwd_allreduce_microstep: 5.40 | step_microstep: 54.55
-[2025-01-25 12:19:54,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.07 | bwd: 4592.59 | bwd_inner: 4587.08 | bwd_allreduce: 5.44 | step: 54.56
- 22%|██▏       | 1279/5800 [3:33:24<17:02:30, 13.57s/it]                                                        {'loss': 0.0708, 'grad_norm': 8.627277374267578, 'learning_rate': 3.631190977908078e-05, 'epoch': 11.03}
- 22%|██▏       | 1279/5800 [3:33:24<17:02:30, 13.57s/it]score1 tensor([[0.5000],
-        [0.5078],
-        [0.5742],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4512, 0.5430, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:01,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.29 | optimizer_step: 4.37
-[2025-01-25 12:20:01,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.76 | bwd_microstep: 4600.39 | bwd_inner_microstep: 4594.98 | bwd_allreduce_microstep: 5.21 | step_microstep: 83.32
-[2025-01-25 12:20:01,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.72 | bwd: 4600.45 | bwd_inner: 4594.98 | bwd_allreduce: 5.31 | step: 83.34
- 22%|██▏       | 1280/5800 [3:33:31<14:32:11, 11.58s/it]                                                        {'loss': 0.0557, 'grad_norm': 8.433256149291992, 'learning_rate': 3.630544510970153e-05, 'epoch': 11.03}
- 22%|██▏       | 1280/5800 [3:33:31<14:32:11, 11.58s/it]score1 tensor([[0.5352],
-        [0.5117],
-        [0.5547],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4375, 0.5547, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:08,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 12:20:08,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.06 | bwd_microstep: 4559.53 | bwd_inner_microstep: 4553.20 | bwd_allreduce_microstep: 6.12 | step_microstep: 48.66
-[2025-01-25 12:20:08,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.01 | bwd: 4559.55 | bwd_inner: 4553.20 | bwd_allreduce: 6.25 | step: 48.67
- 22%|██▏       | 1281/5800 [3:33:38<12:45:25, 10.16s/it]                                                        {'loss': 0.0366, 'grad_norm': 6.329970359802246, 'learning_rate': 3.6298975356001576e-05, 'epoch': 11.04}
- 22%|██▏       | 1281/5800 [3:33:38<12:45:25, 10.16s/it]score1 tensor([[0.5547],
-        [0.5273],
-        [0.4844],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4277, 0.3906, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:15,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 12:20:15,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.03 | bwd_microstep: 4613.54 | bwd_inner_microstep: 4607.52 | bwd_allreduce_microstep: 5.88 | step_microstep: 49.94
-[2025-01-25 12:20:15,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.94 | bwd: 4613.57 | bwd_inner: 4607.52 | bwd_allreduce: 5.96 | step: 49.96
- 22%|██▏       | 1282/5800 [3:33:45<11:31:28,  9.18s/it]                                                        {'loss': 0.0659, 'grad_norm': 7.6740193367004395, 'learning_rate': 3.6292500519998295e-05, 'epoch': 11.05}
- 22%|██▏       | 1282/5800 [3:33:45<11:31:28,  9.18s/it]score1 tensor([[0.4375],
-        [0.4785],
-        [0.5586],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.5039, 0.5625, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:22,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 12:20:22,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.07 | bwd_microstep: 4621.43 | bwd_inner_microstep: 4616.17 | bwd_allreduce_microstep: 5.15 | step_microstep: 46.43
-[2025-01-25 12:20:22,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.04 | bwd: 4621.46 | bwd_inner: 4616.17 | bwd_allreduce: 5.21 | step: 46.44
- 22%|██▏       | 1283/5800 [3:33:52<10:39:48,  8.50s/it]                                                        {'loss': 0.0444, 'grad_norm': 0.4340049624443054, 'learning_rate': 3.628602060371065e-05, 'epoch': 11.06}
- 22%|██▏       | 1283/5800 [3:33:52<10:39:48,  8.50s/it]score1 tensor([[0.5547],
-        [0.5273],
-        [0.5078],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4941, 0.4004, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:29,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.04 | optimizer_step: 4.37
-[2025-01-25 12:20:29,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.77 | bwd_microstep: 4626.19 | bwd_inner_microstep: 4620.71 | bwd_allreduce_microstep: 5.37 | step_microstep: 58.79
-[2025-01-25 12:20:29,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.74 | bwd: 4626.21 | bwd_inner: 4620.71 | bwd_allreduce: 5.43 | step: 58.80
- 22%|██▏       | 1284/5800 [3:33:59<10:04:00,  8.02s/it]                                                        {'loss': 0.0752, 'grad_norm': 7.882734298706055, 'learning_rate': 3.6279535609159193e-05, 'epoch': 11.07}
- 22%|██▏       | 1284/5800 [3:33:59<10:04:00,  8.02s/it]score1 tensor([[0.5977],
-        [0.5156],
-        [0.5156],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.5664, 0.4902, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:36,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.85 | optimizer_step: 4.36
-[2025-01-25 12:20:36,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.97 | bwd_microstep: 4625.81 | bwd_inner_microstep: 4619.62 | bwd_allreduce_microstep: 6.01 | step_microstep: 79.55
-[2025-01-25 12:20:36,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.91 | bwd: 4625.86 | bwd_inner: 4619.62 | bwd_allreduce: 6.10 | step: 79.57
- 22%|██▏       | 1285/5800 [3:34:06<9:40:23,  7.71s/it]                                                        {'loss': 0.0288, 'grad_norm': 4.010962963104248, 'learning_rate': 3.6273045538366064e-05, 'epoch': 11.08}
- 22%|██▏       | 1285/5800 [3:34:06<9:40:23,  7.71s/it]score1 tensor([[0.4609],
-        [0.6484],
-        [0.5234],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6719, 0.6484, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:42,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.94 | optimizer_step: 4.36
-[2025-01-25 12:20:43,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.69 | bwd_microstep: 4617.97 | bwd_inner_microstep: 4611.79 | bwd_allreduce_microstep: 5.96 | step_microstep: 51.54
-[2025-01-25 12:20:43,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.66 | bwd: 4618.03 | bwd_inner: 4611.79 | bwd_allreduce: 6.07 | step: 51.56
- 22%|██▏       | 1286/5800 [3:34:12<9:22:53,  7.48s/it]                                                       {'loss': 0.0625, 'grad_norm': 4.1160101890563965, 'learning_rate': 3.626655039335497e-05, 'epoch': 11.09}
- 22%|██▏       | 1286/5800 [3:34:13<9:22:53,  7.48s/it]score1 tensor([[0.4727],
-        [0.4648],
-        [0.4980],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4648, 0.5781, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:49,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 12:20:49,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.62 | bwd_microstep: 4565.11 | bwd_inner_microstep: 4559.54 | bwd_allreduce_microstep: 5.47 | step_microstep: 44.24
-[2025-01-25 12:20:49,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.51 | bwd: 4565.13 | bwd_inner: 4559.54 | bwd_allreduce: 5.52 | step: 44.24
- 22%|██▏       | 1287/5800 [3:34:19<9:08:28,  7.29s/it]                                                       {'loss': 0.041, 'grad_norm': 5.236188888549805, 'learning_rate': 3.626005017615121e-05, 'epoch': 11.09}
- 22%|██▏       | 1287/5800 [3:34:19<9:08:28,  7.29s/it]score1 tensor([[0.3691],
-        [0.4629],
-        [0.5039],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4648, 0.5586, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:20:56,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:20:56,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.95 | bwd_microstep: 4617.48 | bwd_inner_microstep: 4612.48 | bwd_allreduce_microstep: 4.90 | step_microstep: 42.38
-[2025-01-25 12:20:56,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4617.50 | bwd_inner: 4612.48 | bwd_allreduce: 4.95 | step: 42.38
- 22%|██▏       | 1288/5800 [3:34:26<8:59:14,  7.17s/it]                                                       {'loss': 0.0356, 'grad_norm': 7.176864147186279, 'learning_rate': 3.625354488878168e-05, 'epoch': 11.1}
- 22%|██▏       | 1288/5800 [3:34:26<8:59:14,  7.17s/it]score1 tensor([[0.5703],
-        [0.4824],
-        [0.5938],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5781, 0.6055, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:03,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 12:21:03,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.97 | bwd_microstep: 4619.89 | bwd_inner_microstep: 4615.04 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.44
-[2025-01-25 12:21:03,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.94 | bwd: 4619.91 | bwd_inner: 4615.04 | bwd_allreduce: 4.80 | step: 42.45
- 22%|██▏       | 1289/5800 [3:34:33<8:52:45,  7.09s/it]                                                       {'loss': 0.0327, 'grad_norm': 4.228385925292969, 'learning_rate': 3.624703453327482e-05, 'epoch': 11.11}
- 22%|██▏       | 1289/5800 [3:34:33<8:52:45,  7.09s/it]score1 tensor([[0.4824],
-        [0.4824],
-        [0.4824],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4570, 0.4512, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:10,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 12:21:10,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.30 | bwd_microstep: 4615.39 | bwd_inner_microstep: 4610.84 | bwd_allreduce_microstep: 4.47 | step_microstep: 39.85
-[2025-01-25 12:21:10,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.28 | bwd: 4615.41 | bwd_inner: 4610.84 | bwd_allreduce: 4.51 | step: 39.86
- 22%|██▏       | 1290/5800 [3:34:40<8:47:52,  7.02s/it]                                                       {'loss': 0.0527, 'grad_norm': 7.465330600738525, 'learning_rate': 3.6240519111660686e-05, 'epoch': 11.12}
- 22%|██▏       | 1290/5800 [3:34:40<8:47:52,  7.02s/it]score1 tensor([[0.5430],
-        [0.5234],
-        [0.6406],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4844, 0.6211, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:17,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 12:21:17,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.09 | bwd_microstep: 4623.57 | bwd_inner_microstep: 4618.23 | bwd_allreduce_microstep: 5.23 | step_microstep: 47.13
-[2025-01-25 12:21:17,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.06 | bwd: 4623.59 | bwd_inner: 4618.23 | bwd_allreduce: 5.29 | step: 47.15
- 22%|██▏       | 1291/5800 [3:34:47<8:45:10,  6.99s/it]                                                       {'loss': 0.0283, 'grad_norm': 4.325109481811523, 'learning_rate': 3.62339986259709e-05, 'epoch': 11.13}
- 22%|██▏       | 1291/5800 [3:34:47<8:45:10,  6.99s/it]score1 tensor([[0.4941],
-        [0.5430],
-        [0.5234],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4941, 0.5156, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:24,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.26 | optimizer_step: 4.36
-[2025-01-25 12:21:24,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.96 | bwd_microstep: 4635.07 | bwd_inner_microstep: 4628.06 | bwd_allreduce_microstep: 6.83 | step_microstep: 69.43
-[2025-01-25 12:21:24,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.91 | bwd: 4635.13 | bwd_inner: 4628.06 | bwd_allreduce: 6.92 | step: 69.36
- 22%|██▏       | 1292/5800 [3:34:54<8:44:33,  6.98s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.088429927825928, 'learning_rate': 3.622747307823865e-05, 'epoch': 11.14}
- 22%|██▏       | 1292/5800 [3:34:54<8:44:33,  6.98s/it]score1 tensor([[0.5234],
-        [0.4922],
-        [0.5742],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4785, 0.6094, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:31,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.97 | optimizer_step: 5.04
-[2025-01-25 12:21:31,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2173.58 | bwd_microstep: 4631.98 | bwd_inner_microstep: 4626.49 | bwd_allreduce_microstep: 5.36 | step_microstep: 78.20
-[2025-01-25 12:21:31,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2173.51 | bwd: 4632.01 | bwd_inner: 4626.49 | bwd_allreduce: 5.43 | step: 78.23
- 22%|██▏       | 1293/5800 [3:35:01<8:44:39,  6.98s/it]                                                       {'loss': 0.0327, 'grad_norm': 4.189290523529053, 'learning_rate': 3.622094247049873e-05, 'epoch': 11.15}
- 22%|██▏       | 1293/5800 [3:35:01<8:44:39,  6.98s/it]score1 tensor([[0.4434],
-        [0.5195],
-        [0.4805],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5273, 0.4883, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:38,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 12:21:38,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.26 | bwd_microstep: 4620.74 | bwd_inner_microstep: 4615.92 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.89
-[2025-01-25 12:21:38,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.15 | bwd: 4620.81 | bwd_inner: 4615.92 | bwd_allreduce: 4.77 | step: 41.90
- 22%|██▏       | 1294/5800 [3:35:08<8:42:47,  6.96s/it]                                                       {'loss': 0.0151, 'grad_norm': 7.262045860290527, 'learning_rate': 3.6214406804787484e-05, 'epoch': 11.16}
- 22%|██▏       | 1294/5800 [3:35:08<8:42:47,  6.96s/it]score1 tensor([[0.6406],
-        [0.3906],
-        [0.4863],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.3945, 0.4414, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:45,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 12:21:45,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.88 | bwd_microstep: 4624.76 | bwd_inner_microstep: 4620.10 | bwd_allreduce_microstep: 4.57 | step_microstep: 45.86
-[2025-01-25 12:21:45,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.84 | bwd: 4624.79 | bwd_inner: 4620.10 | bwd_allreduce: 4.61 | step: 45.87
- 22%|██▏       | 1295/5800 [3:35:15<8:41:18,  6.94s/it]                                                       {'loss': 0.0161, 'grad_norm': 4.255095958709717, 'learning_rate': 3.620786608314285e-05, 'epoch': 11.16}
- 22%|██▏       | 1295/5800 [3:35:15<8:41:18,  6.94s/it]score1 tensor([[0.4336],
-        [0.6094],
-        [0.5234],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.6016, 0.5391, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:52,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 12:21:52,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.95 | bwd_microstep: 4618.64 | bwd_inner_microstep: 4613.93 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.16
-[2025-01-25 12:21:52,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.91 | bwd: 4618.66 | bwd_inner: 4613.93 | bwd_allreduce: 4.66 | step: 43.17
- 22%|██▏       | 1296/5800 [3:35:22<8:39:57,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 1.0820928812026978, 'learning_rate': 3.620132030760435e-05, 'epoch': 11.17}
- 22%|██▏       | 1296/5800 [3:35:22<8:39:57,  6.93s/it]score1 tensor([[0.6406],
-        [0.4688],
-        [0.5430],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4512, 0.5703, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:21:58,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 12:21:58,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.68 | bwd_microstep: 4627.99 | bwd_inner_microstep: 4623.40 | bwd_allreduce_microstep: 4.51 | step_microstep: 40.89
-[2025-01-25 12:21:58,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4628.01 | bwd_inner: 4623.40 | bwd_allreduce: 4.55 | step: 40.89
- 22%|██▏       | 1297/5800 [3:35:28<8:39:10,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 1.0528266429901123, 'learning_rate': 3.6194769480213054e-05, 'epoch': 11.18}
- 22%|██▏       | 1297/5800 [3:35:28<8:39:10,  6.92s/it]score1 tensor([[0.4434],
-        [0.5664],
-        [0.4453],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5352, 0.4238, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:05,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.45
-[2025-01-25 12:22:05,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.02 | bwd_microstep: 4630.65 | bwd_inner_microstep: 4625.66 | bwd_allreduce_microstep: 4.88 | step_microstep: 42.71
-[2025-01-25 12:22:05,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4630.67 | bwd_inner: 4625.66 | bwd_allreduce: 4.93 | step: 42.71
- 22%|██▏       | 1298/5800 [3:35:35<8:38:48,  6.91s/it]                                                       {'loss': 0.0308, 'grad_norm': 3.632497549057007, 'learning_rate': 3.618821360301163e-05, 'epoch': 11.19}
- 22%|██▏       | 1298/5800 [3:35:35<8:38:48,  6.91s/it]score1 tensor([[0.5547],
-        [0.4609],
-        [0.4316],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4707, 0.4531, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:12,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 12:22:12,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.94 | bwd_microstep: 4630.24 | bwd_inner_microstep: 4624.27 | bwd_allreduce_microstep: 5.81 | step_microstep: 41.63
-[2025-01-25 12:22:12,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4630.27 | bwd_inner: 4624.27 | bwd_allreduce: 5.90 | step: 41.64
- 22%|██▏       | 1299/5800 [3:35:42<8:38:31,  6.91s/it]                                                       {'loss': 0.0244, 'grad_norm': 3.4096648693084717, 'learning_rate': 3.618165267804432e-05, 'epoch': 11.2}
- 22%|██▏       | 1299/5800 [3:35:42<8:38:31,  6.91s/it]score1 tensor([[0.5586],
-        [0.3945],
-        [0.4707],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.3789, 0.4570, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:19,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.28 | optimizer_step: 4.37
-[2025-01-25 12:22:19,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.26 | bwd_microstep: 4589.54 | bwd_inner_microstep: 4583.12 | bwd_allreduce_microstep: 6.32 | step_microstep: 48.97
-[2025-01-25 12:22:19,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.23 | bwd: 4589.56 | bwd_inner: 4583.12 | bwd_allreduce: 6.37 | step: 49.01
- 22%|██▏       | 1300/5800 [3:35:49<8:37:53,  6.91s/it]                                                       {'loss': 0.0132, 'grad_norm': 1.541059970855713, 'learning_rate': 3.617508670735692e-05, 'epoch': 11.21}
- 22%|██▏       | 1300/5800 [3:35:49<8:37:53,  6.91s/it]score1 tensor([[0.4082],
-        [0.4941],
-        [0.5742],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4922, 0.5742, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:26,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.48 | optimizer_step: 4.37
-[2025-01-25 12:22:26,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.79 | bwd_microstep: 4590.07 | bwd_inner_microstep: 4585.26 | bwd_allreduce_microstep: 4.68 | step_microstep: 75.70
-[2025-01-25 12:22:26,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.72 | bwd: 4590.10 | bwd_inner: 4585.26 | bwd_allreduce: 4.76 | step: 75.75
- 22%|██▏       | 1301/5800 [3:35:56<8:38:18,  6.91s/it]                                                       {'loss': 0.0083, 'grad_norm': 2.185786485671997, 'learning_rate': 3.6168515692996826e-05, 'epoch': 11.22}
- 22%|██▏       | 1301/5800 [3:35:56<8:38:18,  6.91s/it]score1 tensor([[0.5391],
-        [0.4102],
-        [0.4121],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4844, 0.4121, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:33,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.52 | optimizer_step: 4.36
-[2025-01-25 12:22:33,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.16 | bwd_microstep: 4595.49 | bwd_inner_microstep: 4590.35 | bwd_allreduce_microstep: 5.03 | step_microstep: 71.45
-[2025-01-25 12:22:33,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.09 | bwd: 4595.51 | bwd_inner: 4590.35 | bwd_allreduce: 5.09 | step: 71.45
- 22%|██▏       | 1302/5800 [3:36:03<8:38:32,  6.92s/it]                                                       {'loss': 0.0566, 'grad_norm': 5.837783336639404, 'learning_rate': 3.6161939637012995e-05, 'epoch': 11.22}
- 22%|██▏       | 1302/5800 [3:36:03<8:38:32,  6.92s/it]score1 tensor([[0.3809],
-        [0.3828],
-        [0.3340],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4609, 0.2812, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:40,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 12:22:40,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4621.75 | bwd_inner_microstep: 4616.87 | bwd_allreduce_microstep: 4.73 | step_microstep: 53.37
-[2025-01-25 12:22:40,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.82 | bwd: 4621.78 | bwd_inner: 4616.87 | bwd_allreduce: 4.82 | step: 53.37
- 22%|██▏       | 1303/5800 [3:36:10<8:38:36,  6.92s/it]                                                       {'loss': 0.0688, 'grad_norm': 3.6597213745117188, 'learning_rate': 3.6155358541455947e-05, 'epoch': 11.23}
- 22%|██▏       | 1303/5800 [3:36:10<8:38:36,  6.92s/it]score1 tensor([[0.5312],
-        [0.3965],
-        [0.4551],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4746, 0.4766, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:47,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 12:22:47,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.83 | bwd_microstep: 4622.19 | bwd_inner_microstep: 4616.54 | bwd_allreduce_microstep: 5.56 | step_microstep: 52.09
-[2025-01-25 12:22:47,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.78 | bwd: 4622.21 | bwd_inner: 4616.54 | bwd_allreduce: 5.60 | step: 52.10
- 22%|██▏       | 1304/5800 [3:36:17<8:38:22,  6.92s/it]                                                       {'loss': 0.0654, 'grad_norm': 7.380373954772949, 'learning_rate': 3.614877240837779e-05, 'epoch': 11.24}
- 22%|██▏       | 1304/5800 [3:36:17<8:38:22,  6.92s/it]score1 tensor([[0.5195],
-        [0.4492],
-        [0.5430],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4980, 0.5391, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:22:54,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 12:22:54,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.01 | bwd_microstep: 4626.59 | bwd_inner_microstep: 4621.55 | bwd_allreduce_microstep: 4.93 | step_microstep: 47.60
-[2025-01-25 12:22:54,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.96 | bwd: 4626.62 | bwd_inner: 4621.55 | bwd_allreduce: 4.99 | step: 47.61
- 22%|██▎       | 1305/5800 [3:36:24<8:38:17,  6.92s/it]                                                       {'loss': 0.0259, 'grad_norm': 0.46146243810653687, 'learning_rate': 3.614218123983219e-05, 'epoch': 11.25}
- 22%|██▎       | 1305/5800 [3:36:24<8:38:17,  6.92s/it]score1 tensor([[0.5273],
-        [0.5234],
-        [0.5664],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5664, 0.6094, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:01,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 12:23:01,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.23 | bwd_microstep: 4627.61 | bwd_inner_microstep: 4622.40 | bwd_allreduce_microstep: 5.13 | step_microstep: 43.91
-[2025-01-25 12:23:01,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.18 | bwd: 4627.63 | bwd_inner: 4622.39 | bwd_allreduce: 5.17 | step: 43.91
- 23%|██▎       | 1306/5800 [3:36:31<8:38:00,  6.92s/it]                                                       {'loss': 0.0479, 'grad_norm': 3.917523145675659, 'learning_rate': 3.6135585037874386e-05, 'epoch': 11.26}
- 23%|██▎       | 1306/5800 [3:36:31<8:38:00,  6.92s/it]score1 tensor([[0.6680],
-        [0.3652],
-        [0.4277],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3613, 0.4629, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:08,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:23:08,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4581.56 | bwd_inner_microstep: 4573.04 | bwd_allreduce_microstep: 8.41 | step_microstep: 42.25
-[2025-01-25 12:23:08,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.94 | bwd: 4581.59 | bwd_inner: 4573.04 | bwd_allreduce: 8.47 | step: 42.26
- 23%|██▎       | 1307/5800 [3:36:38<8:36:35,  6.90s/it]                                                       {'loss': 0.0215, 'grad_norm': 2.211463451385498, 'learning_rate': 3.612898380456119e-05, 'epoch': 11.27}
- 23%|██▎       | 1307/5800 [3:36:38<8:36:35,  6.90s/it]score1 tensor([[0.5664],
-        [0.3945],
-        [0.4902],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.3652, 0.5195, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:14,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 12:23:14,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.33 | bwd_microstep: 4634.83 | bwd_inner_microstep: 4629.00 | bwd_allreduce_microstep: 5.73 | step_microstep: 45.93
-[2025-01-25 12:23:14,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.28 | bwd: 4634.86 | bwd_inner: 4629.00 | bwd_allreduce: 5.79 | step: 45.94
- 23%|██▎       | 1308/5800 [3:36:44<8:36:58,  6.91s/it]                                                       {'loss': 0.0391, 'grad_norm': 4.39670991897583, 'learning_rate': 3.612237754195098e-05, 'epoch': 11.28}
- 23%|██▎       | 1308/5800 [3:36:44<8:36:58,  6.91s/it]score1 tensor([[0.5391],
-        [0.6289],
-        [0.4863],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6289, 0.4551, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:21,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 12:23:21,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.30 | bwd_microstep: 4573.08 | bwd_inner_microstep: 4568.17 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.83
-[2025-01-25 12:23:21,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.26 | bwd: 4573.10 | bwd_inner: 4568.17 | bwd_allreduce: 4.86 | step: 43.84
- 23%|██▎       | 1309/5800 [3:36:51<8:35:47,  6.89s/it]                                                       {'loss': 0.0098, 'grad_norm': 2.1342720985412598, 'learning_rate': 3.6115766252103715e-05, 'epoch': 11.28}
- 23%|██▎       | 1309/5800 [3:36:51<8:35:47,  6.89s/it]score1 tensor([[0.4414],
-        [0.5664],
-        [0.5352],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6055, 0.5469, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:28,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 12:23:28,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4627.25 | bwd_inner_microstep: 4622.41 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.27
-[2025-01-25 12:23:28,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4627.27 | bwd_inner: 4622.41 | bwd_allreduce: 4.79 | step: 44.28
- 23%|██▎       | 1310/5800 [3:36:58<8:36:26,  6.90s/it]                                                       {'loss': 0.0264, 'grad_norm': 4.098786354064941, 'learning_rate': 3.610914993708089e-05, 'epoch': 11.29}
- 23%|██▎       | 1310/5800 [3:36:58<8:36:26,  6.90s/it]score1 tensor([[0.6289],
-        [0.4941],
-        [0.5000],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4082, 0.4766, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:35,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.03 | optimizer_step: 4.36
-[2025-01-25 12:23:35,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.14 | bwd_microstep: 4628.58 | bwd_inner_microstep: 4619.16 | bwd_allreduce_microstep: 9.32 | step_microstep: 64.86
-[2025-01-25 12:23:35,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.07 | bwd: 4628.60 | bwd_inner: 4619.16 | bwd_allreduce: 9.37 | step: 64.92
- 23%|██▎       | 1311/5800 [3:37:05<8:37:21,  6.92s/it]                                                       {'loss': 0.0537, 'grad_norm': 8.159200668334961, 'learning_rate': 3.6102528598945595e-05, 'epoch': 11.3}
- 23%|██▎       | 1311/5800 [3:37:05<8:37:21,  6.92s/it]score1 tensor([[0.5039],
-        [0.4980],
-        [0.6094],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5195, 0.5430, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:42,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 12:23:42,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.77 | bwd_microstep: 4637.41 | bwd_inner_microstep: 4631.92 | bwd_allreduce_microstep: 5.40 | step_microstep: 46.27
-[2025-01-25 12:23:42,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.68 | bwd: 4637.44 | bwd_inner: 4631.92 | bwd_allreduce: 5.45 | step: 46.29
- 23%|██▎       | 1312/5800 [3:37:12<8:37:58,  6.92s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.905186891555786, 'learning_rate': 3.609590223976248e-05, 'epoch': 11.31}
- 23%|██▎       | 1312/5800 [3:37:12<8:37:58,  6.92s/it]score1 tensor([[0.5000],
-        [0.6445],
-        [0.5859],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5664, 0.5195, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:49,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:23:49,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.05 | bwd_microstep: 4576.14 | bwd_inner_microstep: 4569.89 | bwd_allreduce_microstep: 6.13 | step_microstep: 46.65
-[2025-01-25 12:23:49,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.99 | bwd: 4576.16 | bwd_inner: 4569.89 | bwd_allreduce: 6.20 | step: 46.64
- 23%|██▎       | 1313/5800 [3:37:19<8:36:43,  6.91s/it]                                                       {'loss': 0.0542, 'grad_norm': 6.255759239196777, 'learning_rate': 3.608927086159776e-05, 'epoch': 11.32}
- 23%|██▎       | 1313/5800 [3:37:19<8:36:43,  6.91s/it]score1 tensor([[0.4785],
-        [0.5977],
-        [0.4863],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5508, 0.4688, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:23:56,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 12:23:56,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.21 | bwd_microstep: 4628.07 | bwd_inner_microstep: 4619.38 | bwd_allreduce_microstep: 8.58 | step_microstep: 46.76
-[2025-01-25 12:23:56,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4628.09 | bwd_inner: 4619.38 | bwd_allreduce: 8.63 | step: 46.77
- 23%|██▎       | 1314/5800 [3:37:26<8:36:48,  6.91s/it]                                                       {'loss': 0.0352, 'grad_norm': 7.996276378631592, 'learning_rate': 3.608263446651922e-05, 'epoch': 11.33}
- 23%|██▎       | 1314/5800 [3:37:26<8:36:48,  6.91s/it]score1 tensor([[0.5000],
-        [0.4062],
-        [0.5547],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4434, 0.6172, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:03,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 12:24:03,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.90 | bwd_microstep: 4625.77 | bwd_inner_microstep: 4620.98 | bwd_allreduce_microstep: 4.70 | step_microstep: 44.40
-[2025-01-25 12:24:03,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.85 | bwd: 4625.79 | bwd_inner: 4620.98 | bwd_allreduce: 4.75 | step: 44.41
- 23%|██▎       | 1315/5800 [3:37:33<8:36:39,  6.91s/it]                                                       {'loss': 0.0371, 'grad_norm': 0.5800384879112244, 'learning_rate': 3.6075993056596186e-05, 'epoch': 11.34}
- 23%|██▎       | 1315/5800 [3:37:33<8:36:39,  6.91s/it]score1 tensor([[0.3652],
-        [0.6328],
-        [0.5742],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.6875, 0.6055, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:10,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 12:24:10,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.48 | bwd_microstep: 4573.25 | bwd_inner_microstep: 4567.37 | bwd_allreduce_microstep: 5.79 | step_microstep: 47.12
-[2025-01-25 12:24:10,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.45 | bwd: 4573.27 | bwd_inner: 4567.37 | bwd_allreduce: 5.83 | step: 47.12
- 23%|██▎       | 1316/5800 [3:37:40<8:35:20,  6.90s/it]                                                       {'loss': 0.0361, 'grad_norm': 6.239884853363037, 'learning_rate': 3.606934663389957e-05, 'epoch': 11.34}
- 23%|██▎       | 1316/5800 [3:37:40<8:35:20,  6.90s/it]score1 tensor([[0.6680],
-        [0.5586],
-        [0.4414],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5820, 0.4570, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:17,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 12:24:17,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.04 | bwd_microstep: 4634.05 | bwd_inner_microstep: 4629.33 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.57
-[2025-01-25 12:24:17,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.01 | bwd: 4634.07 | bwd_inner: 4629.33 | bwd_allreduce: 4.67 | step: 42.58
- 23%|██▎       | 1317/5800 [3:37:47<8:35:56,  6.91s/it]                                                       {'loss': 0.0254, 'grad_norm': 4.0421953201293945, 'learning_rate': 3.6062695200501856e-05, 'epoch': 11.35}
- 23%|██▎       | 1317/5800 [3:37:47<8:35:56,  6.91s/it]score1 tensor([[0.4707],
-        [0.3926],
-        [0.5898],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4551, 0.6367, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:24,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 12:24:24,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.44 | bwd_microstep: 4632.70 | bwd_inner_microstep: 4628.20 | bwd_allreduce_microstep: 4.40 | step_microstep: 46.22
-[2025-01-25 12:24:24,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.40 | bwd: 4632.73 | bwd_inner: 4628.21 | bwd_allreduce: 4.45 | step: 46.23
- 23%|██▎       | 1318/5800 [3:37:54<8:35:58,  6.91s/it]                                                       {'loss': 0.0317, 'grad_norm': 0.486285537481308, 'learning_rate': 3.605603875847707e-05, 'epoch': 11.36}
- 23%|██▎       | 1318/5800 [3:37:54<8:35:58,  6.91s/it]score1 tensor([[0.4707],
-        [0.5234],
-        [0.4141],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5547, 0.4023, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:30,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.11 | optimizer_step: 4.37
-[2025-01-25 12:24:30,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4629.46 | bwd_inner_microstep: 4619.18 | bwd_allreduce_microstep: 10.15 | step_microstep: 60.77
-[2025-01-25 12:24:30,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.91 | bwd: 4629.48 | bwd_inner: 4619.18 | bwd_allreduce: 10.21 | step: 60.79
- 23%|██▎       | 1319/5800 [3:38:00<8:36:41,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.5720790028572083, 'learning_rate': 3.604937730990079e-05, 'epoch': 11.37}
- 23%|██▎       | 1319/5800 [3:38:00<8:36:41,  6.92s/it]score1 tensor([[0.6055],
-        [0.4062],
-        [0.5664],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4277, 0.5586, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:37,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 12:24:37,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4640.21 | bwd_inner_microstep: 4633.21 | bwd_allreduce_microstep: 6.91 | step_microstep: 42.64
-[2025-01-25 12:24:37,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.93 | bwd: 4640.24 | bwd_inner: 4633.21 | bwd_allreduce: 6.97 | step: 42.65
- 23%|██▎       | 1320/5800 [3:38:07<8:37:23,  6.93s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.007045745849609, 'learning_rate': 3.604271085685019e-05, 'epoch': 11.38}
- 23%|██▎       | 1320/5800 [3:38:07<8:37:23,  6.93s/it]score1 tensor([[0.5273],
-        [0.4277],
-        [0.5039],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4238, 0.5078, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:44,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.84 | optimizer_step: 4.37
-[2025-01-25 12:24:44,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4628.89 | bwd_inner_microstep: 4624.66 | bwd_allreduce_microstep: 4.15 | step_microstep: 52.57
-[2025-01-25 12:24:44,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4628.91 | bwd_inner: 4624.66 | bwd_allreduce: 4.20 | step: 52.58
- 23%|██▎       | 1321/5800 [3:38:14<8:37:53,  6.94s/it]                                                       {'loss': 0.0254, 'grad_norm': 3.4697182178497314, 'learning_rate': 3.6036039401403986e-05, 'epoch': 11.39}
- 23%|██▎       | 1321/5800 [3:38:14<8:37:53,  6.94s/it]score1 tensor([[0.4062],
-        [0.6055],
-        [0.5898],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.6836, 0.6094, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:51,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.52 | optimizer_step: 4.37
-[2025-01-25 12:24:51,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.19 | bwd_microstep: 4627.67 | bwd_inner_microstep: 4620.20 | bwd_allreduce_microstep: 7.24 | step_microstep: 74.68
-[2025-01-25 12:24:51,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.14 | bwd: 4627.72 | bwd_inner: 4620.20 | bwd_allreduce: 7.35 | step: 74.71
- 23%|██▎       | 1322/5800 [3:38:21<8:38:15,  6.94s/it]                                                       {'loss': 0.0522, 'grad_norm': 4.046682834625244, 'learning_rate': 3.6029362945642436e-05, 'epoch': 11.4}
- 23%|██▎       | 1322/5800 [3:38:21<8:38:15,  6.94s/it]score1 tensor([[0.4355],
-        [0.5664],
-        [0.3535],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6094, 0.3105, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:24:58,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 12:24:58,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4629.05 | bwd_inner_microstep: 4624.18 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.18
-[2025-01-25 12:24:58,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.42 | bwd: 4629.07 | bwd_inner: 4624.18 | bwd_allreduce: 4.82 | step: 42.18
- 23%|██▎       | 1323/5800 [3:38:28<8:37:20,  6.93s/it]                                                       {'loss': 0.0435, 'grad_norm': 0.739618182182312, 'learning_rate': 3.602268149164739e-05, 'epoch': 11.41}
- 23%|██▎       | 1323/5800 [3:38:28<8:37:20,  6.93s/it]score1 tensor([[0.4180],
-        [0.6367],
-        [0.6680],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5469, 0.6641, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:05,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 12:25:05,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.67 | bwd_microstep: 4629.52 | bwd_inner_microstep: 4624.67 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.92
-[2025-01-25 12:25:05,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.64 | bwd: 4629.54 | bwd_inner: 4624.67 | bwd_allreduce: 4.81 | step: 42.93
- 23%|██▎       | 1324/5800 [3:38:35<8:36:51,  6.93s/it]                                                       {'loss': 0.042, 'grad_norm': 4.985085964202881, 'learning_rate': 3.601599504150224e-05, 'epoch': 11.41}
- 23%|██▎       | 1324/5800 [3:38:35<8:36:51,  6.93s/it]score1 tensor([[0.4082],
-        [0.5000],
-        [0.3418],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4805, 0.3555, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:12,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 12:25:12,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.08 | bwd_microstep: 4620.91 | bwd_inner_microstep: 4616.05 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.44
-[2025-01-25 12:25:12,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.05 | bwd: 4620.94 | bwd_inner: 4616.05 | bwd_allreduce: 4.81 | step: 43.45
- 23%|██▎       | 1325/5800 [3:38:42<8:36:05,  6.92s/it]                                                       {'loss': 0.0273, 'grad_norm': 3.351613759994507, 'learning_rate': 3.600930359729193e-05, 'epoch': 11.42}
- 23%|██▎       | 1325/5800 [3:38:42<8:36:05,  6.92s/it]score1 tensor([[0.6836],
-        [0.4258],
-        [0.4141],
-        [0.7578]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.3457, 0.3750, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:19,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 12:25:19,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.58 | bwd_microstep: 4619.67 | bwd_inner_microstep: 4614.73 | bwd_allreduce_microstep: 4.72 | step_microstep: 41.96
-[2025-01-25 12:25:19,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.55 | bwd: 4619.71 | bwd_inner: 4614.73 | bwd_allreduce: 4.88 | step: 41.96
- 23%|██▎       | 1326/5800 [3:38:49<8:35:23,  6.91s/it]                                                       {'loss': 0.0493, 'grad_norm': 3.576418161392212, 'learning_rate': 3.600260716110298e-05, 'epoch': 11.43}
- 23%|██▎       | 1326/5800 [3:38:49<8:35:23,  6.91s/it]score1 tensor([[0.5234],
-        [0.4941],
-        [0.5039],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4844, 0.4941, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:26,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 12:25:26,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.57 | bwd_microstep: 4621.77 | bwd_inner_microstep: 4616.88 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.34
-[2025-01-25 12:25:26,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.54 | bwd: 4621.79 | bwd_inner: 4616.88 | bwd_allreduce: 4.84 | step: 46.38
- 23%|██▎       | 1327/5800 [3:38:56<8:35:01,  6.91s/it]                                                       {'loss': 0.0298, 'grad_norm': 3.9606399536132812, 'learning_rate': 3.599590573502344e-05, 'epoch': 11.44}
- 23%|██▎       | 1327/5800 [3:38:56<8:35:01,  6.91s/it]score1 tensor([[0.4512],
-        [0.4570],
-        [0.5000],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4570, 0.4980, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:33,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:25:33,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.03 | bwd_microstep: 4578.30 | bwd_inner_microstep: 4573.35 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.91
-[2025-01-25 12:25:33,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.00 | bwd: 4578.32 | bwd_inner: 4573.35 | bwd_allreduce: 4.90 | step: 45.92
- 23%|██▎       | 1328/5800 [3:39:03<8:33:48,  6.89s/it]                                                       {'loss': 0.041, 'grad_norm': 2.426882028579712, 'learning_rate': 3.598919932114294e-05, 'epoch': 11.45}
- 23%|██▎       | 1328/5800 [3:39:03<8:33:48,  6.89s/it]score1 tensor([[0.4355],
-        [0.5547],
-        [0.3340],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4922, 0.3691, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:40,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 12:25:40,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.05 | bwd_microstep: 4625.23 | bwd_inner_microstep: 4617.39 | bwd_allreduce_microstep: 7.68 | step_microstep: 59.80
-[2025-01-25 12:25:40,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.01 | bwd: 4625.28 | bwd_inner: 4617.39 | bwd_allreduce: 7.76 | step: 59.78
- 23%|██▎       | 1329/5800 [3:39:10<8:35:11,  6.91s/it]                                                       {'loss': 0.0425, 'grad_norm': 1.3182213306427002, 'learning_rate': 3.598248792155266e-05, 'epoch': 11.46}
- 23%|██▎       | 1329/5800 [3:39:10<8:35:11,  6.91s/it]score1 tensor([[0.3926],
-        [0.5117],
-        [0.4805],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4961, 0.4375, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:47,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 12:25:47,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.57 | bwd_microstep: 4623.59 | bwd_inner_microstep: 4618.65 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.55
-[2025-01-25 12:25:47,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.52 | bwd: 4623.62 | bwd_inner: 4618.65 | bwd_allreduce: 4.90 | step: 42.55
- 23%|██▎       | 1330/5800 [3:39:17<8:35:27,  6.92s/it]                                                       {'loss': 0.0376, 'grad_norm': 3.968858003616333, 'learning_rate': 3.5975771538345325e-05, 'epoch': 11.47}
- 23%|██▎       | 1330/5800 [3:39:17<8:35:27,  6.92s/it]score1 tensor([[0.3438],
-        [0.2539],
-        [0.3555],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.3086, 0.3672, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:25:54,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 12:25:54,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.30 | bwd_microstep: 4623.98 | bwd_inner_microstep: 4618.71 | bwd_allreduce_microstep: 5.16 | step_microstep: 48.00
-[2025-01-25 12:25:54,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.27 | bwd: 4624.01 | bwd_inner: 4618.71 | bwd_allreduce: 5.22 | step: 48.00
- 23%|██▎       | 1331/5800 [3:39:24<8:34:58,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 6.712577819824219, 'learning_rate': 3.596905017361521e-05, 'epoch': 11.47}
- 23%|██▎       | 1331/5800 [3:39:24<8:34:58,  6.91s/it]score1 tensor([[0.6445],
-        [0.5703],
-        [0.4141],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6133, 0.4902, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:00,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 12:26:00,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.13 | bwd_microstep: 4623.89 | bwd_inner_microstep: 4619.28 | bwd_allreduce_microstep: 4.53 | step_microstep: 43.82
-[2025-01-25 12:26:00,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.10 | bwd: 4623.91 | bwd_inner: 4619.28 | bwd_allreduce: 4.57 | step: 43.82
- 23%|██▎       | 1332/5800 [3:39:30<8:34:32,  6.91s/it]                                                       {'loss': 0.0527, 'grad_norm': 7.674210548400879, 'learning_rate': 3.5962323829458175e-05, 'epoch': 11.48}
- 23%|██▎       | 1332/5800 [3:39:30<8:34:32,  6.91s/it]score1 tensor([[0.5742],
-        [0.3691],
-        [0.3730],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.4141, 0.4473, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:07,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 12:26:07,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.02 | bwd_microstep: 4621.03 | bwd_inner_microstep: 4616.46 | bwd_allreduce_microstep: 4.50 | step_microstep: 43.16
-[2025-01-25 12:26:07,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.99 | bwd: 4621.06 | bwd_inner: 4616.47 | bwd_allreduce: 4.53 | step: 43.16
- 23%|██▎       | 1333/5800 [3:39:37<8:34:16,  6.91s/it]                                                       {'loss': 0.0737, 'grad_norm': 6.983479976654053, 'learning_rate': 3.59555925079716e-05, 'epoch': 11.49}
- 23%|██▎       | 1333/5800 [3:39:37<8:34:16,  6.91s/it]score1 tensor([[0.4121],
-        [0.4102],
-        [0.4277],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5156, 0.4395, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:14,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 12:26:14,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.89 | bwd_microstep: 4622.95 | bwd_inner_microstep: 4617.23 | bwd_allreduce_microstep: 5.58 | step_microstep: 52.78
-[2025-01-25 12:26:14,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.86 | bwd: 4622.98 | bwd_inner: 4617.23 | bwd_allreduce: 5.68 | step: 52.79
- 23%|██▎       | 1334/5800 [3:39:44<8:34:12,  6.91s/it]                                                       {'loss': 0.0771, 'grad_norm': 6.708005428314209, 'learning_rate': 3.594885621125442e-05, 'epoch': 11.5}
- 23%|██▎       | 1334/5800 [3:39:44<8:34:12,  6.91s/it]score1 tensor([[0.4883],
-        [0.4766],
-        [0.4473],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5078, 0.5312, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:21,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 12:26:21,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.27 | bwd_microstep: 4619.06 | bwd_inner_microstep: 4613.84 | bwd_allreduce_microstep: 5.12 | step_microstep: 45.26
-[2025-01-25 12:26:21,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.23 | bwd: 4619.09 | bwd_inner: 4613.84 | bwd_allreduce: 5.16 | step: 45.27
- 23%|██▎       | 1335/5800 [3:39:51<8:34:19,  6.91s/it]                                                       {'loss': 0.0493, 'grad_norm': 7.547095775604248, 'learning_rate': 3.594211494140714e-05, 'epoch': 11.51}
- 23%|██▎       | 1335/5800 [3:39:51<8:34:19,  6.91s/it]score1 tensor([[0.5000],
-        [0.4375],
-        [0.5234],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4121, 0.6094, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:28,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 12:26:28,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.33 | bwd_microstep: 4622.05 | bwd_inner_microstep: 4617.17 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.08
-[2025-01-25 12:26:28,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.29 | bwd: 4622.08 | bwd_inner: 4617.18 | bwd_allreduce: 4.84 | step: 44.09
- 23%|██▎       | 1336/5800 [3:39:58<8:34:14,  6.91s/it]                                                       {'loss': 0.0337, 'grad_norm': 0.6893147826194763, 'learning_rate': 3.59353687005318e-05, 'epoch': 11.52}
- 23%|██▎       | 1336/5800 [3:39:58<8:34:14,  6.91s/it]score1 tensor([[0.5625],
-        [0.4824],
-        [0.4609],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4980, 0.4004, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:35,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 12:26:35,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4616.72 | bwd_inner_microstep: 4611.58 | bwd_allreduce_microstep: 5.05 | step_microstep: 47.94
-[2025-01-25 12:26:35,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.23 | bwd: 4616.75 | bwd_inner: 4611.58 | bwd_allreduce: 5.09 | step: 47.95
- 23%|██▎       | 1337/5800 [3:40:05<8:34:35,  6.92s/it]                                                       {'loss': 0.021, 'grad_norm': 0.5639159083366394, 'learning_rate': 3.5928617490732e-05, 'epoch': 11.53}
- 23%|██▎       | 1337/5800 [3:40:05<8:34:35,  6.92s/it]score1 tensor([[0.5508],
-        [0.4883],
-        [0.5078],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4805, 0.4180, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:42,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.51 | optimizer_step: 4.37
-[2025-01-25 12:26:42,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.80 | bwd_microstep: 4628.29 | bwd_inner_microstep: 4623.24 | bwd_allreduce_microstep: 4.98 | step_microstep: 68.38
-[2025-01-25 12:26:42,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.77 | bwd: 4628.31 | bwd_inner: 4623.24 | bwd_allreduce: 5.01 | step: 68.39
- 23%|██▎       | 1338/5800 [3:40:12<8:35:07,  6.93s/it]                                                       {'loss': 0.0435, 'grad_norm': 7.680257320404053, 'learning_rate': 3.592186131411288e-05, 'epoch': 11.53}
- 23%|██▎       | 1338/5800 [3:40:12<8:35:07,  6.93s/it]score1 tensor([[0.5039],
-        [0.4785],
-        [0.5352],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4336, 0.4531, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0894, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:49,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 12:26:49,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.20 | bwd_microstep: 4620.51 | bwd_inner_microstep: 4615.45 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.56
-[2025-01-25 12:26:49,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4620.54 | bwd_inner: 4615.45 | bwd_allreduce: 5.02 | step: 43.58
- 23%|██▎       | 1339/5800 [3:40:19<8:34:36,  6.92s/it]                                                       {'loss': 0.0894, 'grad_norm': 7.668815612792969, 'learning_rate': 3.591510017278113e-05, 'epoch': 11.54}
- 23%|██▎       | 1339/5800 [3:40:19<8:34:36,  6.92s/it]score1 tensor([[0.4844],
-        [0.4336],
-        [0.5820],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4199, 0.5391, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:26:56,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 12:26:56,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.56 | bwd_microstep: 4631.93 | bwd_inner_microstep: 4626.86 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.78
-[2025-01-25 12:26:56,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.52 | bwd: 4631.95 | bwd_inner: 4626.86 | bwd_allreduce: 5.03 | step: 42.81
- 23%|██▎       | 1340/5800 [3:40:26<8:34:23,  6.92s/it]                                                       {'loss': 0.0386, 'grad_norm': 7.617952823638916, 'learning_rate': 3.5908334068845e-05, 'epoch': 11.55}
- 23%|██▎       | 1340/5800 [3:40:26<8:34:23,  6.92s/it]score1 tensor([[0.5156],
-        [0.4648],
-        [0.5508],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4316, 0.5000, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:03,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 12:27:03,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.58 | bwd_microstep: 4626.69 | bwd_inner_microstep: 4621.63 | bwd_allreduce_microstep: 4.94 | step_microstep: 50.69
-[2025-01-25 12:27:03,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.54 | bwd: 4626.71 | bwd_inner: 4621.63 | bwd_allreduce: 5.00 | step: 50.75
- 23%|██▎       | 1341/5800 [3:40:33<8:34:17,  6.92s/it]                                                       {'loss': 0.0562, 'grad_norm': 7.51724910736084, 'learning_rate': 3.590156300441427e-05, 'epoch': 11.56}
- 23%|██▎       | 1341/5800 [3:40:33<8:34:17,  6.92s/it]score1 tensor([[0.5312],
-        [0.6914],
-        [0.5742],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6133, 0.5117, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:10,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 12:27:10,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.57 | bwd_microstep: 4628.28 | bwd_inner_microstep: 4620.85 | bwd_allreduce_microstep: 7.23 | step_microstep: 67.35
-[2025-01-25 12:27:10,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.54 | bwd: 4628.34 | bwd_inner: 4620.85 | bwd_allreduce: 7.33 | step: 67.35
- 23%|██▎       | 1342/5800 [3:40:40<8:35:01,  6.93s/it]                                                       {'loss': 0.0527, 'grad_norm': 8.165261268615723, 'learning_rate': 3.589478698160028e-05, 'epoch': 11.57}
- 23%|██▎       | 1342/5800 [3:40:40<8:35:01,  6.93s/it]score1 tensor([[0.3926],
-        [0.4746],
-        [0.6289],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5156, 0.6445, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:17,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:27:17,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.19 | bwd_microstep: 4631.87 | bwd_inner_microstep: 4625.00 | bwd_allreduce_microstep: 6.73 | step_microstep: 49.05
-[2025-01-25 12:27:17,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.16 | bwd: 4631.91 | bwd_inner: 4625.00 | bwd_allreduce: 6.79 | step: 49.07
- 23%|██▎       | 1343/5800 [3:40:47<8:35:09,  6.94s/it]                                                       {'loss': 0.0391, 'grad_norm': 7.588966369628906, 'learning_rate': 3.5888006002515914e-05, 'epoch': 11.58}
- 23%|██▎       | 1343/5800 [3:40:47<8:35:09,  6.94s/it]score1 tensor([[0.4180],
-        [0.4805],
-        [0.3789],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5391, 0.4336, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:24,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 12:27:24,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.87 | bwd_microstep: 4617.17 | bwd_inner_microstep: 4612.03 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.88
-[2025-01-25 12:27:24,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.84 | bwd: 4617.19 | bwd_inner: 4612.02 | bwd_allreduce: 5.10 | step: 42.89
- 23%|██▎       | 1344/5800 [3:40:54<8:34:32,  6.93s/it]                                                       {'loss': 0.0352, 'grad_norm': 0.7734765410423279, 'learning_rate': 3.58812200692756e-05, 'epoch': 11.59}
- 23%|██▎       | 1344/5800 [3:40:54<8:34:32,  6.93s/it]score1 tensor([[0.4219],
-        [0.4805],
-        [0.3965],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5430, 0.3789, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:30,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 12:27:30,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.88 | bwd_microstep: 4616.53 | bwd_inner_microstep: 4611.44 | bwd_allreduce_microstep: 5.02 | step_microstep: 42.43
-[2025-01-25 12:27:30,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.84 | bwd: 4616.56 | bwd_inner: 4611.44 | bwd_allreduce: 5.06 | step: 42.44
- 23%|██▎       | 1345/5800 [3:41:00<8:33:49,  6.92s/it]                                                       {'loss': 0.0444, 'grad_norm': 0.8267106413841248, 'learning_rate': 3.587442918399529e-05, 'epoch': 11.59}
- 23%|██▎       | 1345/5800 [3:41:00<8:33:49,  6.92s/it]score1 tensor([[0.4844],
-        [0.4863],
-        [0.5000],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5430, 0.5117, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:37,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 12:27:37,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.72 | bwd_microstep: 4617.56 | bwd_inner_microstep: 4613.10 | bwd_allreduce_microstep: 4.39 | step_microstep: 41.67
-[2025-01-25 12:27:37,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.69 | bwd: 4617.59 | bwd_inner: 4613.10 | bwd_allreduce: 4.42 | step: 41.68
- 23%|██▎       | 1346/5800 [3:41:07<8:32:59,  6.91s/it]                                                       {'loss': 0.0435, 'grad_norm': 7.404323577880859, 'learning_rate': 3.586763334879252e-05, 'epoch': 11.6}
- 23%|██▎       | 1346/5800 [3:41:07<8:32:59,  6.91s/it]score1 tensor([[0.4668],
-        [0.4160],
-        [0.5547],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4648, 0.6875, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:44,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 12:27:44,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.96 | bwd_microstep: 4620.62 | bwd_inner_microstep: 4615.58 | bwd_allreduce_microstep: 4.91 | step_microstep: 46.04
-[2025-01-25 12:27:44,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.92 | bwd: 4620.64 | bwd_inner: 4615.58 | bwd_allreduce: 4.97 | step: 46.04
- 23%|██▎       | 1347/5800 [3:41:14<8:32:39,  6.91s/it]                                                       {'loss': 0.0845, 'grad_norm': 7.392343044281006, 'learning_rate': 3.586083256578635e-05, 'epoch': 11.61}
- 23%|██▎       | 1347/5800 [3:41:14<8:32:39,  6.91s/it]score1 tensor([[0.3887],
-        [0.3945],
-        [0.4375],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4414, 0.4805, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:51,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:27:51,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4624.87 | bwd_inner_microstep: 4616.16 | bwd_allreduce_microstep: 8.63 | step_microstep: 42.61
-[2025-01-25 12:27:51,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4624.89 | bwd_inner: 4616.16 | bwd_allreduce: 8.67 | step: 42.62
- 23%|██▎       | 1348/5800 [3:41:21<8:32:27,  6.91s/it]                                                       {'loss': 0.0342, 'grad_norm': 6.847311019897461, 'learning_rate': 3.5854026837097377e-05, 'epoch': 11.62}
- 23%|██▎       | 1348/5800 [3:41:21<8:32:27,  6.91s/it]score1 tensor([[0.5586],
-        [0.4043],
-        [0.4688],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4160, 0.4609, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:27:58,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 12:27:58,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.38 | bwd_microstep: 4624.07 | bwd_inner_microstep: 4619.39 | bwd_allreduce_microstep: 4.60 | step_microstep: 45.65
-[2025-01-25 12:27:58,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.34 | bwd: 4624.09 | bwd_inner: 4619.39 | bwd_allreduce: 4.64 | step: 45.66
- 23%|██▎       | 1349/5800 [3:41:28<8:32:12,  6.90s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.9824604988098145, 'learning_rate': 3.584721616484774e-05, 'epoch': 11.63}
- 23%|██▎       | 1349/5800 [3:41:28<8:32:12,  6.90s/it]score1 tensor([[0.4727],
-        [0.4727],
-        [0.5859],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4219, 0.6094, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:05,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 12:28:05,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.26 | bwd_microstep: 4632.10 | bwd_inner_microstep: 4627.03 | bwd_allreduce_microstep: 4.98 | step_microstep: 46.37
-[2025-01-25 12:28:05,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.23 | bwd: 4632.12 | bwd_inner: 4627.03 | bwd_allreduce: 5.02 | step: 46.37
- 23%|██▎       | 1350/5800 [3:41:35<8:32:12,  6.91s/it]                                                       {'loss': 0.0361, 'grad_norm': 0.6618459820747375, 'learning_rate': 3.584040055116113e-05, 'epoch': 11.64}
- 23%|██▎       | 1350/5800 [3:41:35<8:32:12,  6.91s/it]score1 tensor([[0.6641],
-        [0.5820],
-        [0.5742],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6484, 0.5312, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:12,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 12:28:12,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.45 | bwd_microstep: 4620.16 | bwd_inner_microstep: 4613.59 | bwd_allreduce_microstep: 6.35 | step_microstep: 48.85
-[2025-01-25 12:28:12,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.41 | bwd: 4620.22 | bwd_inner: 4613.59 | bwd_allreduce: 6.47 | step: 48.83
- 23%|██▎       | 1351/5800 [3:41:42<8:32:11,  6.91s/it]                                                       {'loss': 0.0479, 'grad_norm': 4.275677680969238, 'learning_rate': 3.583357999816277e-05, 'epoch': 11.65}
- 23%|██▎       | 1351/5800 [3:41:42<8:32:11,  6.91s/it]score1 tensor([[0.5078],
-        [0.4199],
-        [0.7031],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4395, 0.6562, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:19,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.23 | optimizer_step: 4.37
-[2025-01-25 12:28:19,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.25 | bwd_microstep: 4621.04 | bwd_inner_microstep: 4615.13 | bwd_allreduce_microstep: 5.80 | step_microstep: 71.80
-[2025-01-25 12:28:19,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.21 | bwd: 4621.09 | bwd_inner: 4615.13 | bwd_allreduce: 5.84 | step: 71.82
- 23%|██▎       | 1352/5800 [3:41:49<8:33:21,  6.92s/it]                                                       {'loss': 0.0249, 'grad_norm': 0.9868018627166748, 'learning_rate': 3.582675450797944e-05, 'epoch': 11.66}
- 23%|██▎       | 1352/5800 [3:41:49<8:33:21,  6.92s/it]score1 tensor([[0.5273],
-        [0.5234],
-        [0.5117],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.4941, 0.4473, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:26,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 12:28:26,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4627.63 | bwd_inner_microstep: 4619.68 | bwd_allreduce_microstep: 7.74 | step_microstep: 51.05
-[2025-01-25 12:28:26,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.73 | bwd: 4627.68 | bwd_inner: 4619.68 | bwd_allreduce: 7.86 | step: 51.04
- 23%|██▎       | 1353/5800 [3:41:56<8:33:29,  6.93s/it]                                                       {'loss': 0.0483, 'grad_norm': 7.680278778076172, 'learning_rate': 3.581992408273944e-05, 'epoch': 11.66}
- 23%|██▎       | 1353/5800 [3:41:56<8:33:29,  6.93s/it]score1 tensor([[0.5898],
-        [0.4785],
-        [0.5117],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4551, 0.5117, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:33,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 12:28:33,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4583.70 | bwd_inner_microstep: 4579.06 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.78
-[2025-01-25 12:28:33,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.16 | bwd: 4583.73 | bwd_inner: 4579.06 | bwd_allreduce: 4.61 | step: 41.79
- 23%|██▎       | 1354/5800 [3:42:03<8:31:57,  6.91s/it]                                                       {'loss': 0.0415, 'grad_norm': 2.116483211517334, 'learning_rate': 3.5813088724572595e-05, 'epoch': 11.67}
- 23%|██▎       | 1354/5800 [3:42:03<8:31:57,  6.91s/it]score1 tensor([[0.5391],
-        [0.5898],
-        [0.5195],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5273, 0.3262, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:40,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 12:28:40,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.71 | bwd_microstep: 4616.82 | bwd_inner_microstep: 4611.92 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.32
-[2025-01-25 12:28:40,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.67 | bwd: 4616.85 | bwd_inner: 4611.93 | bwd_allreduce: 4.86 | step: 42.32
- 23%|██▎       | 1355/5800 [3:42:09<8:31:38,  6.91s/it]                                                       {'loss': 0.0957, 'grad_norm': 8.064332008361816, 'learning_rate': 3.5806248435610316e-05, 'epoch': 11.68}
- 23%|██▎       | 1355/5800 [3:42:09<8:31:38,  6.91s/it]score1 tensor([[0.5586],
-        [0.5117],
-        [0.5977],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5000, 0.5391, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:46,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 12:28:46,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.20 | bwd_microstep: 4568.34 | bwd_inner_microstep: 4563.79 | bwd_allreduce_microstep: 4.47 | step_microstep: 41.46
-[2025-01-25 12:28:46,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.16 | bwd: 4568.36 | bwd_inner: 4563.79 | bwd_allreduce: 4.51 | step: 41.46
- 23%|██▎       | 1356/5800 [3:42:16<8:30:07,  6.89s/it]                                                       {'loss': 0.0234, 'grad_norm': 1.8263041973114014, 'learning_rate': 3.579940321798551e-05, 'epoch': 11.69}
- 23%|██▎       | 1356/5800 [3:42:16<8:30:07,  6.89s/it]score1 tensor([[0.5586],
-        [0.5898],
-        [0.5156],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5547, 0.5781, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:28:53,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 12:28:53,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.66 | bwd_microstep: 4617.24 | bwd_inner_microstep: 4612.30 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.54
-[2025-01-25 12:28:53,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.62 | bwd: 4617.26 | bwd_inner: 4612.30 | bwd_allreduce: 4.89 | step: 42.55
- 23%|██▎       | 1357/5800 [3:42:23<8:30:07,  6.89s/it]                                                       {'loss': 0.041, 'grad_norm': 0.6701372265815735, 'learning_rate': 3.5792553073832635e-05, 'epoch': 11.7}
- 23%|██▎       | 1357/5800 [3:42:23<8:30:07,  6.89s/it]score1 tensor([[0.4277],
-        [0.4805],
-        [0.5156],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5117, 0.5625, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:00,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 12:29:00,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4618.79 | bwd_inner_microstep: 4613.92 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.82
-[2025-01-25 12:29:00,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.69 | bwd: 4618.82 | bwd_inner: 4613.92 | bwd_allreduce: 4.80 | step: 45.84
- 23%|██▎       | 1358/5800 [3:42:30<8:30:16,  6.89s/it]                                                       {'loss': 0.04, 'grad_norm': 0.7043178081512451, 'learning_rate': 3.578569800528769e-05, 'epoch': 11.71}
- 23%|██▎       | 1358/5800 [3:42:30<8:30:16,  6.89s/it]score1 tensor([[0.3789],
-        [0.4648],
-        [0.5352],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.5039, 0.6016, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:07,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 12:29:07,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4625.13 | bwd_inner_microstep: 4620.30 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.70
-[2025-01-25 12:29:07,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.01 | bwd: 4625.16 | bwd_inner: 4620.30 | bwd_allreduce: 4.78 | step: 42.71
- 23%|██▎       | 1359/5800 [3:42:37<8:30:24,  6.90s/it]                                                       {'loss': 0.0371, 'grad_norm': 4.126577377319336, 'learning_rate': 3.5778838014488195e-05, 'epoch': 11.72}
- 23%|██▎       | 1359/5800 [3:42:37<8:30:24,  6.90s/it]score1 tensor([[0.4688],
-        [0.4355],
-        [0.5117],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4980, 0.6602, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:14,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 12:29:14,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.64 | bwd_microstep: 4618.47 | bwd_inner_microstep: 4612.27 | bwd_allreduce_microstep: 6.08 | step_microstep: 48.36
-[2025-01-25 12:29:14,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.61 | bwd: 4618.49 | bwd_inner: 4612.27 | bwd_allreduce: 6.15 | step: 48.36
- 23%|██▎       | 1360/5800 [3:42:44<8:30:41,  6.90s/it]                                                       {'loss': 0.062, 'grad_norm': 7.405867099761963, 'learning_rate': 3.5771973103573226e-05, 'epoch': 11.72}
- 23%|██▎       | 1360/5800 [3:42:44<8:30:41,  6.90s/it]score1 tensor([[0.4277],
-        [0.4336],
-        [0.3711],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5039, 0.3730, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:21,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.53 | optimizer_step: 4.38
-[2025-01-25 12:29:21,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.15 | bwd_microstep: 4624.26 | bwd_inner_microstep: 4619.40 | bwd_allreduce_microstep: 4.77 | step_microstep: 60.53
-[2025-01-25 12:29:21,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.10 | bwd: 4624.28 | bwd_inner: 4619.40 | bwd_allreduce: 4.82 | step: 60.59
- 23%|██▎       | 1361/5800 [3:42:51<8:31:46,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 3.3885655403137207, 'learning_rate': 3.576510327468338e-05, 'epoch': 11.73}
- 23%|██▎       | 1361/5800 [3:42:51<8:31:46,  6.92s/it]score1 tensor([[0.4043],
-        [0.5547],
-        [0.3789],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5547, 0.4199, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:28,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 12:29:28,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.74 | bwd_microstep: 4579.11 | bwd_inner_microstep: 4574.36 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.71
-[2025-01-25 12:29:28,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.71 | bwd: 4579.14 | bwd_inner: 4574.36 | bwd_allreduce: 4.71 | step: 42.72
- 23%|██▎       | 1362/5800 [3:42:58<8:30:40,  6.90s/it]                                                       {'loss': 0.0361, 'grad_norm': 1.5633060932159424, 'learning_rate': 3.5758228529960776e-05, 'epoch': 11.74}
- 23%|██▎       | 1362/5800 [3:42:58<8:30:40,  6.90s/it]score1 tensor([[0.5664],
-        [0.5781],
-        [0.3555],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5664, 0.3223, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:35,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.36
-[2025-01-25 12:29:35,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.70 | bwd_microstep: 4617.92 | bwd_inner_microstep: 4612.49 | bwd_allreduce_microstep: 5.29 | step_microstep: 44.57
-[2025-01-25 12:29:35,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.66 | bwd: 4617.95 | bwd_inner: 4612.49 | bwd_allreduce: 5.38 | step: 44.58
- 24%|██▎       | 1363/5800 [3:43:05<8:31:02,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.2925400733947754, 'learning_rate': 3.575134887154909e-05, 'epoch': 11.75}
- 24%|██▎       | 1363/5800 [3:43:05<8:31:02,  6.91s/it]score1 tensor([[0.5977],
-        [0.4160],
-        [0.6094],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.3516, 0.5938, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:42,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 12:29:42,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.54 | bwd_microstep: 4615.27 | bwd_inner_microstep: 4610.48 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.67
-[2025-01-25 12:29:42,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.51 | bwd: 4615.30 | bwd_inner: 4610.48 | bwd_allreduce: 4.75 | step: 42.68
- 24%|██▎       | 1364/5800 [3:43:12<8:30:35,  6.91s/it]                                                       {'loss': 0.0566, 'grad_norm': 3.7544679641723633, 'learning_rate': 3.574446430159352e-05, 'epoch': 11.76}
- 24%|██▎       | 1364/5800 [3:43:12<8:30:35,  6.91s/it]score1 tensor([[0.3848],
-        [0.4648],
-        [0.5586],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3438, 0.4629, 0.5430, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:49,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:29:49,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.48 | bwd_microstep: 4625.67 | bwd_inner_microstep: 4620.84 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.18
-[2025-01-25 12:29:49,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4625.70 | bwd_inner: 4620.84 | bwd_allreduce: 4.77 | step: 43.18
- 24%|██▎       | 1365/5800 [3:43:18<8:30:17,  6.90s/it]                                                       {'loss': 0.0181, 'grad_norm': 7.309021949768066, 'learning_rate': 3.5737574822240785e-05, 'epoch': 11.77}
- 24%|██▎       | 1365/5800 [3:43:18<8:30:17,  6.90s/it]score1 tensor([[0.5234],
-        [0.5234],
-        [0.5273],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5469, 0.5078, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:29:55,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.37
-[2025-01-25 12:29:55,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.44 | bwd_microstep: 4631.27 | bwd_inner_microstep: 4625.55 | bwd_allreduce_microstep: 5.59 | step_microstep: 46.40
-[2025-01-25 12:29:55,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.41 | bwd: 4631.29 | bwd_inner: 4625.55 | bwd_allreduce: 5.66 | step: 46.41
- 24%|██▎       | 1366/5800 [3:43:25<8:30:23,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.110779285430908, 'learning_rate': 3.5730680435639154e-05, 'epoch': 11.78}
- 24%|██▎       | 1366/5800 [3:43:25<8:30:23,  6.91s/it]score1 tensor([[0.5117],
-        [0.4727],
-        [0.4219],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4629, 0.3750, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:02,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 12:30:02,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4631.62 | bwd_inner_microstep: 4627.21 | bwd_allreduce_microstep: 4.34 | step_microstep: 41.99
-[2025-01-25 12:30:02,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.29 | bwd: 4631.65 | bwd_inner: 4627.21 | bwd_allreduce: 4.38 | step: 42.01
- 24%|██▎       | 1367/5800 [3:43:32<8:30:23,  6.91s/it]                                                       {'loss': 0.0234, 'grad_norm': 0.5956703424453735, 'learning_rate': 3.572378114393841e-05, 'epoch': 11.78}
- 24%|██▎       | 1367/5800 [3:43:32<8:30:23,  6.91s/it]score1 tensor([[0.5625],
-        [0.5391],
-        [0.5859],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5508, 0.6367, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:09,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.52 | optimizer_step: 4.36
-[2025-01-25 12:30:09,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4630.30 | bwd_inner_microstep: 4625.83 | bwd_allreduce_microstep: 4.39 | step_microstep: 46.44
-[2025-01-25 12:30:09,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4630.32 | bwd_inner: 4625.83 | bwd_allreduce: 4.43 | step: 46.44
- 24%|██▎       | 1368/5800 [3:43:39<8:30:13,  6.91s/it]                                                       {'loss': 0.0366, 'grad_norm': 0.794878363609314, 'learning_rate': 3.571687694928987e-05, 'epoch': 11.79}
- 24%|██▎       | 1368/5800 [3:43:39<8:30:13,  6.91s/it]score1 tensor([[0.5273],
-        [0.5469],
-        [0.0383],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4551, 0.1787, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0791, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:16,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:30:16,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.80 | bwd_microstep: 4626.88 | bwd_inner_microstep: 4620.97 | bwd_allreduce_microstep: 5.82 | step_microstep: 44.94
-[2025-01-25 12:30:16,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.77 | bwd: 4626.90 | bwd_inner: 4620.97 | bwd_allreduce: 5.87 | step: 44.95
- 24%|██▎       | 1369/5800 [3:43:46<8:30:06,  6.91s/it]                                                       {'loss': 0.0791, 'grad_norm': 5.283751010894775, 'learning_rate': 3.57099678538464e-05, 'epoch': 11.8}
- 24%|██▎       | 1369/5800 [3:43:46<8:30:06,  6.91s/it]score1 tensor([[0.5078],
-        [0.5625],
-        [0.5938],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5742, 0.6445, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:23,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 12:30:23,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.85 | bwd_microstep: 4625.02 | bwd_inner_microstep: 4620.16 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.63
-[2025-01-25 12:30:23,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.77 | bwd: 4625.04 | bwd_inner: 4620.16 | bwd_allreduce: 4.81 | step: 42.64
- 24%|██▎       | 1370/5800 [3:43:53<8:30:21,  6.91s/it]                                                       {'loss': 0.0361, 'grad_norm': 7.976075172424316, 'learning_rate': 3.5703053859762347e-05, 'epoch': 11.81}
- 24%|██▎       | 1370/5800 [3:43:53<8:30:21,  6.91s/it]score1 tensor([[0.4707],
-        [0.5078],
-        [0.6016],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5273, 0.6797, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:30,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 12:30:30,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4628.16 | bwd_inner_microstep: 4623.50 | bwd_allreduce_microstep: 4.56 | step_microstep: 47.48
-[2025-01-25 12:30:30,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.46 | bwd: 4628.18 | bwd_inner: 4623.50 | bwd_allreduce: 4.61 | step: 47.49
- 24%|██▎       | 1371/5800 [3:44:00<8:30:20,  6.91s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.125895023345947, 'learning_rate': 3.569613496919363e-05, 'epoch': 11.82}
- 24%|██▎       | 1371/5800 [3:44:00<8:30:20,  6.91s/it]score1 tensor([[0.5820],
-        [0.4023],
-        [0.3359],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.3340, 0.3398, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:37,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 12:30:37,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.96 | bwd_microstep: 4640.07 | bwd_inner_microstep: 4634.58 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.70
-[2025-01-25 12:30:37,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.93 | bwd: 4640.10 | bwd_inner: 4634.58 | bwd_allreduce: 5.44 | step: 44.70
- 24%|██▎       | 1372/5800 [3:44:07<8:30:33,  6.92s/it]                                                       {'loss': 0.0283, 'grad_norm': 0.8537492156028748, 'learning_rate': 3.568921118429768e-05, 'epoch': 11.83}
- 24%|██▎       | 1372/5800 [3:44:07<8:30:33,  6.92s/it]score1 tensor([[0.5352],
-        [0.5195],
-        [0.4023],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4941, 0.3926, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:44,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 12:30:44,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.22 | bwd_microstep: 4629.45 | bwd_inner_microstep: 4625.02 | bwd_allreduce_microstep: 4.35 | step_microstep: 41.70
-[2025-01-25 12:30:44,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.12 | bwd: 4629.47 | bwd_inner: 4625.02 | bwd_allreduce: 4.39 | step: 41.70
- 24%|██▎       | 1373/5800 [3:44:14<8:30:11,  6.91s/it]                                                       {'loss': 0.0264, 'grad_norm': 3.5293469429016113, 'learning_rate': 3.5682282507233456e-05, 'epoch': 11.84}
- 24%|██▎       | 1373/5800 [3:44:14<8:30:11,  6.91s/it]score1 tensor([[0.6484],
-        [0.4668],
-        [0.5117],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4316, 0.4766, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:51,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 12:30:51,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.41 | bwd_microstep: 4630.58 | bwd_inner_microstep: 4626.00 | bwd_allreduce_microstep: 4.51 | step_microstep: 40.91
-[2025-01-25 12:30:51,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.37 | bwd: 4630.60 | bwd_inner: 4626.00 | bwd_allreduce: 4.54 | step: 40.92
- 24%|██▎       | 1374/5800 [3:44:21<8:29:37,  6.91s/it]                                                       {'loss': 0.0454, 'grad_norm': 4.298064231872559, 'learning_rate': 3.5675348940161426e-05, 'epoch': 11.84}
- 24%|██▎       | 1374/5800 [3:44:21<8:29:37,  6.91s/it]score1 tensor([[0.4023],
-        [0.6328],
-        [0.4180],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5195, 0.4512, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:30:58,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 12:30:58,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.22 | bwd_microstep: 4629.95 | bwd_inner_microstep: 4625.64 | bwd_allreduce_microstep: 4.23 | step_microstep: 41.67
-[2025-01-25 12:30:58,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.18 | bwd: 4629.97 | bwd_inner: 4625.64 | bwd_allreduce: 4.27 | step: 41.69
- 24%|██▎       | 1375/5800 [3:44:28<8:29:18,  6.91s/it]                                                       {'loss': 0.0659, 'grad_norm': 3.302027940750122, 'learning_rate': 3.566841048524361e-05, 'epoch': 11.85}
- 24%|██▎       | 1375/5800 [3:44:28<8:29:18,  6.91s/it]score1 tensor([[0.6250],
-        [0.5742],
-        [0.6758],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5508, 0.6445, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:31:05,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 12:31:05,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.73 | bwd_microstep: 4627.79 | bwd_inner_microstep: 4623.09 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.73
-[2025-01-25 12:31:05,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.70 | bwd: 4627.81 | bwd_inner: 4623.09 | bwd_allreduce: 4.64 | step: 42.74
- 24%|██▎       | 1376/5800 [3:44:34<8:29:04,  6.90s/it]                                                       {'loss': 0.0337, 'grad_norm': 4.039226055145264, 'learning_rate': 3.566146714464354e-05, 'epoch': 11.86}
- 24%|██▎       | 1376/5800 [3:44:35<8:29:04,  6.90s/it]score1 tensor([[0.6367],
-        [0.6133],
-        [0.3242],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6250, 0.3984, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:31:11,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 12:31:11,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.10 | bwd_microstep: 4626.20 | bwd_inner_microstep: 4621.24 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.36
-[2025-01-25 12:31:11,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.06 | bwd: 4626.22 | bwd_inner: 4621.24 | bwd_allreduce: 4.92 | step: 42.36
- 24%|██▎       | 1377/5800 [3:44:41<8:28:49,  6.90s/it]                                                       {'loss': 0.0312, 'grad_norm': 3.4656293392181396, 'learning_rate': 3.565451892052626e-05, 'epoch': 11.87}
- 24%|██▎       | 1377/5800 [3:44:41<8:28:49,  6.90s/it]score1 tensor([[0.6172],
-        [0.4414],
-        [0.6094],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4785, 0.6445, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:31:18,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:31:18,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.75 | bwd_microstep: 4574.31 | bwd_inner_microstep: 4567.87 | bwd_allreduce_microstep: 6.35 | step_microstep: 46.23
-[2025-01-25 12:31:18,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.71 | bwd: 4574.34 | bwd_inner: 4567.87 | bwd_allreduce: 6.40 | step: 46.24
- 24%|██▍       | 1378/5800 [3:44:48<8:27:45,  6.89s/it]                                                       {'loss': 0.02, 'grad_norm': 2.0279979705810547, 'learning_rate': 3.5647565815058346e-05, 'epoch': 11.88}
- 24%|██▍       | 1378/5800 [3:44:48<8:27:45,  6.89s/it]score1 tensor([[0.3672],
-        [0.3828],
-        [0.5195],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5039, 0.4863, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:31:25,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.13 | optimizer_step: 4.36
-[2025-01-25 12:31:25,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.92 | bwd_microstep: 4631.85 | bwd_inner_microstep: 4627.54 | bwd_allreduce_microstep: 4.24 | step_microstep: 49.23
-[2025-01-25 12:31:25,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.86 | bwd: 4631.87 | bwd_inner: 4627.54 | bwd_allreduce: 4.27 | step: 49.27
- 24%|██▍       | 1379/5800 [3:44:55<8:28:36,  6.90s/it]                                                       {'loss': 0.0679, 'grad_norm': 3.3012499809265137, 'learning_rate': 3.564060783040791e-05, 'epoch': 11.89}
- 24%|██▍       | 1379/5800 [3:44:55<8:28:36,  6.90s/it]score1 tensor([[0.5469],
-        [0.3906],
-        [0.4512],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4453, 0.4492, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:31:32,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 12:31:32,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.01 | bwd_microstep: 4627.62 | bwd_inner_microstep: 4616.42 | bwd_allreduce_microstep: 11.00 | step_microstep: 42.49
-[2025-01-25 12:31:32,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.96 | bwd: 4627.64 | bwd_inner: 4616.42 | bwd_allreduce: 11.14 | step: 42.49
- 24%|██▍       | 1380/5800 [3:45:02<8:28:44,  6.91s/it]                                                       {'loss': 0.0327, 'grad_norm': 1.264862298965454, 'learning_rate': 3.563364496874456e-05, 'epoch': 11.9}
- 24%|██▍       | 1380/5800 [3:45:02<8:28:44,  6.91s/it]evaluate!
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6914]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1719, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6172]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2480, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2324, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6602]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.7344]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.7109]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6602]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3730]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6953]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6133]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6797]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6797]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2109, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6562]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3926]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.7148]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1777, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.598447135348724
-PLCC_score: 0.6048142526894762
-KRCC_score: 0.4359253447229126
-SRCC_level: 0.598447135348724
-PLCC_level: 0.6048142526894762
-KRCC_level: 0.4359253447229126
-score1 tensor([[0.5664],
-        [0.6758],
-        [0.5977],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6328, 0.5977, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:41:52,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 12:41:52,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.33 | bwd_microstep: 4550.86 | bwd_inner_microstep: 4546.17 | bwd_allreduce_microstep: 4.59 | step_microstep: 53.48
-[2025-01-25 12:41:52,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.28 | bwd: 4550.88 | bwd_inner: 4546.17 | bwd_allreduce: 4.64 | step: 53.49
- 24%|██▍       | 1381/5800 [3:55:22<234:03:28, 190.68s/it]                                                          {'loss': 0.0352, 'grad_norm': 6.280959129333496, 'learning_rate': 3.5626677232239445e-05, 'epoch': 11.91}
- 24%|██▍       | 1381/5800 [3:55:22<234:03:28, 190.68s/it]score1 tensor([[0.4336],
-        [0.4980],
-        [0.3926],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4688, 0.3809, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:41:58,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.37
-[2025-01-25 12:41:58,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.24 | bwd_microstep: 4604.56 | bwd_inner_microstep: 4598.61 | bwd_allreduce_microstep: 5.85 | step_microstep: 41.92
-[2025-01-25 12:41:58,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.22 | bwd: 4604.58 | bwd_inner: 4598.61 | bwd_allreduce: 5.90 | step: 41.93
- 24%|██▍       | 1382/5800 [3:55:28<166:19:57, 135.54s/it]                                                          {'loss': 0.0205, 'grad_norm': 3.965329170227051, 'learning_rate': 3.5619704623065216e-05, 'epoch': 11.91}
- 24%|██▍       | 1382/5800 [3:55:28<166:19:57, 135.54s/it]score1 tensor([[0.4004],
-        [0.4570],
-        [0.4043],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4395, 0.3887, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:05,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.58 | optimizer_step: 4.37
-[2025-01-25 12:42:05,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.30 | bwd_microstep: 4589.48 | bwd_inner_microstep: 4582.80 | bwd_allreduce_microstep: 6.50 | step_microstep: 59.79
-[2025-01-25 12:42:05,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.27 | bwd: 4589.53 | bwd_inner: 4582.80 | bwd_allreduce: 6.59 | step: 59.81
- 24%|██▍       | 1383/5800 [3:55:35<118:55:52, 96.93s/it]                                                          {'loss': 0.0239, 'grad_norm': 3.631549119949341, 'learning_rate': 3.5612727143396064e-05, 'epoch': 11.92}
- 24%|██▍       | 1383/5800 [3:55:35<118:55:52, 96.93s/it]score1 tensor([[0.5156],
-        [0.4277],
-        [0.5156],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4688, 0.5195, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:12,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 12:42:12,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.39 | bwd_microstep: 4598.00 | bwd_inner_microstep: 4592.71 | bwd_allreduce_microstep: 5.20 | step_microstep: 46.11
-[2025-01-25 12:42:12,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.36 | bwd: 4598.03 | bwd_inner: 4592.72 | bwd_allreduce: 5.24 | step: 46.12
- 24%|██▍       | 1384/5800 [3:55:42<85:45:37, 69.91s/it]                                                         {'loss': 0.0171, 'grad_norm': 3.9173693656921387, 'learning_rate': 3.560574479540768e-05, 'epoch': 11.93}
- 24%|██▍       | 1384/5800 [3:55:42<85:45:37, 69.91s/it]score1 tensor([[0.6016],
-        [0.5312],
-        [0.4961],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5039, 0.4688, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:19,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:42:19,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.21 | bwd_microstep: 4587.02 | bwd_inner_microstep: 4580.34 | bwd_allreduce_microstep: 6.57 | step_microstep: 48.38
-[2025-01-25 12:42:19,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.13 | bwd: 4587.05 | bwd_inner: 4580.34 | bwd_allreduce: 6.63 | step: 48.38
- 24%|██▍       | 1385/5800 [3:55:49<62:32:38, 51.00s/it]                                                        {'loss': 0.042, 'grad_norm': 3.9329915046691895, 'learning_rate': 3.55987575812773e-05, 'epoch': 11.94}
- 24%|██▍       | 1385/5800 [3:55:49<62:32:38, 51.00s/it]score1 tensor([[0.5586],
-        [0.4961],
-        [0.4844],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5117, 0.4844, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:26,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 12:42:26,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.93 | bwd_microstep: 4538.33 | bwd_inner_microstep: 4533.22 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.36
-[2025-01-25 12:42:26,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.91 | bwd: 4538.35 | bwd_inner: 4533.22 | bwd_allreduce: 5.05 | step: 44.37
- 24%|██▍       | 1386/5800 [3:55:56<46:16:12, 37.74s/it]                                                        {'loss': 0.0093, 'grad_norm': 1.923454761505127, 'learning_rate': 3.559176550318363e-05, 'epoch': 11.95}
- 24%|██▍       | 1386/5800 [3:55:56<46:16:12, 37.74s/it]score1 tensor([[0.4512],
-        [0.5117],
-        [0.4199],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5156, 0.3945, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:33,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.46 | optimizer_step: 4.36
-[2025-01-25 12:42:33,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.58 | bwd_microstep: 4594.23 | bwd_inner_microstep: 4588.72 | bwd_allreduce_microstep: 5.39 | step_microstep: 42.75
-[2025-01-25 12:42:33,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.54 | bwd: 4594.25 | bwd_inner: 4588.72 | bwd_allreduce: 5.46 | step: 42.75
- 24%|██▍       | 1387/5800 [3:56:03<34:54:10, 28.47s/it]                                                        {'loss': 0.0269, 'grad_norm': 3.879202365875244, 'learning_rate': 3.558476856330693e-05, 'epoch': 11.96}
- 24%|██▍       | 1387/5800 [3:56:03<34:54:10, 28.47s/it]score1 tensor([[0.4531],
-        [0.4941],
-        [0.5430],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5664, 0.5898, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:40,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.39 | optimizer_step: 4.37
-[2025-01-25 12:42:40,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.69 | bwd_microstep: 4599.31 | bwd_inner_microstep: 4594.82 | bwd_allreduce_microstep: 4.41 | step_microstep: 42.94
-[2025-01-25 12:42:40,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.66 | bwd: 4599.33 | bwd_inner: 4594.82 | bwd_allreduce: 4.45 | step: 42.94
- 24%|██▍       | 1388/5800 [3:56:10<26:56:43, 21.99s/it]                                                        {'loss': 0.0366, 'grad_norm': 3.785064458847046, 'learning_rate': 3.5577766763828986e-05, 'epoch': 11.97}
- 24%|██▍       | 1388/5800 [3:56:10<26:56:43, 21.99s/it]score1 tensor([[0.3496],
-        [0.4902],
-        [0.4844],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4707, 0.4824, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:46,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.39 | optimizer_step: 4.36
-[2025-01-25 12:42:46,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.81 | bwd_microstep: 4609.45 | bwd_inner_microstep: 4605.08 | bwd_allreduce_microstep: 4.30 | step_microstep: 43.26
-[2025-01-25 12:42:46,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.78 | bwd: 4609.48 | bwd_inner: 4605.08 | bwd_allreduce: 4.33 | step: 43.27
- 24%|██▍       | 1389/5800 [3:56:16<21:22:57, 17.45s/it]                                                        {'loss': 0.0269, 'grad_norm': 0.968726396560669, 'learning_rate': 3.5570760106933063e-05, 'epoch': 11.97}
- 24%|██▍       | 1389/5800 [3:56:16<21:22:57, 17.45s/it]score1 tensor([[0.4902],
-        [0.5195],
-        [0.5273],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5430, 0.4844, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:42:53,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 12:42:53,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.93 | bwd_microstep: 4607.54 | bwd_inner_microstep: 4602.55 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.76
-[2025-01-25 12:42:53,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.90 | bwd: 4607.57 | bwd_inner: 4602.55 | bwd_allreduce: 4.95 | step: 44.77
- 24%|██▍       | 1390/5800 [3:56:23<17:29:29, 14.28s/it]                                                        {'loss': 0.0283, 'grad_norm': 3.562462329864502, 'learning_rate': 3.556374859480396e-05, 'epoch': 11.98}
- 24%|██▍       | 1390/5800 [3:56:23<17:29:29, 14.28s/it]score1 tensor([[0.4121],
-        [0.4336],
-        [0.5352],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4180, 0.5547, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:43:00,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.19 | optimizer_step: 4.37
-[2025-01-25 12:43:00,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.34 | bwd_microstep: 4617.70 | bwd_inner_microstep: 4612.44 | bwd_allreduce_microstep: 5.17 | step_microstep: 73.54
-[2025-01-25 12:43:00,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.30 | bwd: 4617.72 | bwd_inner: 4612.44 | bwd_allreduce: 5.21 | step: 73.55
- 24%|██▍       | 1391/5800 [3:56:30<14:47:03, 12.07s/it]                                                        {'loss': 0.0405, 'grad_norm': 3.7852604389190674, 'learning_rate': 3.5556732229628e-05, 'epoch': 11.99}
- 24%|██▍       | 1391/5800 [3:56:30<14:47:03, 12.07s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:43:05,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.37
-[2025-01-25 12:43:05,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 578.73 | bwd_microstep: 1227.19 | bwd_inner_microstep: 1220.18 | bwd_allreduce_microstep: 6.74 | step_microstep: 66.55
-[2025-01-25 12:43:05,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 578.68 | bwd: 1227.20 | bwd_inner: 1220.16 | bwd_allreduce: 6.86 | step: 66.58
- 24%|██▍       | 1392/5800 [3:56:35<12:07:49,  9.91s/it]                                                        {'loss': 0.0078, 'grad_norm': 6.880266189575195, 'learning_rate': 3.5549711013592995e-05, 'epoch': 12.0}
- 24%|██▍       | 1392/5800 [3:56:35<12:07:49,  9.91s/it][2025-01-25 12:43:10,169] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:43:21,010] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:43:31,034] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:43:40,700] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5195],
-        [0.4473],
-        [0.4238],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4180, 0.3809, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:43:58,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 12:43:58,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.74 | bwd_microstep: 4621.66 | bwd_inner_microstep: 4614.59 | bwd_allreduce_microstep: 6.93 | step_microstep: 45.57
-[2025-01-25 12:43:58,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.70 | bwd: 4621.70 | bwd_inner: 4614.59 | bwd_allreduce: 7.02 | step: 45.58
- 24%|██▍       | 1393/5800 [3:57:28<27:52:30, 22.77s/it]                                                        {'loss': 0.0415, 'grad_norm': 0.6298724412918091, 'learning_rate': 3.554268494888828e-05, 'epoch': 12.01}
- 24%|██▍       | 1393/5800 [3:57:28<27:52:30, 22.77s/it]score1 tensor([[0.5742],
-        [0.4512],
-        [0.4453],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4336, 0.4473, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:05,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 12:44:05,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.03 | bwd_microstep: 4595.18 | bwd_inner_microstep: 4589.80 | bwd_allreduce_microstep: 5.24 | step_microstep: 47.37
-[2025-01-25 12:44:05,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.00 | bwd: 4595.21 | bwd_inner: 4589.81 | bwd_allreduce: 5.31 | step: 47.37
- 24%|██▍       | 1394/5800 [3:57:35<22:01:57, 18.00s/it]                                                        {'loss': 0.0186, 'grad_norm': 0.4798443019390106, 'learning_rate': 3.55356540377047e-05, 'epoch': 12.02}
- 24%|██▍       | 1394/5800 [3:57:35<22:01:57, 18.00s/it]score1 tensor([[0.5391],
-        [0.4805],
-        [0.4160],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5039, 0.4004, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:12,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 12:44:12,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.79 | bwd_microstep: 4595.14 | bwd_inner_microstep: 4589.53 | bwd_allreduce_microstep: 5.51 | step_microstep: 46.60
-[2025-01-25 12:44:12,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.75 | bwd: 4595.17 | bwd_inner: 4589.53 | bwd_allreduce: 5.56 | step: 46.61
- 24%|██▍       | 1395/5800 [3:57:42<17:56:22, 14.66s/it]                                                        {'loss': 0.0234, 'grad_norm': 4.19819450378418, 'learning_rate': 3.552861828223464e-05, 'epoch': 12.03}
- 24%|██▍       | 1395/5800 [3:57:42<17:56:22, 14.66s/it]score1 tensor([[0.4980],
-        [0.4785],
-        [0.5039],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5039, 0.4648, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:18,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 12:44:18,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.29 | bwd_microstep: 4605.00 | bwd_inner_microstep: 4600.04 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.94
-[2025-01-25 12:44:18,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.25 | bwd: 4605.03 | bwd_inner: 4600.04 | bwd_allreduce: 4.90 | step: 43.94
- 24%|██▍       | 1396/5800 [3:57:48<15:04:20, 12.32s/it]                                                        {'loss': 0.0425, 'grad_norm': 3.5860989093780518, 'learning_rate': 3.552157768467195e-05, 'epoch': 12.03}
- 24%|██▍       | 1396/5800 [3:57:48<15:04:20, 12.32s/it]score1 tensor([[0.5703],
-        [0.4785],
-        [0.5664],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4414, 0.5664, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:25,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 12:44:25,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.57 | bwd_microstep: 4558.81 | bwd_inner_microstep: 4552.43 | bwd_allreduce_microstep: 6.23 | step_microstep: 52.33
-[2025-01-25 12:44:25,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.54 | bwd: 4558.84 | bwd_inner: 4552.43 | bwd_allreduce: 6.32 | step: 52.34
- 24%|██▍       | 1397/5800 [3:57:55<13:03:26, 10.68s/it]                                                        {'loss': 0.0327, 'grad_norm': 6.064123630523682, 'learning_rate': 3.5514532247212015e-05, 'epoch': 12.04}
- 24%|██▍       | 1397/5800 [3:57:55<13:03:26, 10.68s/it]score1 tensor([[0.4961],
-        [0.4805],
-        [0.5234],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4668, 0.4941, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:32,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 12:44:32,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.32 | bwd_microstep: 4613.01 | bwd_inner_microstep: 4607.72 | bwd_allreduce_microstep: 5.18 | step_microstep: 44.13
-[2025-01-25 12:44:32,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.29 | bwd: 4613.04 | bwd_inner: 4607.72 | bwd_allreduce: 5.24 | step: 44.13
- 24%|██▍       | 1398/5800 [3:58:02<11:39:42,  9.54s/it]                                                        {'loss': 0.0137, 'grad_norm': 7.618793964385986, 'learning_rate': 3.5507481972051724e-05, 'epoch': 12.05}
- 24%|██▍       | 1398/5800 [3:58:02<11:39:42,  9.54s/it]score1 tensor([[0.4902],
-        [0.3906],
-        [0.3945],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.3457, 0.3652, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:39,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 12:44:39,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.70 | bwd_microstep: 4617.70 | bwd_inner_microstep: 4612.50 | bwd_allreduce_microstep: 5.10 | step_microstep: 41.66
-[2025-01-25 12:44:39,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.67 | bwd: 4617.73 | bwd_inner: 4612.50 | bwd_allreduce: 5.15 | step: 41.67
- 24%|██▍       | 1399/5800 [3:58:09<10:41:12,  8.74s/it]                                                        {'loss': 0.0361, 'grad_norm': 3.3739230632781982, 'learning_rate': 3.550042686138947e-05, 'epoch': 12.06}
- 24%|██▍       | 1399/5800 [3:58:09<10:41:12,  8.74s/it]score1 tensor([[0.4062],
-        [0.5117],
-        [0.4414],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.5664, 0.4844, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:46,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 12:44:46,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.00 | bwd_microstep: 4618.99 | bwd_inner_microstep: 4613.72 | bwd_allreduce_microstep: 5.15 | step_microstep: 46.17
-[2025-01-25 12:44:46,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.96 | bwd: 4619.01 | bwd_inner: 4613.72 | bwd_allreduce: 5.21 | step: 46.18
- 24%|██▍       | 1400/5800 [3:58:16<10:00:26,  8.19s/it]                                                        {'loss': 0.0386, 'grad_norm': 3.896517038345337, 'learning_rate': 3.5493366917425175e-05, 'epoch': 12.07}
- 24%|██▍       | 1400/5800 [3:58:16<10:00:26,  8.19s/it]score1 tensor([[0.3965],
-        [0.4160],
-        [0.5078],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4043, 0.5430, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:44:53,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 12:44:53,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.91 | bwd_microstep: 4622.67 | bwd_inner_microstep: 4617.90 | bwd_allreduce_microstep: 4.70 | step_microstep: 40.75
-[2025-01-25 12:44:53,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.86 | bwd: 4622.70 | bwd_inner: 4617.90 | bwd_allreduce: 4.73 | step: 40.76
- 24%|██▍       | 1401/5800 [3:58:23<9:31:49,  7.80s/it]                                                        {'loss': 0.0396, 'grad_norm': 3.624911069869995, 'learning_rate': 3.548630214236023e-05, 'epoch': 12.08}
- 24%|██▍       | 1401/5800 [3:58:23<9:31:49,  7.80s/it]score1 tensor([[0.4238],
-        [0.5352],
-        [0.3750],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5742, 0.3750, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:00,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 12:45:00,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.92 | bwd_microstep: 4567.52 | bwd_inner_microstep: 4562.37 | bwd_allreduce_microstep: 5.03 | step_microstep: 42.93
-[2025-01-25 12:45:00,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.89 | bwd: 4567.57 | bwd_inner: 4562.37 | bwd_allreduce: 5.10 | step: 42.93
- 24%|██▍       | 1402/5800 [3:58:30<9:10:25,  7.51s/it]                                                       {'loss': 0.019, 'grad_norm': 5.4412713050842285, 'learning_rate': 3.547923253839758e-05, 'epoch': 12.09}
- 24%|██▍       | 1402/5800 [3:58:30<9:10:25,  7.51s/it]score1 tensor([[0.4160],
-        [0.5547],
-        [0.5117],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6250, 0.6016, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:07,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 12:45:07,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.99 | bwd_microstep: 4617.63 | bwd_inner_microstep: 4612.97 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.26
-[2025-01-25 12:45:07,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.96 | bwd: 4617.66 | bwd_inner: 4612.97 | bwd_allreduce: 4.62 | step: 43.26
- 24%|██▍       | 1403/5800 [3:58:37<8:56:36,  7.32s/it]                                                       {'loss': 0.0688, 'grad_norm': 7.386566162109375, 'learning_rate': 3.547215810774163e-05, 'epoch': 12.09}
- 24%|██▍       | 1403/5800 [3:58:37<8:56:36,  7.32s/it]score1 tensor([[0.4570],
-        [0.4160],
-        [0.5195],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4707, 0.5469, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:13,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.40 | optimizer_step: 4.37
-[2025-01-25 12:45:13,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.96 | bwd_microstep: 4622.17 | bwd_inner_microstep: 4617.19 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.96
-[2025-01-25 12:45:13,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.92 | bwd: 4622.21 | bwd_inner: 4617.19 | bwd_allreduce: 4.93 | step: 43.97
- 24%|██▍       | 1404/5800 [3:58:43<8:46:59,  7.19s/it]                                                       {'loss': 0.0391, 'grad_norm': 3.595986843109131, 'learning_rate': 3.546507885259831e-05, 'epoch': 12.1}
- 24%|██▍       | 1404/5800 [3:58:43<8:46:59,  7.19s/it]score1 tensor([[0.6406],
-        [0.6367],
-        [0.5352],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.6211, 0.5547, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:20,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 12:45:20,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.38 | bwd_microstep: 4623.74 | bwd_inner_microstep: 4619.16 | bwd_allreduce_microstep: 4.47 | step_microstep: 47.79
-[2025-01-25 12:45:20,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.36 | bwd: 4623.77 | bwd_inner: 4619.16 | bwd_allreduce: 4.53 | step: 47.80
- 24%|██▍       | 1405/5800 [3:58:50<8:40:25,  7.10s/it]                                                       {'loss': 0.0479, 'grad_norm': 4.027733325958252, 'learning_rate': 3.545799477517507e-05, 'epoch': 12.11}
- 24%|██▍       | 1405/5800 [3:58:50<8:40:25,  7.10s/it]score1 tensor([[0.5352],
-        [0.5156],
-        [0.5625],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4141, 0.4980, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:27,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 12:45:27,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.04 | bwd_microstep: 4622.88 | bwd_inner_microstep: 4618.71 | bwd_allreduce_microstep: 4.09 | step_microstep: 45.40
-[2025-01-25 12:45:27,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.00 | bwd: 4622.90 | bwd_inner: 4618.72 | bwd_allreduce: 4.12 | step: 45.40
- 24%|██▍       | 1406/5800 [3:58:57<8:35:42,  7.04s/it]                                                       {'loss': 0.1055, 'grad_norm': 7.725748062133789, 'learning_rate': 3.5450905877680846e-05, 'epoch': 12.12}
- 24%|██▍       | 1406/5800 [3:58:57<8:35:42,  7.04s/it]score1 tensor([[0.5469],
-        [0.5859],
-        [0.4219],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5117, 0.3477, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:34,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 12:45:34,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.73 | bwd_microstep: 4629.38 | bwd_inner_microstep: 4623.73 | bwd_allreduce_microstep: 5.56 | step_microstep: 45.34
-[2025-01-25 12:45:34,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.70 | bwd: 4629.40 | bwd_inner: 4623.73 | bwd_allreduce: 5.61 | step: 45.34
- 24%|██▍       | 1407/5800 [3:59:04<8:32:47,  7.00s/it]                                                       {'loss': 0.0737, 'grad_norm': 7.769436359405518, 'learning_rate': 3.544381216232606e-05, 'epoch': 12.13}
- 24%|██▍       | 1407/5800 [3:59:04<8:32:47,  7.00s/it]score1 tensor([[0.5117],
-        [0.5469],
-        [0.5703],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4941, 0.5391, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:41,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.59 | optimizer_step: 4.39
-[2025-01-25 12:45:41,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.68 | bwd_microstep: 4632.28 | bwd_inner_microstep: 4627.19 | bwd_allreduce_microstep: 4.99 | step_microstep: 66.54
-[2025-01-25 12:45:41,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.64 | bwd: 4632.31 | bwd_inner: 4627.19 | bwd_allreduce: 5.04 | step: 66.55
- 24%|██▍       | 1408/5800 [3:59:11<8:31:30,  6.99s/it]                                                       {'loss': 0.0537, 'grad_norm': 7.9423747062683105, 'learning_rate': 3.543671363132267e-05, 'epoch': 12.14}
- 24%|██▍       | 1408/5800 [3:59:11<8:31:30,  6.99s/it]score1 tensor([[0.5977],
-        [0.6250],
-        [0.5039],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6133, 0.4043, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:48,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.49 | optimizer_step: 4.37
-[2025-01-25 12:45:48,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.85 | bwd_microstep: 4637.71 | bwd_inner_microstep: 4632.96 | bwd_allreduce_microstep: 4.66 | step_microstep: 51.86
-[2025-01-25 12:45:48,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.82 | bwd: 4637.73 | bwd_inner: 4632.96 | bwd_allreduce: 4.70 | step: 51.90
- 24%|██▍       | 1409/5800 [3:59:18<8:30:08,  6.97s/it]                                                       {'loss': 0.0591, 'grad_norm': 8.039648056030273, 'learning_rate': 3.542961028688412e-05, 'epoch': 12.15}
- 24%|██▍       | 1409/5800 [3:59:18<8:30:08,  6.97s/it]score1 tensor([[0.5703],
-        [0.6328],
-        [0.5820],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5664, 0.5469, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:45:55,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 12:45:55,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4628.13 | bwd_inner_microstep: 4623.17 | bwd_allreduce_microstep: 4.87 | step_microstep: 46.34
-[2025-01-25 12:45:55,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.14 | bwd: 4628.15 | bwd_inner: 4623.17 | bwd_allreduce: 4.91 | step: 46.35
- 24%|██▍       | 1410/5800 [3:59:25<8:28:44,  6.95s/it]                                                       {'loss': 0.0283, 'grad_norm': 4.531373977661133, 'learning_rate': 3.542250213122536e-05, 'epoch': 12.16}
- 24%|██▍       | 1410/5800 [3:59:25<8:28:44,  6.95s/it]score1 tensor([[0.6406],
-        [0.6094],
-        [0.4902],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6055, 0.4746, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:02,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:46:02,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.25 | bwd_microstep: 4626.38 | bwd_inner_microstep: 4620.89 | bwd_allreduce_microstep: 5.40 | step_microstep: 44.53
-[2025-01-25 12:46:02,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.22 | bwd: 4626.40 | bwd_inner: 4620.89 | bwd_allreduce: 5.44 | step: 44.54
- 24%|██▍       | 1411/5800 [3:59:32<8:27:30,  6.94s/it]                                                       {'loss': 0.0259, 'grad_norm': 8.100629806518555, 'learning_rate': 3.5415389166562825e-05, 'epoch': 12.16}
- 24%|██▍       | 1411/5800 [3:59:32<8:27:30,  6.94s/it]score1 tensor([[0.5547],
-        [0.5430],
-        [0.5195],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5352, 0.4727, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:09,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 12:46:09,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.63 | bwd_microstep: 4628.93 | bwd_inner_microstep: 4623.94 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.93
-[2025-01-25 12:46:09,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.59 | bwd: 4628.95 | bwd_inner: 4623.94 | bwd_allreduce: 4.95 | step: 44.94
- 24%|██▍       | 1412/5800 [3:59:39<8:26:44,  6.93s/it]                                                       {'loss': 0.0371, 'grad_norm': 7.930273532867432, 'learning_rate': 3.5408271395114475e-05, 'epoch': 12.17}
- 24%|██▍       | 1412/5800 [3:59:39<8:26:44,  6.93s/it]score1 tensor([[0.6016],
-        [0.5156],
-        [0.4727],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5195, 0.4785, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:16,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 12:46:16,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.90 | bwd_microstep: 4638.70 | bwd_inner_microstep: 4634.22 | bwd_allreduce_microstep: 4.40 | step_microstep: 40.63
-[2025-01-25 12:46:16,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.86 | bwd: 4638.72 | bwd_inner: 4634.22 | bwd_allreduce: 4.44 | step: 40.64
- 24%|██▍       | 1413/5800 [3:59:46<8:26:16,  6.92s/it]                                                       {'loss': 0.0337, 'grad_norm': 7.723071098327637, 'learning_rate': 3.5401148819099746e-05, 'epoch': 12.18}
- 24%|██▍       | 1413/5800 [3:59:46<8:26:16,  6.92s/it]score1 tensor([[0.5078],
-        [0.3633],
-        [0.4512],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4492, 0.4648, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:23,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 12:46:23,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4633.19 | bwd_inner_microstep: 4628.35 | bwd_allreduce_microstep: 4.73 | step_microstep: 45.85
-[2025-01-25 12:46:23,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.08 | bwd: 4633.21 | bwd_inner: 4628.35 | bwd_allreduce: 4.78 | step: 45.86
- 24%|██▍       | 1414/5800 [3:59:53<8:25:50,  6.92s/it]                                                       {'loss': 0.0542, 'grad_norm': 6.977697372436523, 'learning_rate': 3.539402144073958e-05, 'epoch': 12.19}
- 24%|██▍       | 1414/5800 [3:59:53<8:25:50,  6.92s/it]score1 tensor([[0.5742],
-        [0.4746],
-        [0.4023],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5352, 0.4219, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:30,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 12:46:30,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.21 | bwd_microstep: 4636.04 | bwd_inner_microstep: 4631.09 | bwd_allreduce_microstep: 4.82 | step_microstep: 46.35
-[2025-01-25 12:46:30,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.18 | bwd: 4636.08 | bwd_inner: 4631.09 | bwd_allreduce: 4.88 | step: 46.36
- 24%|██▍       | 1415/5800 [4:00:00<8:25:45,  6.92s/it]                                                       {'loss': 0.0601, 'grad_norm': 7.578367233276367, 'learning_rate': 3.538688926225642e-05, 'epoch': 12.2}
- 24%|██▍       | 1415/5800 [4:00:00<8:25:45,  6.92s/it]score1 tensor([[0.5117],
-        [0.6094],
-        [0.4961],
-        [0.3574]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6367, 0.5273, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:36,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 12:46:36,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4639.90 | bwd_inner_microstep: 4634.82 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.68
-[2025-01-25 12:46:36,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4639.93 | bwd_inner: 4634.82 | bwd_allreduce: 5.04 | step: 43.69
- 24%|██▍       | 1416/5800 [4:00:06<8:25:56,  6.92s/it]                                                       {'loss': 0.0356, 'grad_norm': 4.533538818359375, 'learning_rate': 3.53797522858742e-05, 'epoch': 12.21}
- 24%|██▍       | 1416/5800 [4:00:06<8:25:56,  6.92s/it]score1 tensor([[0.4590],
-        [0.4375],
-        [0.5781],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4297, 0.6133, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:43,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.61 | optimizer_step: 5.23
-[2025-01-25 12:46:43,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4639.84 | bwd_inner_microstep: 4632.56 | bwd_allreduce_microstep: 6.98 | step_microstep: 92.53
-[2025-01-25 12:46:43,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.24 | bwd: 4639.90 | bwd_inner: 4632.56 | bwd_allreduce: 7.16 | step: 92.56
- 24%|██▍       | 1417/5800 [4:00:13<8:27:29,  6.95s/it]                                                       {'loss': 0.0288, 'grad_norm': 4.010669231414795, 'learning_rate': 3.537261051381836e-05, 'epoch': 12.22}
- 24%|██▍       | 1417/5800 [4:00:13<8:27:29,  6.95s/it]score1 tensor([[0.4473],
-        [0.5664],
-        [0.5117],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.5781, 0.5391, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:50,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 12:46:50,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.33 | bwd_microstep: 4645.18 | bwd_inner_microstep: 4639.87 | bwd_allreduce_microstep: 5.22 | step_microstep: 58.25
-[2025-01-25 12:46:50,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.28 | bwd: 4645.21 | bwd_inner: 4639.87 | bwd_allreduce: 5.27 | step: 58.26
- 24%|██▍       | 1418/5800 [4:00:20<8:27:49,  6.95s/it]                                                       {'loss': 0.0366, 'grad_norm': 7.65838623046875, 'learning_rate': 3.536546394831582e-05, 'epoch': 12.22}
- 24%|██▍       | 1418/5800 [4:00:20<8:27:49,  6.95s/it]score1 tensor([[0.5156],
-        [0.5820],
-        [0.5547],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5820, 0.4980, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:46:57,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 12:46:57,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.34 | bwd_microstep: 4576.37 | bwd_inner_microstep: 4571.19 | bwd_allreduce_microstep: 5.08 | step_microstep: 42.86
-[2025-01-25 12:46:57,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.30 | bwd: 4576.39 | bwd_inner: 4571.19 | bwd_allreduce: 5.13 | step: 42.87
- 24%|██▍       | 1419/5800 [4:00:27<8:25:38,  6.93s/it]                                                       {'loss': 0.0278, 'grad_norm': 5.773011207580566, 'learning_rate': 3.5358312591595006e-05, 'epoch': 12.23}
- 24%|██▍       | 1419/5800 [4:00:27<8:25:38,  6.93s/it]score1 tensor([[0.4590],
-        [0.4434],
-        [0.5586],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4414, 0.5078, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:04,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:47:04,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4624.40 | bwd_inner_microstep: 4619.47 | bwd_allreduce_microstep: 4.82 | step_microstep: 45.84
-[2025-01-25 12:47:04,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.92 | bwd: 4624.42 | bwd_inner: 4619.47 | bwd_allreduce: 4.87 | step: 45.86
- 24%|██▍       | 1420/5800 [4:00:34<8:24:52,  6.92s/it]                                                       {'loss': 0.0215, 'grad_norm': 3.6716501712799072, 'learning_rate': 3.535115644588584e-05, 'epoch': 12.24}
- 24%|██▍       | 1420/5800 [4:00:34<8:24:52,  6.92s/it]score1 tensor([[0.4551],
-        [0.5000],
-        [0.6289],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4688, 0.6094, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:11,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 12:47:11,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.52 | bwd_microstep: 4625.43 | bwd_inner_microstep: 4619.92 | bwd_allreduce_microstep: 5.40 | step_microstep: 44.44
-[2025-01-25 12:47:11,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.49 | bwd: 4625.47 | bwd_inner: 4619.92 | bwd_allreduce: 5.46 | step: 44.45
- 24%|██▍       | 1421/5800 [4:00:41<8:24:37,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 7.921080112457275, 'learning_rate': 3.534399551341972e-05, 'epoch': 12.25}
- 24%|██▍       | 1421/5800 [4:00:41<8:24:37,  6.91s/it]score1 tensor([[0.5000],
-        [0.5234],
-        [0.5820],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4863, 0.5625, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:18,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 12:47:18,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.48 | bwd_microstep: 4635.55 | bwd_inner_microstep: 4630.21 | bwd_allreduce_microstep: 5.25 | step_microstep: 45.13
-[2025-01-25 12:47:18,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.44 | bwd: 4635.58 | bwd_inner: 4630.21 | bwd_allreduce: 5.30 | step: 45.13
- 25%|██▍       | 1422/5800 [4:00:48<8:24:34,  6.92s/it]                                                       {'loss': 0.0298, 'grad_norm': 4.1977715492248535, 'learning_rate': 3.533682979642957e-05, 'epoch': 12.26}
- 25%|██▍       | 1422/5800 [4:00:48<8:24:34,  6.92s/it]score1 tensor([[0.5625],
-        [0.4941],
-        [0.4668],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4785, 0.5039, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:25,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 12:47:25,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.52 | bwd_microstep: 4631.48 | bwd_inner_microstep: 4626.45 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.24
-[2025-01-25 12:47:25,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.49 | bwd: 4631.50 | bwd_inner: 4626.45 | bwd_allreduce: 4.98 | step: 43.25
- 25%|██▍       | 1423/5800 [4:00:55<8:24:23,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 3.701340436935425, 'learning_rate': 3.5329659297149766e-05, 'epoch': 12.27}
- 25%|██▍       | 1423/5800 [4:00:55<8:24:23,  6.91s/it]score1 tensor([[0.3945],
-        [0.5898],
-        [0.6289],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.5664, 0.6875, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:32,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 12:47:32,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.36 | bwd_microstep: 4627.33 | bwd_inner_microstep: 4622.43 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.74
-[2025-01-25 12:47:32,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4627.35 | bwd_inner: 4622.43 | bwd_allreduce: 4.85 | step: 42.75
- 25%|██▍       | 1424/5800 [4:01:02<8:24:04,  6.91s/it]                                                       {'loss': 0.0322, 'grad_norm': 3.532071352005005, 'learning_rate': 3.53224840178162e-05, 'epoch': 12.28}
- 25%|██▍       | 1424/5800 [4:01:02<8:24:04,  6.91s/it]score1 tensor([[0.5742],
-        [0.5391],
-        [0.4434],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6484, 0.4473, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:39,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 12:47:39,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4629.99 | bwd_inner_microstep: 4624.38 | bwd_allreduce_microstep: 5.48 | step_microstep: 48.27
-[2025-01-25 12:47:39,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4630.04 | bwd_inner: 4624.38 | bwd_allreduce: 5.55 | step: 48.27
- 25%|██▍       | 1425/5800 [4:01:09<8:24:05,  6.91s/it]                                                       {'loss': 0.0488, 'grad_norm': 7.755043029785156, 'learning_rate': 3.531530396066625e-05, 'epoch': 12.28}
- 25%|██▍       | 1425/5800 [4:01:09<8:24:05,  6.91s/it]score1 tensor([[0.4219],
-        [0.4902],
-        [0.4844],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6484, 0.5508, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0933, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:46,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.32 | optimizer_step: 4.36
-[2025-01-25 12:47:46,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4637.31 | bwd_inner_microstep: 4630.09 | bwd_allreduce_microstep: 7.11 | step_microstep: 56.91
-[2025-01-25 12:47:46,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.28 | bwd: 4637.33 | bwd_inner: 4630.09 | bwd_allreduce: 7.16 | step: 56.92
- 25%|█��▍       | 1426/5800 [4:01:16<8:24:30,  6.92s/it]                                                       {'loss': 0.0933, 'grad_norm': 7.384828090667725, 'learning_rate': 3.530811912793878e-05, 'epoch': 12.29}
- 25%|██▍       | 1426/5800 [4:01:16<8:24:30,  6.92s/it]score1 tensor([[0.5273],
-        [0.3281],
-        [0.5820],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.3789, 0.6367, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:47:53,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 12:47:53,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4634.01 | bwd_inner_microstep: 4628.87 | bwd_allreduce_microstep: 5.02 | step_microstep: 56.35
-[2025-01-25 12:47:53,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4634.04 | bwd_inner: 4628.87 | bwd_allreduce: 5.08 | step: 56.39
- 25%|██▍       | 1427/5800 [4:01:23<8:25:41,  6.94s/it]                                                       {'loss': 0.0605, 'grad_norm': 7.301475524902344, 'learning_rate': 3.530092952187415e-05, 'epoch': 12.3}
- 25%|██▍       | 1427/5800 [4:01:23<8:25:41,  6.94s/it]score1 tensor([[0.5273],
-        [0.4141],
-        [0.5820],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4863, 0.6250, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:00,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 12:48:00,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.11 | bwd_microstep: 4640.95 | bwd_inner_microstep: 4635.97 | bwd_allreduce_microstep: 4.87 | step_microstep: 46.40
-[2025-01-25 12:48:00,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.07 | bwd: 4640.97 | bwd_inner: 4635.97 | bwd_allreduce: 4.92 | step: 46.41
- 25%|██▍       | 1428/5800 [4:01:30<8:26:46,  6.95s/it]                                                       {'loss': 0.063, 'grad_norm': 7.7766828536987305, 'learning_rate': 3.5293735144714196e-05, 'epoch': 12.31}
- 25%|██▍       | 1428/5800 [4:01:30<8:26:46,  6.95s/it]score1 tensor([[0.5156],
-        [0.3008],
-        [0.3262],
-        [0.3066]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3418, 0.3926, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:07,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 12:48:07,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.68 | bwd_microstep: 4633.72 | bwd_inner_microstep: 4623.85 | bwd_allreduce_microstep: 9.70 | step_microstep: 52.71
-[2025-01-25 12:48:07,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.62 | bwd: 4633.76 | bwd_inner: 4623.85 | bwd_allreduce: 9.79 | step: 52.68
- 25%|██▍       | 1429/5800 [4:01:37<8:26:39,  6.95s/it]                                                       {'loss': 0.0513, 'grad_norm': 2.6504578590393066, 'learning_rate': 3.528653599870226e-05, 'epoch': 12.32}
- 25%|██▍       | 1429/5800 [4:01:37<8:26:39,  6.95s/it]score1 tensor([[0.4316],
-        [0.4023],
-        [0.6016],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4473, 0.6094, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:14,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.55 | optimizer_step: 4.36
-[2025-01-25 12:48:14,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4630.58 | bwd_inner_microstep: 4623.58 | bwd_allreduce_microstep: 6.89 | step_microstep: 57.31
-[2025-01-25 12:48:14,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4630.60 | bwd_inner: 4623.58 | bwd_allreduce: 6.95 | step: 57.33
- 25%|██▍       | 1430/5800 [4:01:44<8:26:57,  6.96s/it]                                                       {'loss': 0.0483, 'grad_norm': 7.183610916137695, 'learning_rate': 3.5279332086083146e-05, 'epoch': 12.33}
- 25%|██▍       | 1430/5800 [4:01:44<8:26:57,  6.96s/it]score1 tensor([[0.4609],
-        [0.5195],
-        [0.3359],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4941, 0.3086, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:21,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 12:48:21,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.53 | bwd_microstep: 4628.18 | bwd_inner_microstep: 4622.99 | bwd_allreduce_microstep: 5.10 | step_microstep: 46.28
-[2025-01-25 12:48:21,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.49 | bwd: 4628.20 | bwd_inner: 4622.99 | bwd_allreduce: 5.14 | step: 46.29
- 25%|██▍       | 1431/5800 [4:01:51<8:26:45,  6.96s/it]                                                       {'loss': 0.0166, 'grad_norm': 6.967432975769043, 'learning_rate': 3.527212340910318e-05, 'epoch': 12.34}
- 25%|██▍       | 1431/5800 [4:01:51<8:26:45,  6.96s/it]score1 tensor([[0.4824],
-        [0.4668],
-        [0.5234],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.4277, 0.5391, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:27,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 12:48:27,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.87 | bwd_microstep: 4635.63 | bwd_inner_microstep: 4629.45 | bwd_allreduce_microstep: 6.06 | step_microstep: 46.88
-[2025-01-25 12:48:27,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.83 | bwd: 4635.66 | bwd_inner: 4629.45 | bwd_allreduce: 6.14 | step: 46.89
- 25%|██▍       | 1432/5800 [4:01:57<8:26:26,  6.96s/it]                                                       {'loss': 0.0361, 'grad_norm': 3.7923743724823, 'learning_rate': 3.526490997001014e-05, 'epoch': 12.34}
- 25%|██▍       | 1432/5800 [4:01:57<8:26:26,  6.96s/it]score1 tensor([[0.5898],
-        [0.5781],
-        [0.5547],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5352, 0.5469, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:34,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 9.17 | optimizer_step: 4.62
-[2025-01-25 12:48:34,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.33 | bwd_microstep: 4632.26 | bwd_inner_microstep: 4623.87 | bwd_allreduce_microstep: 8.26 | step_microstep: 64.67
-[2025-01-25 12:48:34,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.29 | bwd: 4632.29 | bwd_inner: 4623.87 | bwd_allreduce: 8.32 | step: 64.66
- 25%|██▍       | 1433/5800 [4:02:04<8:26:34,  6.96s/it]                                                       {'loss': 0.0332, 'grad_norm': 3.8653078079223633, 'learning_rate': 3.525769177105329e-05, 'epoch': 12.35}
- 25%|██▍       | 1433/5800 [4:02:04<8:26:34,  6.96s/it]score1 tensor([[0.5938],
-        [0.5273],
-        [0.5352],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.5234, 0.6172, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:41,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.30 | optimizer_step: 4.36
-[2025-01-25 12:48:41,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.63 | bwd_microstep: 4626.20 | bwd_inner_microstep: 4620.93 | bwd_allreduce_microstep: 5.18 | step_microstep: 41.62
-[2025-01-25 12:48:41,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.60 | bwd: 4626.22 | bwd_inner: 4620.93 | bwd_allreduce: 5.22 | step: 41.63
- 25%|██▍       | 1434/5800 [4:02:11<8:25:54,  6.95s/it]                                                       {'loss': 0.0264, 'grad_norm': 3.970933437347412, 'learning_rate': 3.525046881448341e-05, 'epoch': 12.36}
- 25%|██▍       | 1434/5800 [4:02:11<8:25:54,  6.95s/it]score1 tensor([[0.4941],
-        [0.4844],
-        [0.4863],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4668, 0.3809, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:48,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.21 | optimizer_step: 4.43
-[2025-01-25 12:48:48,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.49 | bwd_microstep: 4633.01 | bwd_inner_microstep: 4627.42 | bwd_allreduce_microstep: 5.46 | step_microstep: 56.52
-[2025-01-25 12:48:48,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.45 | bwd: 4633.04 | bwd_inner: 4627.43 | bwd_allreduce: 5.52 | step: 56.54
- 25%|██▍       | 1435/5800 [4:02:18<8:25:51,  6.95s/it]                                                       {'loss': 0.0527, 'grad_norm': 7.4265666007995605, 'learning_rate': 3.524324110255273e-05, 'epoch': 12.37}
- 25%|██▍       | 1435/5800 [4:02:18<8:25:51,  6.95s/it]score1 tensor([[0.7500],
-        [0.4902],
-        [0.4727],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4062, 0.4824, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:48:55,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.37
-[2025-01-25 12:48:55,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.29 | bwd_microstep: 4642.15 | bwd_inner_microstep: 4633.24 | bwd_allreduce_microstep: 8.66 | step_microstep: 56.78
-[2025-01-25 12:48:55,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.26 | bwd: 4642.20 | bwd_inner: 4633.24 | bwd_allreduce: 8.79 | step: 56.75
- 25%|██▍       | 1436/5800 [4:02:25<8:25:52,  6.96s/it]                                                       {'loss': 0.0449, 'grad_norm': 4.739851951599121, 'learning_rate': 3.5236008637514985e-05, 'epoch': 12.38}
- 25%|██▍       | 1436/5800 [4:02:25<8:25:52,  6.96s/it]score1 tensor([[0.5742],
-        [0.5625],
-        [0.5078],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6875, 0.4551, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:02,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 12:49:02,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.95 | bwd_microstep: 4631.44 | bwd_inner_microstep: 4625.88 | bwd_allreduce_microstep: 5.42 | step_microstep: 44.62
-[2025-01-25 12:49:02,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.87 | bwd: 4631.47 | bwd_inner: 4625.88 | bwd_allreduce: 5.48 | step: 44.62
- 25%|██▍       | 1437/5800 [4:02:32<8:25:13,  6.95s/it]                                                       {'loss': 0.0659, 'grad_norm': 4.127861499786377, 'learning_rate': 3.5228771421625386e-05, 'epoch': 12.39}
- 25%|██▍       | 1437/5800 [4:02:32<8:25:13,  6.95s/it]score1 tensor([[0.4121],
-        [0.6133],
-        [0.5703],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.6328, 0.5352, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:09,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 12:49:09,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.60 | bwd_microstep: 4638.16 | bwd_inner_microstep: 4632.48 | bwd_allreduce_microstep: 5.59 | step_microstep: 47.46
-[2025-01-25 12:49:09,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.56 | bwd: 4638.19 | bwd_inner: 4632.48 | bwd_allreduce: 5.64 | step: 47.50
- 25%|██▍       | 1438/5800 [4:02:39<8:24:46,  6.94s/it]                                                       {'loss': 0.0205, 'grad_norm': 0.6031486392021179, 'learning_rate': 3.5221529457140606e-05, 'epoch': 12.4}
- 25%|██▍       | 1438/5800 [4:02:39<8:24:46,  6.94s/it]score1 tensor([[0.5352],
-        [0.4902],
-        [0.5273],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4531, 0.5508, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:16,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 12:49:16,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4638.11 | bwd_inner_microstep: 4633.13 | bwd_allreduce_microstep: 4.89 | step_microstep: 46.08
-[2025-01-25 12:49:16,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.57 | bwd: 4638.14 | bwd_inner: 4633.12 | bwd_allreduce: 4.94 | step: 46.09
- 25%|██▍       | 1439/5800 [4:02:46<8:23:59,  6.93s/it]                                                       {'loss': 0.0234, 'grad_norm': 3.8939969539642334, 'learning_rate': 3.521428274631882e-05, 'epoch': 12.41}
- 25%|██▍       | 1439/5800 [4:02:46<8:23:59,  6.93s/it]score1 tensor([[0.4805],
-        [0.4551],
-        [0.4219],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4941, 0.4551, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:23,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 12:49:23,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.40 | bwd_microstep: 4645.84 | bwd_inner_microstep: 4640.20 | bwd_allreduce_microstep: 5.50 | step_microstep: 48.96
-[2025-01-25 12:49:23,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.37 | bwd: 4645.86 | bwd_inner: 4640.20 | bwd_allreduce: 5.59 | step: 48.96
- 25%|██▍       | 1440/5800 [4:02:53<8:24:04,  6.94s/it]                                                       {'loss': 0.0259, 'grad_norm': 7.100776195526123, 'learning_rate': 3.5207031291419695e-05, 'epoch': 12.41}
- 25%|██▍       | 1440/5800 [4:02:53<8:24:04,  6.94s/it]score1 tensor([[0.4160],
-        [0.4902],
-        [0.5273],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5039, 0.5781, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:30,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 12:49:30,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.96 | bwd_microstep: 4638.98 | bwd_inner_microstep: 4634.70 | bwd_allreduce_microstep: 4.21 | step_microstep: 42.63
-[2025-01-25 12:49:30,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.90 | bwd: 4638.99 | bwd_inner: 4634.70 | bwd_allreduce: 4.24 | step: 42.64
- 25%|██▍       | 1441/5800 [4:03:00<8:23:32,  6.93s/it]                                                       {'loss': 0.0479, 'grad_norm': 7.505616664886475, 'learning_rate': 3.519977509470435e-05, 'epoch': 12.42}
- 25%|██▍       | 1441/5800 [4:03:00<8:23:32,  6.93s/it]score1 tensor([[0.4746],
-        [0.4238],
-        [0.5117],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4629, 0.5469, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:37,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:49:37,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4642.15 | bwd_inner_microstep: 4637.39 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.27
-[2025-01-25 12:49:37,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.32 | bwd: 4642.18 | bwd_inner: 4637.39 | bwd_allreduce: 4.72 | step: 43.27
- 25%|██▍       | 1442/5800 [4:03:07<8:23:15,  6.93s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.84194278717041, 'learning_rate': 3.5192514158435375e-05, 'epoch': 12.43}
- 25%|██▍       | 1442/5800 [4:03:07<8:23:15,  6.93s/it]score1 tensor([[0.5547],
-        [0.5977],
-        [0.4434],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5820, 0.4609, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:44,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 12:49:44,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.98 | bwd_microstep: 4638.41 | bwd_inner_microstep: 4632.81 | bwd_allreduce_microstep: 5.52 | step_microstep: 46.66
-[2025-01-25 12:49:44,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.94 | bwd: 4638.43 | bwd_inner: 4632.81 | bwd_allreduce: 5.56 | step: 46.67
- 25%|██▍       | 1443/5800 [4:03:14<8:23:00,  6.93s/it]                                                       {'loss': 0.0239, 'grad_norm': 0.7757498621940613, 'learning_rate': 3.518524848487688e-05, 'epoch': 12.44}
- 25%|██▍       | 1443/5800 [4:03:14<8:23:00,  6.93s/it]score1 tensor([[0.6016],
-        [0.2910],
-        [0.6016],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.1787, 0.5469, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0588, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:51,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.35 | optimizer_step: 4.36
-[2025-01-25 12:49:51,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4639.98 | bwd_inner_microstep: 4630.35 | bwd_allreduce_microstep: 9.54 | step_microstep: 59.49
-[2025-01-25 12:49:51,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.16 | bwd: 4640.01 | bwd_inner: 4630.35 | bwd_allreduce: 9.59 | step: 59.51
- 25%|██▍       | 1444/5800 [4:03:21<8:23:19,  6.93s/it]                                                       {'loss': 0.0588, 'grad_norm': 7.058671951293945, 'learning_rate': 3.517797807629443e-05, 'epoch': 12.45}
- 25%|██▍       | 1444/5800 [4:03:21<8:23:19,  6.93s/it]score1 tensor([[0.6133],
-        [0.5859],
-        [0.6758],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5078, 0.6836, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:49:58,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 12:49:58,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.45 | bwd_microstep: 4640.91 | bwd_inner_microstep: 4632.05 | bwd_allreduce_microstep: 8.65 | step_microstep: 71.60
-[2025-01-25 12:49:58,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.37 | bwd: 4640.95 | bwd_inner: 4632.05 | bwd_allreduce: 8.76 | step: 71.61
- 25%|██▍       | 1445/5800 [4:03:28<8:23:57,  6.94s/it]                                                       {'loss': 0.0547, 'grad_norm': 4.2438812255859375, 'learning_rate': 3.5170702934955046e-05, 'epoch': 12.46}
- 25%|██▍       | 1445/5800 [4:03:28<8:23:57,  6.94s/it]score1 tensor([[0.5273],
-        [0.5859],
-        [0.7344],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5625, 0.7031, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:05,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 12:50:05,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.61 | bwd_microstep: 4636.62 | bwd_inner_microstep: 4631.59 | bwd_allreduce_microstep: 4.93 | step_microstep: 67.77
-[2025-01-25 12:50:05,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.57 | bwd: 4636.65 | bwd_inner: 4631.59 | bwd_allreduce: 4.98 | step: 67.81
- 25%|██▍       | 1446/5800 [4:03:35<8:24:12,  6.95s/it]                                                       {'loss': 0.041, 'grad_norm': 8.235174179077148, 'learning_rate': 3.516342306312726e-05, 'epoch': 12.47}
- 25%|██▍       | 1446/5800 [4:03:35<8:24:12,  6.95s/it]score1 tensor([[0.5469],
-        [0.5352],
-        [0.5234],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4355, 0.5117, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:12,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 12:50:12,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.32 | bwd_microstep: 4632.37 | bwd_inner_microstep: 4625.88 | bwd_allreduce_microstep: 6.40 | step_microstep: 53.60
-[2025-01-25 12:50:12,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.27 | bwd: 4632.42 | bwd_inner: 4625.88 | bwd_allreduce: 6.43 | step: 53.59
- 25%|██▍       | 1447/5800 [4:03:42<8:23:41,  6.94s/it]                                                       {'loss': 0.0376, 'grad_norm': 3.8042798042297363, 'learning_rate': 3.515613846308104e-05, 'epoch': 12.47}
- 25%|██▍       | 1447/5800 [4:03:42<8:23:41,  6.94s/it]score1 tensor([[0.5117],
-        [0.4883],
-        [0.5117],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4629, 0.4785, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:19,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 12:50:19,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.48 | bwd_microstep: 4633.85 | bwd_inner_microstep: 4628.56 | bwd_allreduce_microstep: 5.17 | step_microstep: 46.83
-[2025-01-25 12:50:19,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.44 | bwd: 4633.89 | bwd_inner: 4628.56 | bwd_allreduce: 5.22 | step: 46.84
- 25%|██▍       | 1448/5800 [4:03:49<8:23:10,  6.94s/it]                                                       {'loss': 0.0225, 'grad_norm': 3.6583664417266846, 'learning_rate': 3.5148849137087877e-05, 'epoch': 12.48}
- 25%|██▍       | 1448/5800 [4:03:49<8:23:10,  6.94s/it]score1 tensor([[0.5469],
-        [0.5781],
-        [0.4375],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6445, 0.4395, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:25,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 12:50:25,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.99 | bwd_microstep: 4631.43 | bwd_inner_microstep: 4626.36 | bwd_allreduce_microstep: 4.98 | step_microstep: 42.94
-[2025-01-25 12:50:25,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.93 | bwd: 4631.45 | bwd_inner: 4626.36 | bwd_allreduce: 5.03 | step: 42.94
- 25%|██▍       | 1449/5800 [4:03:55<8:22:38,  6.93s/it]                                                       {'loss': 0.0464, 'grad_norm': 3.8752658367156982, 'learning_rate': 3.51415550874207e-05, 'epoch': 12.49}
- 25%|██▍       | 1449/5800 [4:03:55<8:22:38,  6.93s/it]score1 tensor([[0.4883],
-        [0.3945],
-        [0.4043],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.3789, 0.4512, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:32,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 12:50:32,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.73 | bwd_microstep: 4643.67 | bwd_inner_microstep: 4637.74 | bwd_allreduce_microstep: 5.81 | step_microstep: 47.28
-[2025-01-25 12:50:32,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.70 | bwd: 4643.69 | bwd_inner: 4637.75 | bwd_allreduce: 5.88 | step: 47.29
- 25%|██▌       | 1450/5800 [4:04:02<8:22:38,  6.93s/it]                                                       {'loss': 0.0405, 'grad_norm': 3.672720193862915, 'learning_rate': 3.513425631635391e-05, 'epoch': 12.5}
- 25%|██▌       | 1450/5800 [4:04:02<8:22:38,  6.93s/it]score1 tensor([[0.4922],
-        [0.4492],
-        [0.5078],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4941, 0.5430, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:39,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:50:39,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.84 | bwd_microstep: 4642.07 | bwd_inner_microstep: 4636.92 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.27
-[2025-01-25 12:50:39,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.80 | bwd: 4642.09 | bwd_inner: 4636.92 | bwd_allreduce: 5.11 | step: 44.27
- 25%|██▌       | 1451/5800 [4:04:09<8:22:16,  6.93s/it]                                                       {'loss': 0.0356, 'grad_norm': 7.384993553161621, 'learning_rate': 3.51269528261634e-05, 'epoch': 12.51}
- 25%|██▌       | 1451/5800 [4:04:09<8:22:16,  6.93s/it]score1 tensor([[0.5312],
-        [0.4043],
-        [0.4297],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3652, 0.4121, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:46,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 12:50:46,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.31 | bwd_microstep: 4648.25 | bwd_inner_microstep: 4643.27 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.02
-[2025-01-25 12:50:46,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.24 | bwd: 4648.28 | bwd_inner: 4643.27 | bwd_allreduce: 4.93 | step: 45.02
- 25%|██▌       | 1452/5800 [4:04:16<8:22:12,  6.93s/it]                                                       {'loss': 0.0337, 'grad_norm': 3.456650972366333, 'learning_rate': 3.5119644619126524e-05, 'epoch': 12.52}
- 25%|██▌       | 1452/5800 [4:04:16<8:22:12,  6.93s/it]score1 tensor([[0.4160],
-        [0.4082],
-        [0.4883],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4160, 0.5156, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:50:53,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 12:50:53,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.41 | bwd_microstep: 4635.78 | bwd_inner_microstep: 4629.44 | bwd_allreduce_microstep: 6.25 | step_microstep: 52.43
-[2025-01-25 12:50:53,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.37 | bwd: 4635.81 | bwd_inner: 4629.44 | bwd_allreduce: 6.30 | step: 52.44
- 25%|██▌       | 1453/5800 [4:04:23<8:22:13,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 3.4448447227478027, 'learning_rate': 3.511233169752211e-05, 'epoch': 12.53}
- 25%|██▌       | 1453/5800 [4:04:23<8:22:13,  6.93s/it]score1 tensor([[0.3750],
-        [0.4902],
-        [0.5273],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5664, 0.5703, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:00,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 12:51:00,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.65 | bwd_microstep: 4581.00 | bwd_inner_microstep: 4576.01 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.09
-[2025-01-25 12:51:00,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.59 | bwd: 4581.02 | bwd_inner: 4576.01 | bwd_allreduce: 4.94 | step: 43.09
- 25%|██▌       | 1454/5800 [4:04:30<8:21:00,  6.92s/it]                                                       {'loss': 0.0386, 'grad_norm': 5.8348798751831055, 'learning_rate': 3.5105014063630445e-05, 'epoch': 12.53}
- 25%|██▌       | 1454/5800 [4:04:30<8:21:00,  6.92s/it]score1 tensor([[0.6250],
-        [0.5273],
-        [0.5195],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5586, 0.5273, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:07,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.16 | optimizer_step: 4.37
-[2025-01-25 12:51:07,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2174.55 | bwd_microstep: 4592.82 | bwd_inner_microstep: 4588.06 | bwd_allreduce_microstep: 4.66 | step_microstep: 56.38
-[2025-01-25 12:51:07,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2174.52 | bwd: 4592.84 | bwd_inner: 4588.06 | bwd_allreduce: 4.71 | step: 56.38
- 25%|██▌       | 1455/5800 [4:04:37<8:21:13,  6.92s/it]                                                       {'loss': 0.0127, 'grad_norm': 1.6840547323226929, 'learning_rate': 3.5097691719733304e-05, 'epoch': 12.54}
- 25%|██▌       | 1455/5800 [4:04:37<8:21:13,  6.92s/it]score1 tensor([[0.6289],
-        [0.5352],
-        [0.5117],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4883, 0.4785, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:14,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 12:51:14,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.13 | bwd_microstep: 4651.86 | bwd_inner_microstep: 4646.22 | bwd_allreduce_microstep: 5.55 | step_microstep: 66.20
-[2025-01-25 12:51:14,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.09 | bwd: 4651.89 | bwd_inner: 4646.22 | bwd_allreduce: 5.59 | step: 66.25
- 25%|██▌       | 1456/5800 [4:04:44<8:22:24,  6.94s/it]                                                       {'loss': 0.0288, 'grad_norm': 4.0785956382751465, 'learning_rate': 3.5090364668113914e-05, 'epoch': 12.55}
- 25%|██▌       | 1456/5800 [4:04:44<8:22:24,  6.94s/it]score1 tensor([[0.6406],
-        [0.5117],
-        [0.4277],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4922, 0.3516, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:21,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 12:51:21,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4632.77 | bwd_inner_microstep: 4627.74 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.53
-[2025-01-25 12:51:21,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.95 | bwd: 4632.79 | bwd_inner: 4627.74 | bwd_allreduce: 4.99 | step: 44.54
- 25%|██▌       | 1457/5800 [4:04:51<8:22:13,  6.94s/it]                                                       {'loss': 0.0361, 'grad_norm': 7.525078773498535, 'learning_rate': 3.5083032911056987e-05, 'epoch': 12.56}
- 25%|██▌       | 1457/5800 [4:04:51<8:22:13,  6.94s/it]score1 tensor([[0.5273],
-        [0.5156],
-        [0.5430],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4512, 0.5039, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:28,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 12:51:28,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.29 | bwd_microstep: 4642.66 | bwd_inner_microstep: 4637.59 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.96
-[2025-01-25 12:51:28,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.23 | bwd: 4642.68 | bwd_inner: 4637.59 | bwd_allreduce: 5.03 | step: 43.97
- 25%|██▌       | 1458/5800 [4:04:58<8:21:56,  6.94s/it]                                                       {'loss': 0.0332, 'grad_norm': 7.5338850021362305, 'learning_rate': 3.507569645084868e-05, 'epoch': 12.57}
- 25%|██▌       | 1458/5800 [4:04:58<8:21:56,  6.94s/it]score1 tensor([[0.5039],
-        [0.6484],
-        [0.4902],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6289, 0.3750, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:35,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 12:51:35,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.60 | bwd_microstep: 4643.62 | bwd_inner_microstep: 4638.77 | bwd_allreduce_microstep: 4.77 | step_microstep: 45.28
-[2025-01-25 12:51:35,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.55 | bwd: 4643.65 | bwd_inner: 4638.77 | bwd_allreduce: 4.81 | step: 45.30
- 25%|██▌       | 1459/5800 [4:05:05<8:21:35,  6.93s/it]                                                       {'loss': 0.0527, 'grad_norm': 3.8059725761413574, 'learning_rate': 3.5068355289776634e-05, 'epoch': 12.58}
- 25%|██▌       | 1459/5800 [4:05:05<8:21:35,  6.93s/it]score1 tensor([[0.4746],
-        [0.4160],
-        [0.5859],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.3652, 0.6094, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:42,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:51:42,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.59 | bwd_microstep: 4633.93 | bwd_inner_microstep: 4629.02 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.69
-[2025-01-25 12:51:42,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.54 | bwd: 4633.96 | bwd_inner: 4629.01 | bwd_allreduce: 4.87 | step: 42.71
- 25%|██▌       | 1460/5800 [4:05:12<8:21:14,  6.93s/it]                                                       {'loss': 0.0391, 'grad_norm': 3.3076283931732178, 'learning_rate': 3.5061009430129944e-05, 'epoch': 12.59}
- 25%|██▌       | 1460/5800 [4:05:12<8:21:14,  6.93s/it]score1 tensor([[0.4902],
-        [0.5273],
-        [0.5195],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5977, 0.4648, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:49,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 12:51:49,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.26 | bwd_microstep: 4591.43 | bwd_inner_microstep: 4586.41 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.64
-[2025-01-25 12:51:49,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.22 | bwd: 4591.46 | bwd_inner: 4586.41 | bwd_allreduce: 4.97 | step: 43.65
- 25%|██▌       | 1461/5800 [4:05:19<8:20:00,  6.91s/it]                                                       {'loss': 0.0327, 'grad_norm': 1.8988497257232666, 'learning_rate': 3.5053658874199196e-05, 'epoch': 12.59}
- 25%|██▌       | 1461/5800 [4:05:19<8:20:00,  6.91s/it]score1 tensor([[0.4941],
-        [0.3340],
-        [0.4727],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.3613, 0.5000, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:51:55,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.65 | optimizer_step: 4.36
-[2025-01-25 12:51:55,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.02 | bwd_microstep: 4633.61 | bwd_inner_microstep: 4628.20 | bwd_allreduce_microstep: 5.27 | step_microstep: 50.46
-[2025-01-25 12:51:55,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.99 | bwd: 4633.64 | bwd_inner: 4628.20 | bwd_allreduce: 5.34 | step: 50.47
- 25%|██▌       | 1462/5800 [4:05:25<8:20:08,  6.92s/it]                                                       {'loss': 0.0317, 'grad_norm': 6.790613174438477, 'learning_rate': 3.504630362427639e-05, 'epoch': 12.6}
- 25%|██▌       | 1462/5800 [4:05:25<8:20:08,  6.92s/it]score1 tensor([[0.4434],
-        [0.4180],
-        [0.4375],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4277, 0.5000, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:02,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.51 | optimizer_step: 4.36
-[2025-01-25 12:52:02,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.28 | bwd_microstep: 4652.07 | bwd_inner_microstep: 4646.05 | bwd_allreduce_microstep: 5.88 | step_microstep: 53.58
-[2025-01-25 12:52:02,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.24 | bwd: 4652.10 | bwd_inner: 4646.05 | bwd_allreduce: 5.96 | step: 53.60
- 25%|██▌       | 1463/5800 [4:05:32<8:20:51,  6.93s/it]                                                       {'loss': 0.0601, 'grad_norm': 7.01182746887207, 'learning_rate': 3.5038943682655055e-05, 'epoch': 12.61}
- 25%|██▌       | 1463/5800 [4:05:32<8:20:51,  6.93s/it]score1 tensor([[0.4902],
-        [0.4102],
-        [0.4434],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4551, 0.4336, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:09,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.37 | optimizer_step: 4.36
-[2025-01-25 12:52:09,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.45 | bwd_microstep: 4637.09 | bwd_inner_microstep: 4626.80 | bwd_allreduce_microstep: 10.21 | step_microstep: 68.95
-[2025-01-25 12:52:09,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.40 | bwd: 4637.11 | bwd_inner: 4626.80 | bwd_allreduce: 10.25 | step: 69.02
- 25%|██▌       | 1464/5800 [4:05:39<8:21:42,  6.94s/it]                                                       {'loss': 0.0215, 'grad_norm': 0.362703412771225, 'learning_rate': 3.503157905163012e-05, 'epoch': 12.62}
- 25%|██▌       | 1464/5800 [4:05:39<8:21:42,  6.94s/it]score1 tensor([[0.5664],
-        [0.5938],
-        [0.5508],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.6719, 0.6406, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0835, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:16,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.57 | optimizer_step: 4.44
-[2025-01-25 12:52:16,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.54 | bwd_microstep: 4639.96 | bwd_inner_microstep: 4634.88 | bwd_allreduce_microstep: 4.98 | step_microstep: 66.87
-[2025-01-25 12:52:16,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.50 | bwd: 4639.98 | bwd_inner: 4634.88 | bwd_allreduce: 5.03 | step: 66.88
- 25%|██▌       | 1465/5800 [4:05:46<8:21:56,  6.95s/it]                                                       {'loss': 0.0835, 'grad_norm': 7.683000087738037, 'learning_rate': 3.502420973349802e-05, 'epoch': 12.63}
- 25%|██▌       | 1465/5800 [4:05:46<8:21:56,  6.95s/it]score1 tensor([[0.3691],
-        [0.4316],
-        [0.5898],
-        [0.3379]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4844, 0.6641, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:23,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 12:52:23,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4633.17 | bwd_inner_microstep: 4627.40 | bwd_allreduce_microstep: 5.68 | step_microstep: 46.38
-[2025-01-25 12:52:23,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4633.19 | bwd_inner: 4627.40 | bwd_allreduce: 5.73 | step: 46.39
- 25%|██▌       | 1466/5800 [4:05:53<8:21:09,  6.94s/it]                                                       {'loss': 0.0474, 'grad_norm': 6.704880714416504, 'learning_rate': 3.5016835730556636e-05, 'epoch': 12.64}
- 25%|██▌       | 1466/5800 [4:05:53<8:21:09,  6.94s/it]score1 tensor([[0.3340],
-        [0.4473],
-        [0.3789],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.4590, 0.3672, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:30,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.43 | optimizer_step: 4.36
-[2025-01-25 12:52:30,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.08 | bwd_microstep: 4641.59 | bwd_inner_microstep: 4634.08 | bwd_allreduce_microstep: 7.36 | step_microstep: 80.40
-[2025-01-25 12:52:30,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.04 | bwd: 4641.68 | bwd_inner: 4634.08 | bwd_allreduce: 7.44 | step: 80.42
- 25%|██▌       | 1467/5800 [4:06:00<8:21:48,  6.95s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.402255117893219, 'learning_rate': 3.50094570451053e-05, 'epoch': 12.65}
- 25%|██▌       | 1467/5800 [4:06:00<8:21:48,  6.95s/it]score1 tensor([[0.4922],
-        [0.4473],
-        [0.4531],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4746, 0.4941, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:37,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 12:52:37,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4641.32 | bwd_inner_microstep: 4636.25 | bwd_allreduce_microstep: 4.99 | step_microstep: 45.06
-[2025-01-25 12:52:37,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4641.35 | bwd_inner: 4636.25 | bwd_allreduce: 5.03 | step: 45.08
- 25%|██▌       | 1468/5800 [4:06:07<8:21:36,  6.95s/it]                                                       {'loss': 0.0552, 'grad_norm': 7.1551923751831055, 'learning_rate': 3.500207367944482e-05, 'epoch': 12.66}
- 25%|██▌       | 1468/5800 [4:06:07<8:21:36,  6.95s/it]score1 tensor([[0.4082],
-        [0.4336],
-        [0.4863],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4492, 0.5156, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:44,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 12:52:44,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.68 | bwd_microstep: 4639.61 | bwd_inner_microstep: 4634.63 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.53
-[2025-01-25 12:52:44,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4639.63 | bwd_inner: 4634.64 | bwd_allreduce: 4.93 | step: 42.54
- 25%|██▌       | 1469/5800 [4:06:14<8:20:47,  6.94s/it]                                                       {'loss': 0.0254, 'grad_norm': 6.647860527038574, 'learning_rate': 3.499468563587747e-05, 'epoch': 12.66}
- 25%|██▌       | 1469/5800 [4:06:14<8:20:47,  6.94s/it]score1 tensor([[0.4238],
-        [0.6172],
-        [0.5117],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.6289, 0.5469, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:51,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 12:52:51,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4641.10 | bwd_inner_microstep: 4635.69 | bwd_allreduce_microstep: 5.32 | step_microstep: 49.72
-[2025-01-25 12:52:51,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.80 | bwd: 4641.12 | bwd_inner: 4635.69 | bwd_allreduce: 5.37 | step: 49.73
- 25%|██▌       | 1470/5800 [4:06:21<8:20:24,  6.93s/it]                                                       {'loss': 0.0127, 'grad_norm': 0.8506630659103394, 'learning_rate': 3.4987292916706944e-05, 'epoch': 12.67}
- 25%|██▌       | 1470/5800 [4:06:21<8:20:24,  6.93s/it]score1 tensor([[0.4492],
-        [0.4941],
-        [0.4922],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.5039, 0.4570, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:52:58,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 12:52:58,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4634.65 | bwd_inner_microstep: 4629.45 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.00
-[2025-01-25 12:52:58,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.97 | bwd: 4634.68 | bwd_inner: 4629.44 | bwd_allreduce: 5.16 | step: 44.01
- 25%|██▌       | 1471/5800 [4:06:28<8:19:58,  6.93s/it]                                                       {'loss': 0.0386, 'grad_norm': 0.6731001734733582, 'learning_rate': 3.497989552423844e-05, 'epoch': 12.68}
- 25%|██▌       | 1471/5800 [4:06:28<8:19:58,  6.93s/it]score1 tensor([[0.7070],
-        [0.4785],
-        [0.5195],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4492, 0.4062, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:05,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 12:53:05,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.46 | bwd_microstep: 4640.20 | bwd_inner_microstep: 4635.08 | bwd_allreduce_microstep: 5.04 | step_microstep: 44.81
-[2025-01-25 12:53:05,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.43 | bwd: 4640.22 | bwd_inner: 4635.07 | bwd_allreduce: 5.08 | step: 44.82
- 25%|██▌       | 1472/5800 [4:06:35<8:19:51,  6.93s/it]                                                       {'loss': 0.0679, 'grad_norm': 8.036713600158691, 'learning_rate': 3.497249346077859e-05, 'epoch': 12.69}
- 25%|██▌       | 1472/5800 [4:06:35<8:19:51,  6.93s/it]score1 tensor([[0.5820],
-        [0.5000],
-        [0.7695],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4297, 0.6953, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0986, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:12,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.46 | optimizer_step: 4.37
-[2025-01-25 12:53:12,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.07 | bwd_microstep: 4647.23 | bwd_inner_microstep: 4639.05 | bwd_allreduce_microstep: 7.90 | step_microstep: 57.36
-[2025-01-25 12:53:12,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.03 | bwd: 4647.26 | bwd_inner: 4639.05 | bwd_allreduce: 8.10 | step: 57.37
- 25%|██▌       | 1473/5800 [4:06:42<8:20:12,  6.94s/it]                                                       {'loss': 0.0986, 'grad_norm': 8.452479362487793, 'learning_rate': 3.496508672863548e-05, 'epoch': 12.7}
- 25%|██▌       | 1473/5800 [4:06:42<8:20:12,  6.94s/it]score1 tensor([[0.6758],
-        [0.5078],
-        [0.4902],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4473, 0.4355, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0708, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:19,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.51 | optimizer_step: 4.36
-[2025-01-25 12:53:19,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.20 | bwd_microstep: 4638.57 | bwd_inner_microstep: 4631.47 | bwd_allreduce_microstep: 6.99 | step_microstep: 58.12
-[2025-01-25 12:53:19,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.17 | bwd: 4638.62 | bwd_inner: 4631.47 | bwd_allreduce: 7.05 | step: 58.13
- 25%|██▌       | 1474/5800 [4:06:49<8:21:05,  6.95s/it]                                                       {'loss': 0.0708, 'grad_norm': 7.902544975280762, 'learning_rate': 3.495767533011866e-05, 'epoch': 12.71}
- 25%|██▌       | 1474/5800 [4:06:49<8:21:05,  6.95s/it]score1 tensor([[0.4902],
-        [0.5820],
-        [0.4492],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5430, 0.4668, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:26,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.18 | optimizer_step: 4.36
-[2025-01-25 12:53:26,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.94 | bwd_microstep: 4646.85 | bwd_inner_microstep: 4641.65 | bwd_allreduce_microstep: 5.10 | step_microstep: 58.38
-[2025-01-25 12:53:26,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.90 | bwd: 4646.87 | bwd_inner: 4641.65 | bwd_allreduce: 5.15 | step: 58.43
- 25%|██▌       | 1475/5800 [4:06:56<8:21:36,  6.96s/it]                                                       {'loss': 0.0435, 'grad_norm': 0.8779590725898743, 'learning_rate': 3.495025926753914e-05, 'epoch': 12.72}
- 25%|██▌       | 1475/5800 [4:06:56<8:21:36,  6.96s/it]score1 tensor([[0.6250],
-        [0.5781],
-        [0.4355],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5000, 0.3398, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:33,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:53:33,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.79 | bwd_microstep: 4636.06 | bwd_inner_microstep: 4630.97 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.89
-[2025-01-25 12:53:33,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.76 | bwd: 4636.08 | bwd_inner: 4630.97 | bwd_allreduce: 5.04 | step: 45.89
- 25%|██▌       | 1476/5800 [4:07:03<8:20:39,  6.95s/it]                                                       {'loss': 0.0688, 'grad_norm': 7.7245025634765625, 'learning_rate': 3.494283854320937e-05, 'epoch': 12.72}
- 25%|██▌       | 1476/5800 [4:07:03<8:20:39,  6.95s/it]score1 tensor([[0.5703],
-        [0.6406],
-        [0.5547],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6094, 0.5273, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:40,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 12:53:40,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.89 | bwd_microstep: 4639.39 | bwd_inner_microstep: 4634.74 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.61
-[2025-01-25 12:53:40,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.85 | bwd: 4639.42 | bwd_inner: 4634.74 | bwd_allreduce: 4.61 | step: 43.62
- 25%|██▌       | 1477/5800 [4:07:10<8:19:53,  6.94s/it]                                                       {'loss': 0.0298, 'grad_norm': 4.548588752746582, 'learning_rate': 3.493541315944326e-05, 'epoch': 12.73}
- 25%|██▌       | 1477/5800 [4:07:10<8:19:53,  6.94s/it]score1 tensor([[0.6406],
-        [0.4355],
-        [0.5469],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4199, 0.5430, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:47,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 12:53:47,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.75 | bwd_microstep: 4632.87 | bwd_inner_microstep: 4627.82 | bwd_allreduce_microstep: 4.95 | step_microstep: 45.75
-[2025-01-25 12:53:47,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4632.91 | bwd_inner: 4627.82 | bwd_allreduce: 5.00 | step: 45.75
- 25%|██▌       | 1478/5800 [4:07:17<8:19:10,  6.93s/it]                                                       {'loss': 0.0283, 'grad_norm': 7.646783828735352, 'learning_rate': 3.492798311855617e-05, 'epoch': 12.74}
- 25%|██▌       | 1478/5800 [4:07:17<8:19:10,  6.93s/it]score1 tensor([[0.4102],
-        [0.4688],
-        [0.4473],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.4316, 0.4453, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:53:53,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 12:53:53,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4636.98 | bwd_inner_microstep: 4631.47 | bwd_allreduce_microstep: 5.40 | step_microstep: 44.63
-[2025-01-25 12:53:53,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4637.00 | bwd_inner: 4631.47 | bwd_allreduce: 5.47 | step: 44.64
- 26%|██▌       | 1479/5800 [4:07:23<8:18:44,  6.93s/it]                                                       {'loss': 0.0215, 'grad_norm': 3.503214120864868, 'learning_rate': 3.4920548422864926e-05, 'epoch': 12.75}
- 26%|██▌       | 1479/5800 [4:07:23<8:18:44,  6.93s/it]score1 tensor([[0.5000],
-        [0.4258],
-        [0.3984],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5430, 0.4941, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0825, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:00,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 12:54:00,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.18 | bwd_microstep: 4642.36 | bwd_inner_microstep: 4636.94 | bwd_allreduce_microstep: 5.33 | step_microstep: 45.86
-[2025-01-25 12:54:00,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.15 | bwd: 4642.38 | bwd_inner: 4636.94 | bwd_allreduce: 5.38 | step: 45.87
- 26%|██▌       | 1480/5800 [4:07:30<8:18:42,  6.93s/it]                                                       {'loss': 0.0825, 'grad_norm': 7.245509147644043, 'learning_rate': 3.491310907468779e-05, 'epoch': 12.76}
- 26%|██▌       | 1480/5800 [4:07:30<8:18:42,  6.93s/it]score1 tensor([[0.5977],
-        [0.4531],
-        [0.4961],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4961, 0.5430, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:07,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 12:54:07,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.36 | bwd_microstep: 4635.19 | bwd_inner_microstep: 4626.67 | bwd_allreduce_microstep: 8.43 | step_microstep: 43.01
-[2025-01-25 12:54:07,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.31 | bwd: 4635.21 | bwd_inner: 4626.67 | bwd_allreduce: 8.47 | step: 43.01
- 26%|██▌       | 1481/5800 [4:07:37<8:18:27,  6.92s/it]                                                       {'loss': 0.0483, 'grad_norm': 7.38225793838501, 'learning_rate': 3.490566507634447e-05, 'epoch': 12.77}
- 26%|██▌       | 1481/5800 [4:07:37<8:18:27,  6.92s/it]score1 tensor([[0.4043],
-        [0.3730],
-        [0.4023],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4805, 0.4688, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1030, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:14,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 12:54:14,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4631.25 | bwd_inner_microstep: 4626.46 | bwd_allreduce_microstep: 4.70 | step_microstep: 46.34
-[2025-01-25 12:54:14,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.28 | bwd: 4631.28 | bwd_inner: 4626.46 | bwd_allreduce: 4.75 | step: 46.38
- 26%|██▌       | 1482/5800 [4:07:44<8:18:05,  6.92s/it]                                                       {'loss': 0.103, 'grad_norm': 6.609169960021973, 'learning_rate': 3.4898216430156156e-05, 'epoch': 12.78}
- 26%|██▌       | 1482/5800 [4:07:44<8:18:05,  6.92s/it]score1 tensor([[0.3418],
-        [0.4160],
-        [0.5898],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5352, 0.6445, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:21,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 12:54:21,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.65 | bwd_microstep: 4640.05 | bwd_inner_microstep: 4635.10 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.71
-[2025-01-25 12:54:21,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.55 | bwd: 4640.07 | bwd_inner: 4635.10 | bwd_allreduce: 4.90 | step: 43.72
- 26%|██▌       | 1483/5800 [4:07:51<8:18:12,  6.92s/it]                                                       {'loss': 0.0781, 'grad_norm': 6.905087947845459, 'learning_rate': 3.489076313844544e-05, 'epoch': 12.78}
- 26%|██▌       | 1483/5800 [4:07:51<8:18:12,  6.92s/it]score1 tensor([[0.5391],
-        [0.3848],
-        [0.5703],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4785, 0.6445, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:28,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.38 | optimizer_step: 4.37
-[2025-01-25 12:54:28,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.50 | bwd_microstep: 4642.80 | bwd_inner_microstep: 4634.13 | bwd_allreduce_microstep: 8.42 | step_microstep: 69.52
-[2025-01-25 12:54:28,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.46 | bwd: 4642.87 | bwd_inner: 4634.13 | bwd_allreduce: 8.56 | step: 69.50
- 26%|██▌       | 1484/5800 [4:07:58<8:18:43,  6.93s/it]                                                       {'loss': 0.0562, 'grad_norm': 7.134604454040527, 'learning_rate': 3.488330520353641e-05, 'epoch': 12.79}
- 26%|██▌       | 1484/5800 [4:07:58<8:18:43,  6.93s/it]score1 tensor([[0.4082],
-        [0.4004],
-        [0.3828],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4395, 0.3457, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:35,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 12:54:35,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.99 | bwd_microstep: 4640.69 | bwd_inner_microstep: 4635.32 | bwd_allreduce_microstep: 5.28 | step_microstep: 43.83
-[2025-01-25 12:54:35,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.93 | bwd: 4640.72 | bwd_inner: 4635.32 | bwd_allreduce: 5.32 | step: 43.84
- 26%|██▌       | 1485/5800 [4:08:05<8:18:56,  6.94s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.4344043731689453, 'learning_rate': 3.4875842627754566e-05, 'epoch': 12.8}
- 26%|██▌       | 1485/5800 [4:08:05<8:18:56,  6.94s/it]score1 tensor([[0.4258],
-        [0.6055],
-        [0.3828],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.7070, 0.4453, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:42,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 12:54:42,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.57 | bwd_microstep: 4631.04 | bwd_inner_microstep: 4625.85 | bwd_allreduce_microstep: 5.06 | step_microstep: 44.33
-[2025-01-25 12:54:42,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.52 | bwd: 4631.06 | bwd_inner: 4625.86 | bwd_allreduce: 5.10 | step: 44.33
- 26%|██▌       | 1486/5800 [4:08:12<8:18:10,  6.93s/it]                                                       {'loss': 0.0596, 'grad_norm': 7.08991003036499, 'learning_rate': 3.486837541342688e-05, 'epoch': 12.81}
- 26%|██▌       | 1486/5800 [4:08:12<8:18:10,  6.93s/it]score1 tensor([[0.4434],
-        [0.3613],
-        [0.4141],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.3223, 0.4258, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:49,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 12:54:49,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.81 | bwd_microstep: 4639.10 | bwd_inner_microstep: 4633.54 | bwd_allreduce_microstep: 5.46 | step_microstep: 47.82
-[2025-01-25 12:54:49,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.78 | bwd: 4639.14 | bwd_inner: 4633.54 | bwd_allreduce: 5.51 | step: 47.83
- 26%|██▌       | 1487/5800 [4:08:19<8:17:51,  6.93s/it]                                                       {'loss': 0.0303, 'grad_norm': 0.6076960563659668, 'learning_rate': 3.4860903562881744e-05, 'epoch': 12.82}
- 26%|██▌       | 1487/5800 [4:08:19<8:17:51,  6.93s/it]score1 tensor([[0.4453],
-        [0.4238],
-        [0.5312],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3945, 0.5391, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:54:56,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 12:54:56,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.22 | bwd_microstep: 4627.46 | bwd_inner_microstep: 4622.67 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.94
-[2025-01-25 12:54:56,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.19 | bwd: 4627.48 | bwd_inner: 4622.67 | bwd_allreduce: 4.73 | step: 42.95
- 26%|██▌       | 1488/5800 [4:08:26<8:17:19,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.3952939212322235, 'learning_rate': 3.4853427078449015e-05, 'epoch': 12.83}
- 26%|██▌       | 1488/5800 [4:08:26<8:17:19,  6.92s/it]score1 tensor([[0.4590],
-        [0.4707],
-        [0.6875],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.4023, 0.6133, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:03,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 12:55:03,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.57 | bwd_microstep: 4641.00 | bwd_inner_microstep: 4635.23 | bwd_allreduce_microstep: 5.65 | step_microstep: 46.99
-[2025-01-25 12:55:03,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.53 | bwd: 4641.02 | bwd_inner: 4635.23 | bwd_allreduce: 5.72 | step: 47.00
- 26%|██▌       | 1489/5800 [4:08:33<8:17:18,  6.92s/it]                                                       {'loss': 0.0737, 'grad_norm': 7.652723789215088, 'learning_rate': 3.484594596246e-05, 'epoch': 12.84}
- 26%|██▌       | 1489/5800 [4:08:33<8:17:18,  6.92s/it]score1 tensor([[0.5469],
-        [0.6289],
-        [0.5078],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5898, 0.4844, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:10,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 12:55:10,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.21 | bwd_microstep: 4639.38 | bwd_inner_microstep: 4634.72 | bwd_allreduce_microstep: 4.55 | step_microstep: 44.44
-[2025-01-25 12:55:10,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4639.40 | bwd_inner: 4634.72 | bwd_allreduce: 4.61 | step: 44.45
- 26%|██▌       | 1490/5800 [4:08:40<8:17:03,  6.92s/it]                                                       {'loss': 0.0352, 'grad_norm': 7.818440914154053, 'learning_rate': 3.483846021724743e-05, 'epoch': 12.84}
- 26%|██▌       | 1490/5800 [4:08:40<8:17:03,  6.92s/it]score1 tensor([[0.5273],
-        [0.6953],
-        [0.4434],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6445, 0.3340, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:17,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 12:55:17,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.00 | bwd_microstep: 4630.89 | bwd_inner_microstep: 4625.79 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.70
-[2025-01-25 12:55:17,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.97 | bwd: 4630.91 | bwd_inner: 4625.79 | bwd_allreduce: 5.04 | step: 43.71
- 26%|██▌       | 1491/5800 [4:08:47<8:16:49,  6.92s/it]                                                       {'loss': 0.0752, 'grad_norm': 7.474926948547363, 'learning_rate': 3.48309698451455e-05, 'epoch': 12.85}
- 26%|██▌       | 1491/5800 [4:08:47<8:16:49,  6.92s/it]score1 tensor([[0.5586],
-        [0.6641],
-        [0.5117],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6016, 0.4336, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:23,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.36
-[2025-01-25 12:55:23,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.95 | bwd_microstep: 4632.59 | bwd_inner_microstep: 4624.07 | bwd_allreduce_microstep: 8.43 | step_microstep: 56.60
-[2025-01-25 12:55:23,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.91 | bwd: 4632.64 | bwd_inner: 4624.07 | bwd_allreduce: 8.46 | step: 56.59
- 26%|██▌       | 1492/5800 [4:08:53<8:17:05,  6.92s/it]                                                       {'loss': 0.0742, 'grad_norm': 7.620026111602783, 'learning_rate': 3.482347484848982e-05, 'epoch': 12.86}
- 26%|██▌       | 1492/5800 [4:08:53<8:17:05,  6.92s/it]score1 tensor([[0.5195],
-        [0.5742],
-        [0.4746],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4980, 0.4375, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0576, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:30,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 12:55:30,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.06 | bwd_microstep: 4632.07 | bwd_inner_microstep: 4625.10 | bwd_allreduce_microstep: 6.71 | step_microstep: 57.44
-[2025-01-25 12:55:30,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.02 | bwd: 4632.13 | bwd_inner: 4625.10 | bwd_allreduce: 6.84 | step: 57.49
- 26%|██▌       | 1493/5800 [4:09:00<8:17:34,  6.93s/it]                                                       {'loss': 0.0576, 'grad_norm': 3.4965622425079346, 'learning_rate': 3.4815975229617484e-05, 'epoch': 12.87}
- 26%|██▌       | 1493/5800 [4:09:00<8:17:34,  6.93s/it]score1 tensor([[0.4707],
-        [0.5547],
-        [0.5312],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.6055, 0.4648, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:37,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.30 | optimizer_step: 4.37
-[2025-01-25 12:55:37,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2174.42 | bwd_microstep: 4632.77 | bwd_inner_microstep: 4625.15 | bwd_allreduce_microstep: 7.44 | step_microstep: 76.39
-[2025-01-25 12:55:37,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2174.37 | bwd: 4632.83 | bwd_inner: 4625.15 | bwd_allreduce: 7.54 | step: 76.38
- 26%|██▌       | 1494/5800 [4:09:07<8:18:08,  6.94s/it]                                                       {'loss': 0.0513, 'grad_norm': 3.446420907974243, 'learning_rate': 3.4808470990867e-05, 'epoch': 12.88}
- 26%|██▌       | 1494/5800 [4:09:07<8:18:08,  6.94s/it]score1 tensor([[0.5195],
-        [0.4746],
-        [0.4883],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4473, 0.5312, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:44,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 12:55:44,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.48 | bwd_microstep: 4627.75 | bwd_inner_microstep: 4622.70 | bwd_allreduce_microstep: 4.95 | step_microstep: 48.34
-[2025-01-25 12:55:44,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.43 | bwd: 4627.77 | bwd_inner: 4622.70 | bwd_allreduce: 5.00 | step: 48.34
- 26%|██▌       | 1495/5800 [4:09:14<8:17:36,  6.94s/it]                                                       {'loss': 0.0332, 'grad_norm': 3.725391149520874, 'learning_rate': 3.480096213457829e-05, 'epoch': 12.89}
- 26%|██▌       | 1495/5800 [4:09:14<8:17:36,  6.94s/it]score1 tensor([[0.4648],
-        [0.4102],
-        [0.3711],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3438, 0.4180, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:51,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 12:55:51,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.98 | bwd_microstep: 4638.06 | bwd_inner_microstep: 4633.24 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.77
-[2025-01-25 12:55:51,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.94 | bwd: 4638.08 | bwd_inner: 4633.24 | bwd_allreduce: 4.77 | step: 42.78
- 26%|██▌       | 1496/5800 [4:09:21<8:17:07,  6.93s/it]                                                       {'loss': 0.0342, 'grad_norm': 0.5797321796417236, 'learning_rate': 3.4793448663092786e-05, 'epoch': 12.9}
- 26%|██▌       | 1496/5800 [4:09:21<8:17:07,  6.93s/it]score1 tensor([[0.4062],
-        [0.4062],
-        [0.5000],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.3887, 0.6289, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:55:58,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 12:55:58,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4631.03 | bwd_inner_microstep: 4625.96 | bwd_allreduce_microstep: 4.93 | step_microstep: 48.28
-[2025-01-25 12:55:58,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4631.05 | bwd_inner: 4625.96 | bwd_allreduce: 5.00 | step: 48.29
- 26%|██▌       | 1497/5800 [4:09:28<8:16:37,  6.92s/it]                                                       {'loss': 0.061, 'grad_norm': 3.391731023788452, 'learning_rate': 3.478593057875329e-05, 'epoch': 12.91}
- 26%|██▌       | 1497/5800 [4:09:28<8:16:37,  6.92s/it]score1 tensor([[0.4668],
-        [0.4219],
-        [0.4512],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4238, 0.4844, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:05,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:56:05,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.80 | bwd_microstep: 4637.46 | bwd_inner_microstep: 4632.83 | bwd_allreduce_microstep: 4.55 | step_microstep: 43.98
-[2025-01-25 12:56:05,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.77 | bwd: 4637.48 | bwd_inner: 4632.83 | bwd_allreduce: 4.59 | step: 43.98
- 26%|██▌       | 1498/5800 [4:09:35<8:16:19,  6.92s/it]                                                       {'loss': 0.0259, 'grad_norm': 6.743212699890137, 'learning_rate': 3.4778407883904086e-05, 'epoch': 12.91}
- 26%|██▌       | 1498/5800 [4:09:35<8:16:19,  6.92s/it]score1 tensor([[0.4727],
-        [0.4180],
-        [0.4688],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4512, 0.4863, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:12,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 12:56:12,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.51 | bwd_microstep: 4639.51 | bwd_inner_microstep: 4634.92 | bwd_allreduce_microstep: 4.51 | step_microstep: 49.94
-[2025-01-25 12:56:12,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.47 | bwd: 4639.53 | bwd_inner: 4634.92 | bwd_allreduce: 4.55 | step: 49.94
- 26%|██▌       | 1499/5800 [4:09:42<8:19:20,  6.97s/it]                                                       {'loss': 0.0347, 'grad_norm': 6.907049655914307, 'learning_rate': 3.477088058089087e-05, 'epoch': 12.92}
- 26%|██▌       | 1499/5800 [4:09:42<8:19:20,  6.97s/it]score1 tensor([[0.4453],
-        [0.4863],
-        [0.5195],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4609, 0.5312, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:19,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 12:56:19,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.13 | bwd_microstep: 4635.24 | bwd_inner_microstep: 4630.29 | bwd_allreduce_microstep: 4.87 | step_microstep: 47.35
-[2025-01-25 12:56:19,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.09 | bwd: 4635.26 | bwd_inner: 4630.30 | bwd_allreduce: 4.90 | step: 47.36
- 26%|██▌       | 1500/5800 [4:09:49<8:18:19,  6.95s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.3902988135814667, 'learning_rate': 3.47633486720608e-05, 'epoch': 12.93}
- 26%|██▌       | 1500/5800 [4:09:49<8:18:19,  6.95s/it]score1 tensor([[0.5859],
-        [0.4902],
-        [0.4512],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5000, 0.4121, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:26,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 12:56:26,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4635.32 | bwd_inner_microstep: 4630.46 | bwd_allreduce_microstep: 4.77 | step_microstep: 47.73
-[2025-01-25 12:56:26,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.45 | bwd: 4635.34 | bwd_inner: 4630.46 | bwd_allreduce: 4.81 | step: 47.72
- 26%|██▌       | 1501/5800 [4:09:56<8:17:39,  6.95s/it]                                                       {'loss': 0.0161, 'grad_norm': 5.290223598480225, 'learning_rate': 3.4755812159762445e-05, 'epoch': 12.94}
- 26%|██▌       | 1501/5800 [4:09:56<8:17:39,  6.95s/it]score1 tensor([[0.5234],
-        [0.6133],
-        [0.4805],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5508, 0.4121, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:33,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 12:56:33,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.73 | bwd_microstep: 4631.34 | bwd_inner_microstep: 4625.54 | bwd_allreduce_microstep: 5.69 | step_microstep: 49.80
-[2025-01-25 12:56:33,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4631.37 | bwd_inner: 4625.54 | bwd_allreduce: 5.75 | step: 49.81
- 26%|██▌       | 1502/5800 [4:10:03<8:17:08,  6.94s/it]                                                       {'loss': 0.0537, 'grad_norm': 0.7405180931091309, 'learning_rate': 3.474827104634582e-05, 'epoch': 12.95}
- 26%|██▌       | 1502/5800 [4:10:03<8:17:08,  6.94s/it]score1 tensor([[0.6562],
-        [0.4531],
-        [0.5352],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.3730, 0.5391, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:40,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 12:56:40,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.28 | bwd_microstep: 4633.30 | bwd_inner_microstep: 4628.57 | bwd_allreduce_microstep: 4.63 | step_microstep: 74.04
-[2025-01-25 12:56:40,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.11 | bwd: 4633.33 | bwd_inner: 4628.57 | bwd_allreduce: 4.68 | step: 74.05
- 26%|██▌       | 1503/5800 [4:10:10<8:17:26,  6.95s/it]                                                       {'loss': 0.0269, 'grad_norm': 3.86045503616333, 'learning_rate': 3.4740725334162385e-05, 'epoch': 12.96}
- 26%|██▌       | 1503/5800 [4:10:10<8:17:26,  6.95s/it]score1 tensor([[0.5430],
-        [0.6094],
-        [0.5703],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5742, 0.5508, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:47,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 12:56:47,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.76 | bwd_microstep: 4630.30 | bwd_inner_microstep: 4623.40 | bwd_allreduce_microstep: 6.69 | step_microstep: 45.46
-[2025-01-25 12:56:47,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4630.36 | bwd_inner: 4623.40 | bwd_allreduce: 6.79 | step: 45.42
- 26%|██▌       | 1504/5800 [4:10:17<8:16:41,  6.94s/it]                                                       {'loss': 0.0439, 'grad_norm': 7.719003677368164, 'learning_rate': 3.4733175025565006e-05, 'epoch': 12.97}
- 26%|██▌       | 1504/5800 [4:10:17<8:16:41,  6.94s/it]score1 tensor([[0.5469],
-        [0.5234],
-        [0.5195],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4883, 0.5352, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:56:54,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.62 | optimizer_step: 4.36
-[2025-01-25 12:56:54,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.24 | bwd_microstep: 4629.77 | bwd_inner_microstep: 4622.78 | bwd_allreduce_microstep: 6.82 | step_microstep: 83.83
-[2025-01-25 12:56:54,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.19 | bwd: 4629.81 | bwd_inner: 4622.78 | bwd_allreduce: 6.91 | step: 83.82
- 26%|██▌       | 1505/5800 [4:10:24<8:16:48,  6.94s/it]                                                       {'loss': 0.0327, 'grad_norm': 3.821101665496826, 'learning_rate': 3.472562012290802e-05, 'epoch': 12.97}
- 26%|██▌       | 1505/5800 [4:10:24<8:16:48,  6.94s/it]score1 tensor([[0.4375],
-        [0.6016],
-        [0.4453],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.6367, 0.4727, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:57:01,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 12:57:01,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.41 | bwd_microstep: 4636.00 | bwd_inner_microstep: 4631.52 | bwd_allreduce_microstep: 4.41 | step_microstep: 42.53
-[2025-01-25 12:57:01,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.38 | bwd: 4636.02 | bwd_inner: 4631.52 | bwd_allreduce: 4.44 | step: 42.54
- 26%|██▌       | 1506/5800 [4:10:31<8:15:51,  6.93s/it]                                                       {'loss': 0.0215, 'grad_norm': 3.587665557861328, 'learning_rate': 3.471806062854716e-05, 'epoch': 12.98}
- 26%|██▌       | 1506/5800 [4:10:31<8:15:51,  6.93s/it]score1 tensor([[0.5078],
-        [0.4805],
-        [0.6094],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5117, 0.6445, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:57:08,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 12:57:08,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.79 | bwd_microstep: 4633.62 | bwd_inner_microstep: 4629.07 | bwd_allreduce_microstep: 4.45 | step_microstep: 45.52
-[2025-01-25 12:57:08,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.76 | bwd: 4633.64 | bwd_inner: 4629.07 | bwd_allreduce: 4.50 | step: 45.53
- 26%|██▌       | 1507/5800 [4:10:38<8:15:21,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 7.375545501708984, 'learning_rate': 3.471049654483962e-05, 'epoch': 12.99}
- 26%|██▌       | 1507/5800 [4:10:38<8:15:21,  6.92s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:57:13,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 12:57:13,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 572.68 | bwd_microstep: 1220.06 | bwd_inner_microstep: 1215.55 | bwd_allreduce_microstep: 4.42 | step_microstep: 42.79
-[2025-01-25 12:57:13,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 572.65 | bwd: 1220.08 | bwd_inner: 1215.55 | bwd_allreduce: 4.47 | step: 42.80
- 26%|██▌       | 1508/5800 [4:10:43<7:34:24,  6.35s/it]                                                       {'loss': 0.0156, 'grad_norm': 7.5158562660217285, 'learning_rate': 3.4702927874144015e-05, 'epoch': 13.0}
- 26%|██▌       | 1508/5800 [4:10:43<7:34:24,  6.35s/it][2025-01-25 12:57:17,837] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:57:28,585] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:57:39,251] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 12:57:49,821] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5312],
-        [0.4766],
-        [0.4238],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4961, 0.3516, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:04,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 12:58:04,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.22 | bwd_microstep: 4603.46 | bwd_inner_microstep: 4597.51 | bwd_allreduce_microstep: 5.78 | step_microstep: 55.59
-[2025-01-25 12:58:04,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.18 | bwd: 4603.51 | bwd_inner: 4597.51 | bwd_allreduce: 5.87 | step: 55.57
- 26%|██▌       | 1509/5800 [4:11:34<23:41:56, 19.88s/it]                                                        {'loss': 0.0332, 'grad_norm': 3.3863883018493652, 'learning_rate': 3.469535461882037e-05, 'epoch': 13.01}
- 26%|██▌       | 1509/5800 [4:11:34<23:41:56, 19.88s/it]score1 tensor([[0.4961],
-        [0.5156],
-        [0.4727],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4980, 0.4629, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:11,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 12:58:11,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.21 | bwd_microstep: 4592.72 | bwd_inner_microstep: 4588.44 | bwd_allreduce_microstep: 4.20 | step_microstep: 56.51
-[2025-01-25 12:58:11,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.17 | bwd: 4592.74 | bwd_inner: 4588.44 | bwd_allreduce: 4.24 | step: 56.56
- 26%|██▌       | 1510/5800 [4:11:41<19:02:54, 15.98s/it]                                                        {'loss': 0.0244, 'grad_norm': 3.3675241470336914, 'learning_rate': 3.468777678123017e-05, 'epoch': 13.02}
- 26%|██▌       | 1510/5800 [4:11:41<19:02:54, 15.98s/it]score1 tensor([[0.5078],
-        [0.4062],
-        [0.5625],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3613, 0.5508, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:18,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 12:58:18,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.20 | bwd_microstep: 4605.52 | bwd_inner_microstep: 4600.81 | bwd_allreduce_microstep: 4.62 | step_microstep: 46.69
-[2025-01-25 12:58:18,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.17 | bwd: 4605.54 | bwd_inner: 4600.81 | bwd_allreduce: 4.67 | step: 46.70
- 26%|██▌       | 1511/5800 [4:11:48<15:47:34, 13.26s/it]                                                        {'loss': 0.0181, 'grad_norm': 3.6560490131378174, 'learning_rate': 3.4680194363736314e-05, 'epoch': 13.03}
- 26%|██▌       | 1511/5800 [4:11:48<15:47:34, 13.26s/it]score1 tensor([[0.4316],
-        [0.4160],
-        [0.4531],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4004, 0.4688, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:25,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 12:58:25,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.82 | bwd_microstep: 4612.49 | bwd_inner_microstep: 4608.00 | bwd_allreduce_microstep: 4.39 | step_microstep: 42.33
-[2025-01-25 12:58:25,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.75 | bwd: 4612.51 | bwd_inner: 4608.00 | bwd_allreduce: 4.44 | step: 42.34
- 26%|██▌       | 1512/5800 [4:11:55<13:30:38, 11.34s/it]                                                        {'loss': 0.0171, 'grad_norm': 3.6422922611236572, 'learning_rate': 3.467260736870314e-05, 'epoch': 13.03}
- 26%|██▌       | 1512/5800 [4:11:55<13:30:38, 11.34s/it]score1 tensor([[0.5156],
-        [0.4785],
-        [0.4238],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4570, 0.3457, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:32,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.38 | optimizer_step: 4.37
-[2025-01-25 12:58:32,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.73 | bwd_microstep: 4613.72 | bwd_inner_microstep: 4606.09 | bwd_allreduce_microstep: 7.32 | step_microstep: 52.58
-[2025-01-25 12:58:32,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.70 | bwd: 4613.78 | bwd_inner: 4606.09 | bwd_allreduce: 7.47 | step: 52.57
- 26%|██▌       | 1513/5800 [4:12:02<11:55:03, 10.01s/it]                                                        {'loss': 0.043, 'grad_norm': 3.1361377239227295, 'learning_rate': 3.466501579849639e-05, 'epoch': 13.04}
- 26%|██▌       | 1513/5800 [4:12:02<11:55:03, 10.01s/it]score1 tensor([[0.4727],
-        [0.5234],
-        [0.4824],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5391, 0.5000, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:38,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 12:58:38,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.69 | bwd_microstep: 4619.72 | bwd_inner_microstep: 4615.16 | bwd_allreduce_microstep: 4.48 | step_microstep: 40.68
-[2025-01-25 12:58:38,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.65 | bwd: 4619.74 | bwd_inner: 4615.16 | bwd_allreduce: 4.52 | step: 40.68
- 26%|██▌       | 1514/5800 [4:12:08<10:47:52,  9.07s/it]                                                        {'loss': 0.022, 'grad_norm': 7.122725963592529, 'learning_rate': 3.465741965548325e-05, 'epoch': 13.05}
- 26%|██▌       | 1514/5800 [4:12:08<10:47:52,  9.07s/it]score1 tensor([[0.5195],
-        [0.4199],
-        [0.5195],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4199, 0.5391, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:45,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 12:58:45,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.94 | bwd_microstep: 4560.80 | bwd_inner_microstep: 4556.30 | bwd_allreduce_microstep: 4.42 | step_microstep: 39.86
-[2025-01-25 12:58:45,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.91 | bwd: 4560.83 | bwd_inner: 4556.30 | bwd_allreduce: 4.46 | step: 39.87
- 26%|██▌       | 1515/5800 [4:12:15<9:59:41,  8.40s/it]                                                        {'loss': 0.0117, 'grad_norm': 5.717442035675049, 'learning_rate': 3.4649818942032345e-05, 'epoch': 13.06}
- 26%|██▌       | 1515/5800 [4:12:15<9:59:41,  8.40s/it]score1 tensor([[0.5039],
-        [0.4805],
-        [0.4121],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5273, 0.3984, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:52,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 12:58:52,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.30 | bwd_microstep: 4610.56 | bwd_inner_microstep: 4606.06 | bwd_allreduce_microstep: 4.41 | step_microstep: 41.89
-[2025-01-25 12:58:52,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.26 | bwd: 4610.59 | bwd_inner: 4606.06 | bwd_allreduce: 4.46 | step: 41.89
- 26%|██▌       | 1516/5800 [4:12:22<9:27:06,  7.94s/it]                                                       {'loss': 0.0386, 'grad_norm': 0.5068907737731934, 'learning_rate': 3.464221366051369e-05, 'epoch': 13.07}
- 26%|██▌       | 1516/5800 [4:12:22<9:27:06,  7.94s/it]score1 tensor([[0.4141],
-        [0.4922],
-        [0.5156],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4883, 0.6406, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:58:59,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 12:58:59,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.15 | bwd_microstep: 4617.00 | bwd_inner_microstep: 4612.71 | bwd_allreduce_microstep: 4.22 | step_microstep: 39.74
-[2025-01-25 12:58:59,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.12 | bwd: 4617.02 | bwd_inner: 4612.71 | bwd_allreduce: 4.25 | step: 39.75
- 26%|██▌       | 1517/5800 [4:12:29<9:04:25,  7.63s/it]                                                       {'loss': 0.0488, 'grad_norm': 3.266650438308716, 'learning_rate': 3.463460381329875e-05, 'epoch': 13.08}
- 26%|██▌       | 1517/5800 [4:12:29<9:04:25,  7.63s/it]score1 tensor([[0.5156],
-        [0.4590],
-        [0.4297],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4648, 0.4043, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:06,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 12:59:06,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.28 | bwd_microstep: 4621.14 | bwd_inner_microstep: 4616.34 | bwd_allreduce_microstep: 4.69 | step_microstep: 44.39
-[2025-01-25 12:59:06,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.25 | bwd: 4621.16 | bwd_inner: 4616.34 | bwd_allreduce: 4.75 | step: 44.39
- 26%|██▌       | 1518/5800 [4:12:36<8:48:47,  7.41s/it]                                                       {'loss': 0.0156, 'grad_norm': 3.3918683528900146, 'learning_rate': 3.462698940276041e-05, 'epoch': 13.09}
- 26%|██▌       | 1518/5800 [4:12:36<8:48:47,  7.41s/it]score1 tensor([[0.5078],
-        [0.5117],
-        [0.5625],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5586, 0.5547, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:13,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.67 | optimizer_step: 4.37
-[2025-01-25 12:59:13,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.71 | bwd_microstep: 4620.40 | bwd_inner_microstep: 4615.49 | bwd_allreduce_microstep: 4.81 | step_microstep: 62.01
-[2025-01-25 12:59:13,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.68 | bwd: 4620.42 | bwd_inner: 4615.49 | bwd_allreduce: 4.86 | step: 62.01
- 26%|██▌       | 1519/5800 [4:12:43<8:38:13,  7.26s/it]                                                       {'loss': 0.021, 'grad_norm': 0.456207811832428, 'learning_rate': 3.461937043127297e-05, 'epoch': 13.09}
- 26%|██▌       | 1519/5800 [4:12:43<8:38:13,  7.26s/it]score1 tensor([[0.5195],
-        [0.4766],
-        [0.4062],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4785, 0.3418, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:20,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 8.04
-[2025-01-25 12:59:20,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.30 | bwd_microstep: 4574.47 | bwd_inner_microstep: 4566.61 | bwd_allreduce_microstep: 7.66 | step_microstep: 70.45
-[2025-01-25 12:59:20,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.26 | bwd: 4574.52 | bwd_inner: 4566.61 | bwd_allreduce: 7.76 | step: 70.47
- 26%|██▌       | 1520/5800 [4:12:50<8:30:11,  7.15s/it]                                                       {'loss': 0.0312, 'grad_norm': 2.0602478981018066, 'learning_rate': 3.461174690121217e-05, 'epoch': 13.1}
- 26%|██▌       | 1520/5800 [4:12:50<8:30:11,  7.15s/it]score1 tensor([[0.4473],
-        [0.4531],
-        [0.4746],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4121, 0.4434, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:27,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.12 | optimizer_step: 4.80
-[2025-01-25 12:59:27,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.10 | bwd_microstep: 4617.27 | bwd_inner_microstep: 4612.39 | bwd_allreduce_microstep: 4.77 | step_microstep: 58.60
-[2025-01-25 12:59:27,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.06 | bwd: 4617.30 | bwd_inner: 4612.39 | bwd_allreduce: 4.82 | step: 58.61
- 26%|██▌       | 1521/5800 [4:12:57<8:25:29,  7.09s/it]                                                       {'loss': 0.0312, 'grad_norm': 3.1832456588745117, 'learning_rate': 3.460411881495515e-05, 'epoch': 13.11}
- 26%|██▌       | 1521/5800 [4:12:57<8:25:29,  7.09s/it]score1 tensor([[0.4727],
-        [0.5547],
-        [0.5273],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5781, 0.4668, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:34,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 12:59:34,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.70 | bwd_microstep: 4622.29 | bwd_inner_microstep: 4617.20 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.36
-[2025-01-25 12:59:34,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.67 | bwd: 4622.31 | bwd_inner: 4617.20 | bwd_allreduce: 5.04 | step: 42.37
- 26%|██▌       | 1522/5800 [4:13:04<8:21:26,  7.03s/it]                                                       {'loss': 0.0234, 'grad_norm': 0.8036560416221619, 'learning_rate': 3.459648617488047e-05, 'epoch': 13.12}
- 26%|██▌       | 1522/5800 [4:13:04<8:21:26,  7.03s/it]score1 tensor([[0.5781],
-        [0.4707],
-        [0.4785],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4492, 0.4941, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:41,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 12:59:41,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.11 | bwd_microstep: 4618.19 | bwd_inner_microstep: 4613.60 | bwd_allreduce_microstep: 4.51 | step_microstep: 44.61
-[2025-01-25 12:59:41,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.06 | bwd: 4618.22 | bwd_inner: 4613.60 | bwd_allreduce: 4.55 | step: 44.61
- 26%|██▋       | 1523/5800 [4:13:10<8:18:22,  6.99s/it]                                                       {'loss': 0.0337, 'grad_norm': 0.4591348171234131, 'learning_rate': 3.458884898336814e-05, 'epoch': 13.13}
- 26%|██▋       | 1523/5800 [4:13:11<8:18:22,  6.99s/it]score1 tensor([[0.5508],
-        [0.5234],
-        [0.5586],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5352, 0.5625, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:47,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 12:59:47,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.93 | bwd_microstep: 4629.10 | bwd_inner_microstep: 4624.61 | bwd_allreduce_microstep: 4.38 | step_microstep: 42.90
-[2025-01-25 12:59:47,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.90 | bwd: 4629.12 | bwd_inner: 4624.62 | bwd_allreduce: 4.43 | step: 42.91
- 26%|██▋       | 1524/5800 [4:13:17<8:16:35,  6.97s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.48346662521362305, 'learning_rate': 3.4581207242799554e-05, 'epoch': 13.14}
- 26%|██▋       | 1524/5800 [4:13:17<8:16:35,  6.97s/it]score1 tensor([[0.4199],
-        [0.4004],
-        [0.4961],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.3887, 0.4648, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 12:59:54,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 12:59:54,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.66 | bwd_microstep: 4619.85 | bwd_inner_microstep: 4615.59 | bwd_allreduce_microstep: 4.17 | step_microstep: 36.98
-[2025-01-25 12:59:54,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.63 | bwd: 4619.88 | bwd_inner: 4615.59 | bwd_allreduce: 4.22 | step: 36.99
- 26%|██▋       | 1525/5800 [4:13:24<8:14:47,  6.94s/it]                                                       {'loss': 0.0298, 'grad_norm': 6.809762477874756, 'learning_rate': 3.457356095555754e-05, 'epoch': 13.15}
- 26%|██▋       | 1525/5800 [4:13:24<8:14:47,  6.94s/it]score1 tensor([[0.5820],
-        [0.5742],
-        [0.5703],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5625, 0.5430, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:01,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 13:00:01,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.93 | bwd_microstep: 4617.91 | bwd_inner_microstep: 4613.64 | bwd_allreduce_microstep: 4.18 | step_microstep: 44.12
-[2025-01-25 13:00:01,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.91 | bwd: 4617.94 | bwd_inner: 4613.64 | bwd_allreduce: 4.22 | step: 44.13
- 26%|██▋       | 1526/5800 [4:13:31<8:13:39,  6.93s/it]                                                       {'loss': 0.0254, 'grad_norm': 7.9617509841918945, 'learning_rate': 3.456591012402635e-05, 'epoch': 13.16}
- 26%|██▋       | 1526/5800 [4:13:31<8:13:39,  6.93s/it]score1 tensor([[0.5117],
-        [0.4727],
-        [0.5000],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5234, 0.4863, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:08,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 13:00:08,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.91 | bwd_microstep: 4622.45 | bwd_inner_microstep: 4618.27 | bwd_allreduce_microstep: 4.12 | step_microstep: 43.91
-[2025-01-25 13:00:08,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.81 | bwd: 4622.47 | bwd_inner: 4618.27 | bwd_allreduce: 4.15 | step: 43.92
- 26%|██▋       | 1527/5800 [4:13:38<8:12:54,  6.92s/it]                                                       {'loss': 0.0308, 'grad_norm': 0.5977739691734314, 'learning_rate': 3.4558254750591644e-05, 'epoch': 13.16}
- 26%|██▋       | 1527/5800 [4:13:38<8:12:54,  6.92s/it]score1 tensor([[0.6133],
-        [0.5000],
-        [0.4316],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4980, 0.4922, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:15,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 13:00:15,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.48 | bwd_microstep: 4622.87 | bwd_inner_microstep: 4618.60 | bwd_allreduce_microstep: 4.19 | step_microstep: 40.91
-[2025-01-25 13:00:15,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.45 | bwd: 4622.90 | bwd_inner: 4618.60 | bwd_allreduce: 4.22 | step: 40.91
- 26%|██▋       | 1528/5800 [4:13:45<8:12:28,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 0.7586026191711426, 'learning_rate': 3.45505948376405e-05, 'epoch': 13.17}
- 26%|██▋       | 1528/5800 [4:13:45<8:12:28,  6.92s/it]score1 tensor([[0.3750],
-        [0.6172],
-        [0.5547],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5742, 0.5508, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:22,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 13:00:22,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.12 | bwd_microstep: 4631.22 | bwd_inner_microstep: 4626.36 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.53
-[2025-01-25 13:00:22,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.09 | bwd: 4631.27 | bwd_inner: 4626.36 | bwd_allreduce: 4.77 | step: 42.53
- 26%|██▋       | 1529/5800 [4:13:52<8:12:16,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.220570087432861, 'learning_rate': 3.454293038756141e-05, 'epoch': 13.18}
- 26%|██▋       | 1529/5800 [4:13:52<8:12:16,  6.92s/it]score1 tensor([[0.3809],
-        [0.4746],
-        [0.5703],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.5039, 0.6133, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:29,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.58 | optimizer_step: 4.53
-[2025-01-25 13:00:29,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.42 | bwd_microstep: 4631.63 | bwd_inner_microstep: 4622.89 | bwd_allreduce_microstep: 8.62 | step_microstep: 63.73
-[2025-01-25 13:00:29,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.36 | bwd: 4631.65 | bwd_inner: 4622.89 | bwd_allreduce: 8.68 | step: 63.77
- 26%|██▋       | 1530/5800 [4:13:59<8:13:27,  6.93s/it]                                                       {'loss': 0.042, 'grad_norm': 7.2830071449279785, 'learning_rate': 3.453526140274428e-05, 'epoch': 13.19}
- 26%|██▋       | 1530/5800 [4:13:59<8:13:27,  6.93s/it]score1 tensor([[0.4199],
-        [0.3691],
-        [0.3750],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4707, 0.4375, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:36,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 13:00:36,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.29 | bwd_microstep: 4629.17 | bwd_inner_microstep: 4622.97 | bwd_allreduce_microstep: 6.05 | step_microstep: 55.79
-[2025-01-25 13:00:36,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.26 | bwd: 4629.22 | bwd_inner: 4622.97 | bwd_allreduce: 6.12 | step: 55.77
- 26%|██▋       | 1531/5800 [4:14:06<8:13:03,  6.93s/it]                                                       {'loss': 0.0674, 'grad_norm': 6.479985237121582, 'learning_rate': 3.452758788558044e-05, 'epoch': 13.2}
- 26%|██▋       | 1531/5800 [4:14:06<8:13:03,  6.93s/it]score1 tensor([[0.3535],
-        [0.4023],
-        [0.4902],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.5039, 0.5508, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:43,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 13:00:43,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.86 | bwd_microstep: 4621.33 | bwd_inner_microstep: 4616.70 | bwd_allreduce_microstep: 4.54 | step_microstep: 50.69
-[2025-01-25 13:00:43,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.78 | bwd: 4621.36 | bwd_inner: 4616.70 | bwd_allreduce: 4.59 | step: 50.69
- 26%|██▋       | 1532/5800 [4:14:13<8:12:23,  6.92s/it]                                                       {'loss': 0.0591, 'grad_norm': 6.648802757263184, 'learning_rate': 3.451990983846262e-05, 'epoch': 13.21}
- 26%|██▋       | 1532/5800 [4:14:13<8:12:23,  6.92s/it]score1 tensor([[0.5117],
-        [0.4785],
-        [0.4297],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5195, 0.4629, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:50,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 13:00:50,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.21 | bwd_microstep: 4626.61 | bwd_inner_microstep: 4622.03 | bwd_allreduce_microstep: 4.50 | step_microstep: 42.72
-[2025-01-25 13:00:50,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.17 | bwd: 4626.64 | bwd_inner: 4622.02 | bwd_allreduce: 4.54 | step: 42.73
- 26%|██▋       | 1533/5800 [4:14:20<8:11:59,  6.92s/it]                                                       {'loss': 0.0435, 'grad_norm': 6.9254374504089355, 'learning_rate': 3.451222726378497e-05, 'epoch': 13.22}
- 26%|██▋       | 1533/5800 [4:14:20<8:11:59,  6.92s/it]score1 tensor([[0.4102],
-        [0.5820],
-        [0.4492],
-        [0.3516]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5117, 0.4512, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:00:57,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.36
-[2025-01-25 13:00:57,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.08 | bwd_microstep: 4619.35 | bwd_inner_microstep: 4614.94 | bwd_allreduce_microstep: 4.33 | step_microstep: 39.88
-[2025-01-25 13:00:57,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.05 | bwd: 4619.37 | bwd_inner: 4614.94 | bwd_allreduce: 4.36 | step: 39.88
- 26%|██▋       | 1534/5800 [4:14:27<8:11:18,  6.91s/it]                                                       {'loss': 0.0273, 'grad_norm': 0.6584987044334412, 'learning_rate': 3.450454016394305e-05, 'epoch': 13.22}
- 26%|██▋       | 1534/5800 [4:14:27<8:11:18,  6.91s/it]score1 tensor([[0.5859],
-        [0.6211],
-        [0.6875],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5898, 0.6367, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:03,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 13:01:03,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.70 | bwd_microstep: 4574.41 | bwd_inner_microstep: 4570.17 | bwd_allreduce_microstep: 4.15 | step_microstep: 40.22
-[2025-01-25 13:01:03,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.66 | bwd: 4574.44 | bwd_inner: 4570.17 | bwd_allreduce: 4.20 | step: 40.21
- 26%|██▋       | 1535/5800 [4:14:33<8:09:44,  6.89s/it]                                                       {'loss': 0.0312, 'grad_norm': 2.484513521194458, 'learning_rate': 3.449684854133383e-05, 'epoch': 13.23}
- 26%|██▋       | 1535/5800 [4:14:33<8:09:44,  6.89s/it]score1 tensor([[0.6328],
-        [0.5391],
-        [0.5859],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.5117, 0.6211, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:10,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 13:01:10,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.83 | bwd_microstep: 4620.12 | bwd_inner_microstep: 4615.73 | bwd_allreduce_microstep: 4.32 | step_microstep: 41.17
-[2025-01-25 13:01:10,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.78 | bwd: 4620.15 | bwd_inner: 4615.73 | bwd_allreduce: 4.35 | step: 41.18
- 26%|██▋       | 1536/5800 [4:14:40<8:09:41,  6.89s/it]                                                       {'loss': 0.0356, 'grad_norm': 0.9407358765602112, 'learning_rate': 3.4489152398355696e-05, 'epoch': 13.24}
- 26%|██▋       | 1536/5800 [4:14:40<8:09:41,  6.89s/it]score1 tensor([[0.4570],
-        [0.4648],
-        [0.5742],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4531, 0.5195, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:17,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 13:01:17,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.04 | bwd_microstep: 4621.91 | bwd_inner_microstep: 4616.95 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.19
-[2025-01-25 13:01:17,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.00 | bwd: 4621.94 | bwd_inner: 4616.95 | bwd_allreduce: 4.90 | step: 42.19
- 26%|██▋       | 1537/5800 [4:14:47<8:09:51,  6.89s/it]                                                       {'loss': 0.0254, 'grad_norm': 7.101344585418701, 'learning_rate': 3.4481451737408437e-05, 'epoch': 13.25}
- 26%|██▋       | 1537/5800 [4:14:47<8:09:51,  6.89s/it]score1 tensor([[0.5742],
-        [0.4414],
-        [0.5664],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4688, 0.5273, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:24,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 13:01:24,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.41 | bwd_microstep: 4625.29 | bwd_inner_microstep: 4620.67 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.46
-[2025-01-25 13:01:24,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.37 | bwd: 4625.31 | bwd_inner: 4620.68 | bwd_allreduce: 4.56 | step: 41.48
- 27%|██▋       | 1538/5800 [4:14:54<8:09:56,  6.90s/it]                                                       {'loss': 0.0444, 'grad_norm': 4.391573429107666, 'learning_rate': 3.4473746560893245e-05, 'epoch': 13.26}
- 27%|██▋       | 1538/5800 [4:14:54<8:09:56,  6.90s/it]score1 tensor([[0.4902],
-        [0.3848],
-        [0.5430],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.3691, 0.5000, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:31,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.20 | optimizer_step: 4.36
-[2025-01-25 13:01:31,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.28 | bwd_microstep: 4631.48 | bwd_inner_microstep: 4625.44 | bwd_allreduce_microstep: 5.95 | step_microstep: 60.94
-[2025-01-25 13:01:31,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.25 | bwd: 4631.50 | bwd_inner: 4625.44 | bwd_allreduce: 6.00 | step: 60.95
- 27%|██▋       | 1539/5800 [4:15:01<8:10:27,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.5451831817626953, 'learning_rate': 3.446603687121273e-05, 'epoch': 13.27}
- 27%|██▋       | 1539/5800 [4:15:01<8:10:27,  6.91s/it]score1 tensor([[0.6016],
-        [0.5547],
-        [0.5352],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.5781, 0.5156, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:38,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 13:01:38,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.25 | bwd_microstep: 4626.96 | bwd_inner_microstep: 4622.45 | bwd_allreduce_microstep: 4.39 | step_microstep: 41.54
-[2025-01-25 13:01:38,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.22 | bwd: 4626.99 | bwd_inner: 4622.45 | bwd_allreduce: 4.44 | step: 41.54
- 27%|██▋       | 1540/5800 [4:15:08<8:10:25,  6.91s/it]                                                       {'loss': 0.0444, 'grad_norm': 0.6851277351379395, 'learning_rate': 3.445832267077092e-05, 'epoch': 13.28}
- 27%|██▋       | 1540/5800 [4:15:08<8:10:25,  6.91s/it]score1 tensor([[0.4355],
-        [0.5938],
-        [0.3750],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6445, 0.3672, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:45,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 13:01:45,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.90 | bwd_microstep: 4640.00 | bwd_inner_microstep: 4635.41 | bwd_allreduce_microstep: 4.50 | step_microstep: 41.42
-[2025-01-25 13:01:45,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.86 | bwd: 4640.02 | bwd_inner: 4635.41 | bwd_allreduce: 4.54 | step: 41.43
- 27%|██▋       | 1541/5800 [4:15:15<8:10:34,  6.91s/it]                                                       {'loss': 0.0278, 'grad_norm': 0.7864423394203186, 'learning_rate': 3.445060396197323e-05, 'epoch': 13.28}
- 27%|██▋       | 1541/5800 [4:15:15<8:10:34,  6.91s/it]score1 tensor([[0.6250],
-        [0.5195],
-        [0.5586],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4707, 0.5508, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:52,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 13:01:52,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4627.93 | bwd_inner_microstep: 4623.26 | bwd_allreduce_microstep: 4.59 | step_microstep: 42.74
-[2025-01-25 13:01:52,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4627.95 | bwd_inner: 4623.26 | bwd_allreduce: 4.63 | step: 42.76
- 27%|██▋       | 1542/5800 [4:15:22<8:10:28,  6.91s/it]                                                       {'loss': 0.0308, 'grad_norm': 4.409336090087891, 'learning_rate': 3.444288074722648e-05, 'epoch': 13.29}
- 27%|██▋       | 1542/5800 [4:15:22<8:10:28,  6.91s/it]score1 tensor([[0.3730],
-        [0.5781],
-        [0.4082],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5625, 0.4512, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:01:59,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 13:01:59,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4629.65 | bwd_inner_microstep: 4625.12 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.35
-[2025-01-25 13:01:59,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.78 | bwd: 4629.67 | bwd_inner: 4625.12 | bwd_allreduce: 4.49 | step: 41.36
- 27%|██▋       | 1543/5800 [4:15:29<8:10:28,  6.91s/it]                                                       {'loss': 0.0332, 'grad_norm': 3.2452383041381836, 'learning_rate': 3.4435153028938904e-05, 'epoch': 13.3}
- 27%|██▋       | 1543/5800 [4:15:29<8:10:28,  6.91s/it]score1 tensor([[0.6758],
-        [0.4961],
-        [0.3809],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.4375, 0.4121, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:06,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 13:02:06,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.07 | bwd_microstep: 4621.69 | bwd_inner_microstep: 4617.31 | bwd_allreduce_microstep: 4.30 | step_microstep: 39.00
-[2025-01-25 13:02:06,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.03 | bwd: 4621.71 | bwd_inner: 4617.31 | bwd_allreduce: 4.33 | step: 39.00
- 27%|██▋       | 1544/5800 [4:15:36<8:09:55,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 0.6027178168296814, 'learning_rate': 3.4427420809520145e-05, 'epoch': 13.31}
- 27%|██▋       | 1544/5800 [4:15:36<8:09:55,  6.91s/it]score1 tensor([[0.5156],
-        [0.4297],
-        [0.4531],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4609, 0.4727, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:12,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:02:12,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.55 | bwd_microstep: 4634.93 | bwd_inner_microstep: 4630.44 | bwd_allreduce_microstep: 4.41 | step_microstep: 40.70
-[2025-01-25 13:02:12,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.51 | bwd: 4634.96 | bwd_inner: 4630.44 | bwd_allreduce: 4.45 | step: 40.71
- 27%|██▋       | 1545/5800 [4:15:42<8:09:58,  6.91s/it]                                                       {'loss': 0.0381, 'grad_norm': 7.213539123535156, 'learning_rate': 3.441968409138124e-05, 'epoch': 13.32}
- 27%|██▋       | 1545/5800 [4:15:42<8:09:58,  6.91s/it]score1 tensor([[0.4219],
-        [0.5156],
-        [0.4785],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5352, 0.5312, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:19,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 13:02:19,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.06 | bwd_microstep: 4622.41 | bwd_inner_microstep: 4617.90 | bwd_allreduce_microstep: 4.42 | step_microstep: 39.85
-[2025-01-25 13:02:19,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.03 | bwd: 4622.44 | bwd_inner: 4617.90 | bwd_allreduce: 4.47 | step: 39.86
- 27%|██▋       | 1546/5800 [4:15:49<8:09:35,  6.91s/it]                                                       {'loss': 0.0249, 'grad_norm': 6.8929009437561035, 'learning_rate': 3.4411942876934637e-05, 'epoch': 13.33}
- 27%|██▋       | 1546/5800 [4:15:49<8:09:35,  6.91s/it]score1 tensor([[0.5469],
-        [0.6602],
-        [0.3945],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6641, 0.4434, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:26,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:02:26,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.07 | bwd_microstep: 4631.89 | bwd_inner_microstep: 4627.15 | bwd_allreduce_microstep: 4.66 | step_microstep: 48.50
-[2025-01-25 13:02:26,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.04 | bwd: 4631.91 | bwd_inner: 4627.15 | bwd_allreduce: 4.70 | step: 48.51
- 27%|██▋       | 1547/5800 [4:15:56<8:09:45,  6.91s/it]                                                       {'loss': 0.02, 'grad_norm': 3.840388774871826, 'learning_rate': 3.4404197168594166e-05, 'epoch': 13.34}
- 27%|██▋       | 1547/5800 [4:15:56<8:09:45,  6.91s/it]score1 tensor([[0.5430],
-        [0.5117],
-        [0.4688],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4648, 0.4805, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:33,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 13:02:33,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.09 | bwd_microstep: 4623.81 | bwd_inner_microstep: 4619.31 | bwd_allreduce_microstep: 4.41 | step_microstep: 40.42
-[2025-01-25 13:02:33,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.05 | bwd: 4623.83 | bwd_inner: 4619.31 | bwd_allreduce: 4.46 | step: 40.42
- 27%|██▋       | 1548/5800 [4:16:03<8:09:30,  6.91s/it]                                                       {'loss': 0.0444, 'grad_norm': 4.0351433753967285, 'learning_rate': 3.439644696877509e-05, 'epoch': 13.34}
- 27%|██▋       | 1548/5800 [4:16:03<8:09:30,  6.91s/it]score1 tensor([[0.4609],
-        [0.3555],
-        [0.5938],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3750, 0.5391, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:40,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 13:02:40,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.63 | bwd_microstep: 4634.59 | bwd_inner_microstep: 4628.77 | bwd_allreduce_microstep: 5.67 | step_microstep: 58.25
-[2025-01-25 13:02:40,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.60 | bwd: 4634.64 | bwd_inner: 4628.77 | bwd_allreduce: 5.75 | step: 58.23
- 27%|██▋       | 1549/5800 [4:16:10<8:09:48,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 4.535040378570557, 'learning_rate': 3.438869227989405e-05, 'epoch': 13.35}
- 27%|██▋       | 1549/5800 [4:16:10<8:09:48,  6.91s/it]score1 tensor([[0.5117],
-        [0.6172],
-        [0.5039],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6094, 0.4922, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:47,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.42
-[2025-01-25 13:02:47,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.15 | bwd_microstep: 4633.62 | bwd_inner_microstep: 4626.12 | bwd_allreduce_microstep: 7.32 | step_microstep: 85.42
-[2025-01-25 13:02:47,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.11 | bwd: 4633.67 | bwd_inner: 4626.12 | bwd_allreduce: 7.41 | step: 85.44
- 27%|██▋       | 1550/5800 [4:16:17<8:11:18,  6.94s/it]                                                       {'loss': 0.0166, 'grad_norm': 4.351744174957275, 'learning_rate': 3.438093310436909e-05, 'epoch': 13.36}
- 27%|██▋       | 1550/5800 [4:16:17<8:11:18,  6.94s/it]score1 tensor([[0.5508],
-        [0.5273],
-        [0.6914],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5820, 0.6367, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:02:54,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 13:02:54,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.47 | bwd_microstep: 4627.32 | bwd_inner_microstep: 4622.65 | bwd_allreduce_microstep: 4.58 | step_microstep: 44.23
-[2025-01-25 13:02:54,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.41 | bwd: 4627.35 | bwd_inner: 4622.65 | bwd_allreduce: 4.63 | step: 44.24
- 27%|██▋       | 1551/5800 [4:16:24<8:11:17,  6.94s/it]                                                       {'loss': 0.0449, 'grad_norm': 4.676804065704346, 'learning_rate': 3.4373169444619656e-05, 'epoch': 13.37}
- 27%|██▋       | 1551/5800 [4:16:24<8:11:17,  6.94s/it]score1 tensor([[0.4609],
-        [0.6016],
-        [0.3594],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.6094, 0.3652, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:01,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.27 | optimizer_step: 4.37
-[2025-01-25 13:03:01,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.67 | bwd_microstep: 4619.53 | bwd_inner_microstep: 4614.53 | bwd_allreduce_microstep: 4.90 | step_microstep: 42.77
-[2025-01-25 13:03:01,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.61 | bwd: 4619.55 | bwd_inner: 4614.53 | bwd_allreduce: 4.95 | step: 42.77
- 27%|██▋       | 1552/5800 [4:16:31<8:10:35,  6.93s/it]                                                       {'loss': 0.0151, 'grad_norm': 3.868452548980713, 'learning_rate': 3.436540130306659e-05, 'epoch': 13.38}
- 27%|██▋       | 1552/5800 [4:16:31<8:10:35,  6.93s/it]score1 tensor([[0.4297],
-        [0.5625],
-        [0.4316],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.5469, 0.4141, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:08,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 13:03:08,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.75 | bwd_microstep: 4625.50 | bwd_inner_microstep: 4620.83 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.85
-[2025-01-25 13:03:08,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.72 | bwd: 4625.53 | bwd_inner: 4620.83 | bwd_allreduce: 4.62 | step: 41.86
- 27%|██▋       | 1553/5800 [4:16:38<8:10:10,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.664539098739624, 'learning_rate': 3.435762868213214e-05, 'epoch': 13.39}
- 27%|██▋       | 1553/5800 [4:16:38<8:10:10,  6.93s/it]score1 tensor([[0.5508],
-        [0.5781],
-        [0.4199],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.6016, 0.4629, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:15,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 13:03:15,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4630.24 | bwd_inner_microstep: 4625.75 | bwd_allreduce_microstep: 4.39 | step_microstep: 41.94
-[2025-01-25 13:03:15,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4630.26 | bwd_inner: 4625.75 | bwd_allreduce: 4.44 | step: 41.95
- 27%|██▋       | 1554/5800 [4:16:45<8:09:47,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 7.2979936599731445, 'learning_rate': 3.4349851584239946e-05, 'epoch': 13.4}
- 27%|██▋       | 1554/5800 [4:16:45<8:09:47,  6.92s/it]score1 tensor([[0.4473],
-        [0.5820],
-        [0.5703],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.5664, 0.6133, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:22,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 13:03:22,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.89 | bwd_microstep: 4626.70 | bwd_inner_microstep: 4621.92 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.15
-[2025-01-25 13:03:22,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.86 | bwd: 4626.73 | bwd_inner: 4621.92 | bwd_allreduce: 4.73 | step: 42.16
- 27%|██▋       | 1555/5800 [4:16:52<8:09:19,  6.92s/it]                                                       {'loss': 0.041, 'grad_norm': 3.6507108211517334, 'learning_rate': 3.434207001181503e-05, 'epoch': 13.41}
- 27%|██▋       | 1555/5800 [4:16:52<8:09:19,  6.92s/it]score1 tensor([[0.5391],
-        [0.5273],
-        [0.5273],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5586, 0.5352, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:29,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 13:03:29,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.44 | bwd_microstep: 4633.48 | bwd_inner_microstep: 4627.61 | bwd_allreduce_microstep: 5.80 | step_microstep: 40.85
-[2025-01-25 13:03:29,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.39 | bwd: 4633.50 | bwd_inner: 4627.61 | bwd_allreduce: 5.83 | step: 40.85
- 27%|██▋       | 1556/5800 [4:16:59<8:09:01,  6.91s/it]                                                       {'loss': 0.0269, 'grad_norm': 3.98002552986145, 'learning_rate': 3.4334283967283824e-05, 'epoch': 13.41}
- 27%|██▋       | 1556/5800 [4:16:59<8:09:01,  6.91s/it]score1 tensor([[0.5430],
-        [0.4961],
-        [0.5000],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5391, 0.4688, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:36,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 13:03:36,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.30 | bwd_microstep: 4623.31 | bwd_inner_microstep: 4618.83 | bwd_allreduce_microstep: 4.40 | step_microstep: 42.74
-[2025-01-25 13:03:36,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.27 | bwd: 4623.34 | bwd_inner: 4618.83 | bwd_allreduce: 4.44 | step: 42.76
- 27%|██▋       | 1557/5800 [4:17:05<8:08:59,  6.91s/it]                                                       {'loss': 0.0322, 'grad_norm': 0.47993355989456177, 'learning_rate': 3.432649345307416e-05, 'epoch': 13.42}
- 27%|██▋       | 1557/5800 [4:17:05<8:08:59,  6.91s/it]score1 tensor([[0.5820],
-        [0.5742],
-        [0.5898],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5664, 0.5781, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:42,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 10.28 | optimizer_step: 4.36
-[2025-01-25 13:03:42,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4626.97 | bwd_inner_microstep: 4622.17 | bwd_allreduce_microstep: 4.70 | step_microstep: 53.61
-[2025-01-25 13:03:42,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.48 | bwd: 4627.00 | bwd_inner: 4622.17 | bwd_allreduce: 4.75 | step: 53.61
- 27%|██▋       | 1558/5800 [4:17:12<8:09:20,  6.92s/it]                                                       {'loss': 0.0464, 'grad_norm': 3.9379522800445557, 'learning_rate': 3.431869847161525e-05, 'epoch': 13.43}
- 27%|██▋       | 1558/5800 [4:17:12<8:09:20,  6.92s/it]score1 tensor([[0.4844],
-        [0.5000],
-        [0.4551],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4785, 0.4883, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:49,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.15 | optimizer_step: 4.37
-[2025-01-25 13:03:49,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.98 | bwd_microstep: 4630.87 | bwd_inner_microstep: 4623.71 | bwd_allreduce_microstep: 7.01 | step_microstep: 65.93
-[2025-01-25 13:03:49,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.95 | bwd: 4630.94 | bwd_inner: 4623.71 | bwd_allreduce: 7.08 | step: 65.95
- 27%|██▋       | 1559/5800 [4:17:19<8:09:57,  6.93s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.7630207538604736, 'learning_rate': 3.4310899025337704e-05, 'epoch': 13.44}
- 27%|██▋       | 1559/5800 [4:17:19<8:09:57,  6.93s/it]score1 tensor([[0.4980],
-        [0.5234],
-        [0.6523],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.4766, 0.6484, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:03:56,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.20 | optimizer_step: 4.36
-[2025-01-25 13:03:56,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.80 | bwd_microstep: 4630.66 | bwd_inner_microstep: 4623.92 | bwd_allreduce_microstep: 6.53 | step_microstep: 48.00
-[2025-01-25 13:03:56,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.74 | bwd: 4630.71 | bwd_inner: 4623.92 | bwd_allreduce: 6.64 | step: 48.03
- 27%|██▋       | 1560/5800 [4:17:26<8:09:52,  6.93s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.9590189456939697, 'learning_rate': 3.430309511667353e-05, 'epoch': 13.45}
- 27%|██▋       | 1560/5800 [4:17:26<8:09:52,  6.93s/it]score1 tensor([[0.3809],
-        [0.5156],
-        [0.4961],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5039, 0.5000, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:03,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 13:04:03,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.68 | bwd_microstep: 4624.19 | bwd_inner_microstep: 4619.49 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.77
-[2025-01-25 13:04:03,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.64 | bwd: 4624.21 | bwd_inner: 4619.49 | bwd_allreduce: 4.66 | step: 42.78
- 27%|██▋       | 1561/5800 [4:17:33<8:09:36,  6.93s/it]                                                       {'loss': 0.0093, 'grad_norm': 3.498643159866333, 'learning_rate': 3.429528674805612e-05, 'epoch': 13.46}
- 27%|██▋       | 1561/5800 [4:17:33<8:09:36,  6.93s/it]score1 tensor([[0.4453],
-        [0.4590],
-        [0.3770],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4004, 0.3711, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:10,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 13:04:10,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.33 | bwd_microstep: 4635.43 | bwd_inner_microstep: 4630.71 | bwd_allreduce_microstep: 4.64 | step_microstep: 43.92
-[2025-01-25 13:04:10,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.30 | bwd: 4635.46 | bwd_inner: 4630.70 | bwd_allreduce: 4.68 | step: 43.93
- 27%|██▋       | 1562/5800 [4:17:40<8:09:15,  6.93s/it]                                                       {'loss': 0.0322, 'grad_norm': 3.3856966495513916, 'learning_rate': 3.4287473921920254e-05, 'epoch': 13.47}
- 27%|██▋       | 1562/5800 [4:17:40<8:09:15,  6.93s/it]score1 tensor([[0.4941],
-        [0.2949],
-        [0.4297],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3223, 0.4902, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:17,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 13:04:17,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.71 | bwd_microstep: 4638.82 | bwd_inner_microstep: 4633.89 | bwd_allreduce_microstep: 4.85 | step_microstep: 47.41
-[2025-01-25 13:04:17,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.68 | bwd: 4638.84 | bwd_inner: 4633.89 | bwd_allreduce: 4.89 | step: 47.42
- 27%|██▋       | 1563/5800 [4:17:47<8:08:59,  6.92s/it]                                                       {'loss': 0.0288, 'grad_norm': 3.112262487411499, 'learning_rate': 3.427965664070211e-05, 'epoch': 13.47}
- 27%|██▋       | 1563/5800 [4:17:47<8:08:59,  6.92s/it]score1 tensor([[0.3965],
-        [0.4199],
-        [0.4551],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4941, 0.6055, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:24,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:04:24,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4624.92 | bwd_inner_microstep: 4620.33 | bwd_allreduce_microstep: 4.51 | step_microstep: 40.34
-[2025-01-25 13:04:24,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.97 | bwd: 4624.94 | bwd_inner: 4620.33 | bwd_allreduce: 4.55 | step: 40.35
- 27%|██▋       | 1564/5800 [4:17:54<8:08:36,  6.92s/it]                                                       {'loss': 0.0903, 'grad_norm': 7.073589324951172, 'learning_rate': 3.427183490683925e-05, 'epoch': 13.48}
- 27%|██▋       | 1564/5800 [4:17:54<8:08:36,  6.92s/it]score1 tensor([[0.5547],
-        [0.4590],
-        [0.4844],
-        [0.1416]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4961, 0.5664, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1226, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:31,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 13:04:31,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.86 | bwd_microstep: 4626.67 | bwd_inner_microstep: 4621.85 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.05
-[2025-01-25 13:04:31,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.82 | bwd: 4626.69 | bwd_inner: 4621.85 | bwd_allreduce: 4.77 | step: 43.06
- 27%|██▋       | 1565/5800 [4:18:01<8:08:18,  6.92s/it]                                                       {'loss': 0.1226, 'grad_norm': 7.7192230224609375, 'learning_rate': 3.426400872277063e-05, 'epoch': 13.49}
- 27%|██▋       | 1565/5800 [4:18:01<8:08:18,  6.92s/it]score1 tensor([[0.4219],
-        [0.5156],
-        [0.4844],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.6172, 0.6484, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1045, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:38,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:04:38,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.56 | bwd_microstep: 4625.08 | bwd_inner_microstep: 4620.26 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.75
-[2025-01-25 13:04:38,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.53 | bwd: 4625.10 | bwd_inner: 4620.26 | bwd_allreduce: 4.77 | step: 42.75
- 27%|██▋       | 1566/5800 [4:18:08<8:08:11,  6.92s/it]                                                       {'loss': 0.1045, 'grad_norm': 7.399806976318359, 'learning_rate': 3.425617809093659e-05, 'epoch': 13.5}
- 27%|██▋       | 1566/5800 [4:18:08<8:08:11,  6.92s/it]score1 tensor([[0.5547],
-        [0.4492],
-        [0.3848],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4551, 0.4395, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:45,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 13:04:45,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.13 | bwd_microstep: 4630.95 | bwd_inner_microstep: 4625.71 | bwd_allreduce_microstep: 5.15 | step_microstep: 49.76
-[2025-01-25 13:04:45,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.10 | bwd: 4630.97 | bwd_inner: 4625.71 | bwd_allreduce: 5.19 | step: 49.81
- 27%|██▋       | 1567/5800 [4:18:15<8:08:14,  6.92s/it]                                                       {'loss': 0.0435, 'grad_norm': 7.396218299865723, 'learning_rate': 3.4248343013778854e-05, 'epoch': 13.51}
- 27%|██▋       | 1567/5800 [4:18:15<8:08:14,  6.92s/it]score1 tensor([[0.3613],
-        [0.4453],
-        [0.4668],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4844, 0.5391, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:52,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 13:04:52,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.24 | bwd_microstep: 4636.07 | bwd_inner_microstep: 4631.51 | bwd_allreduce_microstep: 4.45 | step_microstep: 52.23
-[2025-01-25 13:04:52,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.20 | bwd: 4636.09 | bwd_inner: 4631.51 | bwd_allreduce: 4.50 | step: 52.23
- 27%|██▋       | 1568/5800 [4:18:22<8:08:21,  6.92s/it]                                                       {'loss': 0.0503, 'grad_norm': 6.988527297973633, 'learning_rate': 3.4240503493740526e-05, 'epoch': 13.52}
- 27%|██▋       | 1568/5800 [4:18:22<8:08:21,  6.92s/it]score1 tensor([[0.3945],
-        [0.5117],
-        [0.4727],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.5508, 0.5430, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:04:59,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.04 | optimizer_step: 4.37
-[2025-01-25 13:04:59,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.71 | bwd_microstep: 4629.73 | bwd_inner_microstep: 4622.81 | bwd_allreduce_microstep: 6.76 | step_microstep: 59.54
-[2025-01-25 13:04:59,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.67 | bwd: 4629.76 | bwd_inner: 4622.81 | bwd_allreduce: 6.86 | step: 59.56
- 27%|██▋       | 1569/5800 [4:18:29<8:08:39,  6.93s/it]                                                       {'loss': 0.042, 'grad_norm': 7.099950790405273, 'learning_rate': 3.423265953326613e-05, 'epoch': 13.53}
- 27%|██▋       | 1569/5800 [4:18:29<8:08:39,  6.93s/it]score1 tensor([[0.4102],
-        [0.5000],
-        [0.3398],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4961, 0.3477, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:06,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:05:06,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.14 | bwd_microstep: 4631.65 | bwd_inner_microstep: 4627.39 | bwd_allreduce_microstep: 4.17 | step_microstep: 45.08
-[2025-01-25 13:05:06,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4631.68 | bwd_inner: 4627.39 | bwd_allreduce: 4.22 | step: 45.10
- 27%|██▋       | 1570/5800 [4:18:36<8:08:44,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.8008944988250732, 'learning_rate': 3.422481113480153e-05, 'epoch': 13.53}
- 27%|██▋       | 1570/5800 [4:18:36<8:08:44,  6.93s/it]score1 tensor([[0.4395],
-        [0.5703],
-        [0.5195],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5430, 0.5039, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:13,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:05:13,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.85 | bwd_microstep: 4634.92 | bwd_inner_microstep: 4630.23 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.02
-[2025-01-25 13:05:13,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.81 | bwd: 4634.94 | bwd_inner: 4630.23 | bwd_allreduce: 4.64 | step: 41.02
- 27%|██▋       | 1571/5800 [4:18:42<8:08:10,  6.93s/it]                                                       {'loss': 0.0322, 'grad_norm': 7.679900169372559, 'learning_rate': 3.421695830079399e-05, 'epoch': 13.54}
- 27%|██▋       | 1571/5800 [4:18:42<8:08:10,  6.93s/it]score1 tensor([[0.5352],
-        [0.5352],
-        [0.6250],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4492, 0.6055, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:19,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 13:05:19,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.49 | bwd_microstep: 4625.39 | bwd_inner_microstep: 4621.01 | bwd_allreduce_microstep: 4.30 | step_microstep: 41.65
-[2025-01-25 13:05:19,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.46 | bwd: 4625.42 | bwd_inner: 4621.01 | bwd_allreduce: 4.34 | step: 41.66
- 27%|██▋       | 1572/5800 [4:18:49<8:07:29,  6.92s/it]                                                       {'loss': 0.0483, 'grad_norm': 7.939088821411133, 'learning_rate': 3.4209101033692165e-05, 'epoch': 13.55}
- 27%|██▋       | 1572/5800 [4:18:49<8:07:29,  6.92s/it]score1 tensor([[0.6172],
-        [0.6836],
-        [0.4160],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.6445, 0.3672, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:26,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 13:05:26,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.21 | bwd_microstep: 4632.29 | bwd_inner_microstep: 4627.85 | bwd_allreduce_microstep: 4.34 | step_microstep: 41.18
-[2025-01-25 13:05:26,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.18 | bwd: 4632.31 | bwd_inner: 4627.85 | bwd_allreduce: 4.39 | step: 41.19
- 27%|██▋       | 1573/5800 [4:18:56<8:07:17,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 8.273606300354004, 'learning_rate': 3.42012393359461e-05, 'epoch': 13.56}
- 27%|██▋       | 1573/5800 [4:18:56<8:07:17,  6.92s/it]score1 tensor([[0.4961],
-        [0.4609],
-        [0.5625],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.3906, 0.5625, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:33,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:05:33,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.57 | bwd_microstep: 4580.00 | bwd_inner_microstep: 4575.72 | bwd_allreduce_microstep: 4.19 | step_microstep: 39.90
-[2025-01-25 13:05:33,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.54 | bwd: 4580.03 | bwd_inner: 4575.72 | bwd_allreduce: 4.24 | step: 39.91
- 27%|██▋       | 1574/5800 [4:19:03<8:05:55,  6.90s/it]                                                       {'loss': 0.0366, 'grad_norm': 1.5151419639587402, 'learning_rate': 3.4193373210007186e-05, 'epoch': 13.57}
- 27%|██▋       | 1574/5800 [4:19:03<8:05:55,  6.90s/it]score1 tensor([[0.5508],
-        [0.4414],
-        [0.4297],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4629, 0.4199, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:40,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 13:05:40,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.21 | bwd_microstep: 4628.67 | bwd_inner_microstep: 4624.25 | bwd_allreduce_microstep: 4.33 | step_microstep: 42.37
-[2025-01-25 13:05:40,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.18 | bwd: 4628.70 | bwd_inner: 4624.25 | bwd_allreduce: 4.38 | step: 42.38
- 27%|██▋       | 1575/5800 [4:19:10<8:06:13,  6.90s/it]                                                       {'loss': 0.0176, 'grad_norm': 0.9087044596672058, 'learning_rate': 3.4185502658328235e-05, 'epoch': 13.58}
- 27%|██▋       | 1575/5800 [4:19:10<8:06:13,  6.90s/it]score1 tensor([[0.5859],
-        [0.5547],
-        [0.5586],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5430, 0.6055, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:47,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 13:05:47,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.79 | bwd_microstep: 4630.24 | bwd_inner_microstep: 4625.84 | bwd_allreduce_microstep: 4.32 | step_microstep: 40.35
-[2025-01-25 13:05:47,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.75 | bwd: 4630.27 | bwd_inner: 4625.84 | bwd_allreduce: 4.36 | step: 40.36
- 27%|██▋       | 1576/5800 [4:19:17<8:06:09,  6.91s/it]                                                       {'loss': 0.0322, 'grad_norm': 4.018801689147949, 'learning_rate': 3.417762768336341e-05, 'epoch': 13.59}
- 27%|██▋       | 1576/5800 [4:19:17<8:06:09,  6.91s/it]score1 tensor([[0.4336],
-        [0.5117],
-        [0.5859],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5820, 0.6094, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:05:54,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.37
-[2025-01-25 13:05:54,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.21 | bwd_microstep: 4629.18 | bwd_inner_microstep: 4624.61 | bwd_allreduce_microstep: 4.50 | step_microstep: 46.50
-[2025-01-25 13:05:54,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4629.21 | bwd_inner: 4624.61 | bwd_allreduce: 4.53 | step: 46.48
- 27%|██▋       | 1577/5800 [4:19:24<8:06:09,  6.91s/it]                                                       {'loss': 0.0308, 'grad_norm': 7.364545822143555, 'learning_rate': 3.416974828756828e-05, 'epoch': 13.59}
- 27%|██▋       | 1577/5800 [4:19:24<8:06:09,  6.91s/it]score1 tensor([[0.4492],
-        [0.5469],
-        [0.4043],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.6172, 0.4160, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:01,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 13:06:01,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.59 | bwd_microstep: 4626.74 | bwd_inner_microstep: 4620.93 | bwd_allreduce_microstep: 5.69 | step_microstep: 62.30
-[2025-01-25 13:06:01,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4626.76 | bwd_inner: 4620.93 | bwd_allreduce: 5.75 | step: 62.31
- 27%|██▋       | 1578/5800 [4:19:31<8:06:43,  6.92s/it]                                                       {'loss': 0.0361, 'grad_norm': 7.127826690673828, 'learning_rate': 3.416186447339975e-05, 'epoch': 13.6}
- 27%|██▋       | 1578/5800 [4:19:31<8:06:43,  6.92s/it]score1 tensor([[0.4258],
-        [0.4629],
-        [0.4043],
-        [0.3203]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5156, 0.4023, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:08,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 13:06:08,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2173.88 | bwd_microstep: 4636.33 | bwd_inner_microstep: 4631.28 | bwd_allreduce_microstep: 4.96 | step_microstep: 64.18
-[2025-01-25 13:06:08,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2173.84 | bwd: 4636.36 | bwd_inner: 4631.28 | bwd_allreduce: 5.01 | step: 64.22
- 27%|██▋       | 1579/5800 [4:19:38<8:07:39,  6.93s/it]                                                       {'loss': 0.022, 'grad_norm': 3.0484530925750732, 'learning_rate': 3.415397624331616e-05, 'epoch': 13.61}
- 27%|██▋       | 1579/5800 [4:19:38<8:07:39,  6.93s/it]score1 tensor([[0.6172],
-        [0.4062],
-        [0.4336],
-        [0.3242]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4277, 0.4648, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:15,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 13:06:15,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4632.80 | bwd_inner_microstep: 4627.96 | bwd_allreduce_microstep: 4.76 | step_microstep: 51.34
-[2025-01-25 13:06:15,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.11 | bwd: 4632.82 | bwd_inner: 4627.96 | bwd_allreduce: 4.79 | step: 51.34
- 27%|██▋       | 1580/5800 [4:19:45<8:07:25,  6.93s/it]                                                       {'loss': 0.0244, 'grad_norm': 6.917860507965088, 'learning_rate': 3.414608359977719e-05, 'epoch': 13.62}
- 27%|██▋       | 1580/5800 [4:19:45<8:07:25,  6.93s/it]score1 tensor([[0.5625],
-        [0.3320],
-        [0.6055],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.2812, 0.6250, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:22,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 13:06:22,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.74 | bwd_microstep: 4633.51 | bwd_inner_microstep: 4628.65 | bwd_allreduce_microstep: 4.76 | step_microstep: 40.73
-[2025-01-25 13:06:22,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.70 | bwd: 4633.53 | bwd_inner: 4628.65 | bwd_allreduce: 4.81 | step: 40.74
- 27%|██▋       | 1581/5800 [4:19:52<8:07:04,  6.93s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.423737049102783, 'learning_rate': 3.413818654524389e-05, 'epoch': 13.63}
- 27%|██▋       | 1581/5800 [4:19:52<8:07:04,  6.93s/it]score1 tensor([[0.5273],
-        [0.4902],
-        [0.4453],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5625, 0.5078, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:29,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 13:06:29,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.79 | bwd_microstep: 4632.91 | bwd_inner_microstep: 4627.01 | bwd_allreduce_microstep: 5.81 | step_microstep: 46.46
-[2025-01-25 13:06:29,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.75 | bwd: 4632.94 | bwd_inner: 4627.01 | bwd_allreduce: 5.86 | step: 46.47
- 27%|██▋       | 1582/5800 [4:19:59<8:06:55,  6.93s/it]                                                       {'loss': 0.0498, 'grad_norm': 3.7563769817352295, 'learning_rate': 3.41302850821787e-05, 'epoch': 13.64}
- 27%|██▋       | 1582/5800 [4:19:59<8:06:55,  6.93s/it]score1 tensor([[0.3672],
-        [0.5000],
-        [0.4082],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.5273, 0.3789, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:36,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:06:36,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.04 | bwd_microstep: 4627.99 | bwd_inner_microstep: 4623.11 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.77
-[2025-01-25 13:06:36,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.00 | bwd: 4628.01 | bwd_inner: 4623.11 | bwd_allreduce: 4.83 | step: 41.78
- 27%|██▋       | 1583/5800 [4:20:05<8:06:31,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.982915997505188, 'learning_rate': 3.412237921304545e-05, 'epoch': 13.65}
- 27%|██▋       | 1583/5800 [4:20:05<8:06:31,  6.92s/it]score1 tensor([[0.5703],
-        [0.4766],
-        [0.4629],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4863, 0.4805, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:42,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 13:06:42,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.38 | bwd_microstep: 4572.65 | bwd_inner_microstep: 4567.83 | bwd_allreduce_microstep: 4.74 | step_microstep: 41.33
-[2025-01-25 13:06:42,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.34 | bwd: 4572.67 | bwd_inner: 4567.83 | bwd_allreduce: 4.78 | step: 41.33
- 27%|██▋       | 1584/5800 [4:20:12<8:04:49,  6.90s/it]                                                       {'loss': 0.0093, 'grad_norm': 5.449896335601807, 'learning_rate': 3.411446894030931e-05, 'epoch': 13.66}
- 27%|██▋       | 1584/5800 [4:20:12<8:04:49,  6.90s/it]score1 tensor([[0.4746],
-        [0.5234],
-        [0.5273],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5469, 0.5156, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:49,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 13:06:49,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4631.48 | bwd_inner_microstep: 4626.15 | bwd_allreduce_microstep: 5.24 | step_microstep: 42.84
-[2025-01-25 13:06:49,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.25 | bwd: 4631.51 | bwd_inner: 4626.15 | bwd_allreduce: 5.29 | step: 42.85
- 27%|██▋       | 1585/5800 [4:20:19<8:05:07,  6.91s/it]                                                       {'loss': 0.0195, 'grad_norm': 3.6661157608032227, 'learning_rate': 3.410655426643685e-05, 'epoch': 13.66}
- 27%|██▋       | 1585/5800 [4:20:19<8:05:07,  6.91s/it]score1 tensor([[0.6914],
-        [0.4746],
-        [0.4180],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4473, 0.3730, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:06:56,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:06:56,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.88 | bwd_microstep: 4627.63 | bwd_inner_microstep: 4622.48 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.00
-[2025-01-25 13:06:56,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.85 | bwd: 4627.66 | bwd_inner: 4622.48 | bwd_allreduce: 5.10 | step: 43.01
- 27%|██▋       | 1586/5800 [4:20:26<8:05:05,  6.91s/it]                                                       {'loss': 0.0449, 'grad_norm': 4.255904197692871, 'learning_rate': 3.4098635193895994e-05, 'epoch': 13.67}
- 27%|██▋       | 1586/5800 [4:20:26<8:05:05,  6.91s/it]score1 tensor([[0.3828],
-        [0.6523],
-        [0.4707],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6094, 0.4746, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:03,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.35 | optimizer_step: 4.37
-[2025-01-25 13:07:03,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.73 | bwd_microstep: 4633.07 | bwd_inner_microstep: 4626.65 | bwd_allreduce_microstep: 6.21 | step_microstep: 48.27
-[2025-01-25 13:07:03,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.69 | bwd: 4633.09 | bwd_inner: 4626.65 | bwd_allreduce: 6.36 | step: 48.28
- 27%|██▋       | 1587/5800 [4:20:33<8:05:27,  6.91s/it]                                                       {'loss': 0.0308, 'grad_norm': 3.033416986465454, 'learning_rate': 3.409071172515606e-05, 'epoch': 13.68}
- 27%|██▋       | 1587/5800 [4:20:33<8:05:27,  6.91s/it]score1 tensor([[0.4238],
-        [0.4980],
-        [0.6133],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.4980, 0.5508, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:10,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 13:07:10,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4574.99 | bwd_inner_microstep: 4569.98 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.31
-[2025-01-25 13:07:10,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4575.01 | bwd_inner: 4569.98 | bwd_allreduce: 4.96 | step: 42.32
- 27%|██▋       | 1588/5800 [4:20:40<8:04:29,  6.90s/it]                                                       {'loss': 0.0591, 'grad_norm': 5.615544319152832, 'learning_rate': 3.4082783862687714e-05, 'epoch': 13.69}
- 27%|██▋       | 1588/5800 [4:20:40<8:04:29,  6.90s/it]score1 tensor([[0.5234],
-        [0.5625],
-        [0.5234],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5508, 0.5039, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:17,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.84 | optimizer_step: 4.36
-[2025-01-25 13:07:17,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.92 | bwd_microstep: 4636.28 | bwd_inner_microstep: 4628.57 | bwd_allreduce_microstep: 7.38 | step_microstep: 53.28
-[2025-01-25 13:07:17,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.80 | bwd: 4636.35 | bwd_inner: 4628.57 | bwd_allreduce: 7.57 | step: 53.28
- 27%|██▋       | 1589/5800 [4:20:47<8:05:33,  6.92s/it]                                                       {'loss': 0.0366, 'grad_norm': 7.570906639099121, 'learning_rate': 3.4074851608962995e-05, 'epoch': 13.7}
- 27%|██▋       | 1589/5800 [4:20:47<8:05:33,  6.92s/it]score1 tensor([[0.5312],
-        [0.5195],
-        [0.3984],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5664, 0.3867, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:24,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 13:07:24,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.76 | bwd_microstep: 4619.82 | bwd_inner_microstep: 4613.68 | bwd_allreduce_microstep: 6.03 | step_microstep: 47.75
-[2025-01-25 13:07:24,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.72 | bwd: 4619.85 | bwd_inner: 4613.68 | bwd_allreduce: 6.09 | step: 47.77
- 27%|██▋       | 1590/5800 [4:20:54<8:05:28,  6.92s/it]                                                       {'loss': 0.0522, 'grad_norm': 0.7777058482170105, 'learning_rate': 3.406691496645533e-05, 'epoch': 13.71}
- 27%|██▋       | 1590/5800 [4:20:54<8:05:28,  6.92s/it]score1 tensor([[0.4863],
-        [0.5273],
-        [0.6250],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5273, 0.6562, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:31,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 13:07:31,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.37 | bwd_microstep: 4538.94 | bwd_inner_microstep: 4534.20 | bwd_allreduce_microstep: 4.67 | step_microstep: 46.92
-[2025-01-25 13:07:31,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.34 | bwd: 4538.96 | bwd_inner: 4534.20 | bwd_allreduce: 4.70 | step: 46.97
- 27%|██▋       | 1591/5800 [4:21:01<8:03:31,  6.89s/it]                                                       {'loss': 0.0122, 'grad_norm': 3.970755100250244, 'learning_rate': 3.40589739376395e-05, 'epoch': 13.72}
- 27%|██▋       | 1591/5800 [4:21:01<8:03:31,  6.89s/it]score1 tensor([[0.5234],
-        [0.5312],
-        [0.4473],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4961, 0.4043, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:38,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 13:07:38,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4632.13 | bwd_inner_microstep: 4626.34 | bwd_allreduce_microstep: 5.64 | step_microstep: 44.65
-[2025-01-25 13:07:38,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.31 | bwd: 4632.16 | bwd_inner: 4626.34 | bwd_allreduce: 5.72 | step: 44.67
- 27%|██▋       | 1592/5800 [4:21:08<8:03:48,  6.90s/it]                                                       {'loss': 0.0469, 'grad_norm': 3.6914358139038086, 'learning_rate': 3.4051028524991644e-05, 'epoch': 13.72}
- 27%|██▋       | 1592/5800 [4:21:08<8:03:48,  6.90s/it]score1 tensor([[0.5039],
-        [0.4746],
-        [0.5273],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4570, 0.5000, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:45,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 13:07:45,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.75 | bwd_microstep: 4629.57 | bwd_inner_microstep: 4620.27 | bwd_allreduce_microstep: 9.21 | step_microstep: 47.44
-[2025-01-25 13:07:45,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.69 | bwd: 4629.60 | bwd_inner: 4620.27 | bwd_allreduce: 9.26 | step: 47.44
- 27%|██▋       | 1593/5800 [4:21:14<8:04:06,  6.90s/it]                                                       {'loss': 0.019, 'grad_norm': 3.8569529056549072, 'learning_rate': 3.40430787309893e-05, 'epoch': 13.73}
- 27%|██▋       | 1593/5800 [4:21:14<8:04:06,  6.90s/it]score1 tensor([[0.5000],
-        [0.6914],
-        [0.5430],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6719, 0.5547, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:51,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 13:07:51,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.54 | bwd_microstep: 4631.47 | bwd_inner_microstep: 4626.41 | bwd_allreduce_microstep: 4.96 | step_microstep: 45.38
-[2025-01-25 13:07:51,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.52 | bwd: 4631.49 | bwd_inner: 4626.41 | bwd_allreduce: 5.01 | step: 45.38
- 27%|██▋       | 1594/5800 [4:21:21<8:04:22,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 1.1575164794921875, 'learning_rate': 3.4035124558111325e-05, 'epoch': 13.74}
- 27%|██▋       | 1594/5800 [4:21:21<8:04:22,  6.91s/it]score1 tensor([[0.4414],
-        [0.4570],
-        [0.4941],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4473, 0.4980, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:07:58,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 13:07:58,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.09 | bwd_microstep: 4627.27 | bwd_inner_microstep: 4622.19 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.11
-[2025-01-25 13:07:58,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4627.29 | bwd_inner: 4622.18 | bwd_allreduce: 5.04 | step: 44.13
- 28%|██▊       | 1595/5800 [4:21:28<8:04:13,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 3.766605854034424, 'learning_rate': 3.402716600883799e-05, 'epoch': 13.75}
- 28%|██▊       | 1595/5800 [4:21:28<8:04:13,  6.91s/it]score1 tensor([[0.4219],
-        [0.5352],
-        [0.6211],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5664, 0.6367, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:05,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 13:08:05,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.05 | bwd_microstep: 4631.87 | bwd_inner_microstep: 4627.20 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.86
-[2025-01-25 13:08:05,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.01 | bwd: 4631.89 | bwd_inner: 4627.20 | bwd_allreduce: 4.62 | step: 42.88
- 28%|██▊       | 1596/5800 [4:21:35<8:04:22,  6.91s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.017066478729248, 'learning_rate': 3.40192030856509e-05, 'epoch': 13.76}
- 28%|██▊       | 1596/5800 [4:21:35<8:04:22,  6.91s/it]score1 tensor([[0.4688],
-        [0.2188],
-        [0.5664],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4492, 0.5391, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0850, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:12,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.57 | optimizer_step: 4.42
-[2025-01-25 13:08:12,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.04 | bwd_microstep: 4631.88 | bwd_inner_microstep: 4623.65 | bwd_allreduce_microstep: 7.75 | step_microstep: 57.30
-[2025-01-25 13:08:12,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.00 | bwd: 4631.96 | bwd_inner: 4623.65 | bwd_allreduce: 8.04 | step: 57.31
- 28%|██▊       | 1597/5800 [4:21:42<8:05:04,  6.92s/it]                                                       {'loss': 0.085, 'grad_norm': 6.955930709838867, 'learning_rate': 3.4011235791033024e-05, 'epoch': 13.77}
- 28%|██▊       | 1597/5800 [4:21:42<8:05:04,  6.92s/it]score1 tensor([[0.4375],
-        [0.5469],
-        [0.6328],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5898, 0.6445, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:19,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 13:08:19,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.66 | bwd_microstep: 4626.88 | bwd_inner_microstep: 4618.74 | bwd_allreduce_microstep: 8.00 | step_microstep: 76.49
-[2025-01-25 13:08:19,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.59 | bwd: 4626.91 | bwd_inner: 4618.74 | bwd_allreduce: 8.08 | step: 76.54
- 28%|██▊       | 1598/5800 [4:21:49<8:06:41,  6.95s/it]                                                       {'loss': 0.0225, 'grad_norm': 7.622585773468018, 'learning_rate': 3.400326412746872e-05, 'epoch': 13.78}
- 28%|██▊       | 1598/5800 [4:21:49<8:06:41,  6.95s/it]score1 tensor([[0.6797],
-        [0.5273],
-        [0.5469],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6328, 0.5781, 0.5508, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:26,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 13:08:26,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.15 | bwd_microstep: 4633.47 | bwd_inner_microstep: 4627.50 | bwd_allreduce_microstep: 5.84 | step_microstep: 48.17
-[2025-01-25 13:08:26,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.12 | bwd: 4633.49 | bwd_inner: 4627.50 | bwd_allreduce: 5.91 | step: 48.17
- 28%|██▊       | 1599/5800 [4:21:56<8:06:43,  6.95s/it]                                                       {'loss': 0.0264, 'grad_norm': 18.010894775390625, 'learning_rate': 3.3995288097443683e-05, 'epoch': 13.78}
- 28%|██▊       | 1599/5800 [4:21:56<8:06:43,  6.95s/it]score1 tensor([[0.4492],
-        [0.3906],
-        [0.3223],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.3809, 0.4668, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1021, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:33,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 13:08:33,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.88 | bwd_microstep: 4628.14 | bwd_inner_microstep: 4623.07 | bwd_allreduce_microstep: 4.95 | step_microstep: 49.70
-[2025-01-25 13:08:33,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.83 | bwd: 4628.19 | bwd_inner: 4623.07 | bwd_allreduce: 5.01 | step: 49.72
- 28%|██▊       | 1600/5800 [4:22:03<8:06:06,  6.94s/it]                                                       {'loss': 0.1021, 'grad_norm': 13.708812713623047, 'learning_rate': 3.3987307703444984e-05, 'epoch': 13.79}
- 28%|██▊       | 1600/5800 [4:22:03<8:06:06,  6.94s/it]score1 tensor([[0.5547],
-        [0.5508],
-        [0.4941],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5078, 0.4141, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:40,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:08:40,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.12 | bwd_microstep: 4621.99 | bwd_inner_microstep: 4616.95 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.10
-[2025-01-25 13:08:40,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4622.01 | bwd_inner: 4616.95 | bwd_allreduce: 4.98 | step: 43.11
- 28%|██▊       | 1601/5800 [4:22:10<8:05:18,  6.93s/it]                                                       {'loss': 0.0376, 'grad_norm': 0.7162468433380127, 'learning_rate': 3.397932294796103e-05, 'epoch': 13.8}
- 28%|██▊       | 1601/5800 [4:22:10<8:05:18,  6.93s/it]score1 tensor([[0.4551],
-        [0.4355],
-        [0.7031],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4609, 0.6289, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:47,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 13:08:47,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.44 | bwd_microstep: 4621.92 | bwd_inner_microstep: 4616.49 | bwd_allreduce_microstep: 5.34 | step_microstep: 48.01
-[2025-01-25 13:08:47,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.41 | bwd: 4621.94 | bwd_inner: 4616.49 | bwd_allreduce: 5.39 | step: 48.01
- 28%|██▊       | 1602/5800 [4:22:17<8:04:45,  6.93s/it]                                                       {'loss': 0.0483, 'grad_norm': 0.8133872151374817, 'learning_rate': 3.397133383348163e-05, 'epoch': 13.81}
- 28%|██▊       | 1602/5800 [4:22:17<8:04:45,  6.93s/it]score1 tensor([[0.5273],
-        [0.4297],
-        [0.5938],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.3750, 0.5117, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:08:54,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 13:08:54,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.03 | bwd_microstep: 4629.36 | bwd_inner_microstep: 4623.96 | bwd_allreduce_microstep: 5.32 | step_microstep: 45.11
-[2025-01-25 13:08:54,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.99 | bwd: 4629.38 | bwd_inner: 4623.96 | bwd_allreduce: 5.36 | step: 45.12
- 28%|██▊       | 1603/5800 [4:22:24<8:04:15,  6.92s/it]                                                       {'loss': 0.0557, 'grad_norm': 7.546109199523926, 'learning_rate': 3.3963340362497904e-05, 'epoch': 13.82}
- 28%|██▊       | 1603/5800 [4:22:24<8:04:15,  6.92s/it]score1 tensor([[0.4863],
-        [0.4961],
-        [0.5938],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4258, 0.5586, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:09:01,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 13:09:01,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.73 | bwd_microstep: 4626.14 | bwd_inner_microstep: 4620.31 | bwd_allreduce_microstep: 5.67 | step_microstep: 45.95
-[2025-01-25 13:09:01,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4626.17 | bwd_inner: 4620.30 | bwd_allreduce: 5.76 | step: 45.96
- 28%|██▊       | 1604/5800 [4:22:31<8:03:55,  6.92s/it]                                                       {'loss': 0.0366, 'grad_norm': 7.442922592163086, 'learning_rate': 3.395534253750238e-05, 'epoch': 13.83}
- 28%|██▊       | 1604/5800 [4:22:31<8:03:55,  6.92s/it]score1 tensor([[0.5469],
-        [0.5039],
-        [0.6211],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4238, 0.5508, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:09:08,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 13:09:08,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.43 | bwd_microstep: 4637.82 | bwd_inner_microstep: 4633.23 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.35
-[2025-01-25 13:09:08,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.36 | bwd: 4637.85 | bwd_inner: 4633.23 | bwd_allreduce: 4.55 | step: 41.36
- 28%|██▊       | 1605/5800 [4:22:38<8:03:59,  6.92s/it]                                                       {'loss': 0.0483, 'grad_norm': 7.752247333526611, 'learning_rate': 3.3947340360988903e-05, 'epoch': 13.84}
- 28%|██▊       | 1605/5800 [4:22:38<8:03:59,  6.92s/it]score1 tensor([[0.5117],
-        [0.4766],
-        [0.4746],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4316, 0.4336, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:09:15,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 13:09:15,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.52 | bwd_microstep: 4633.98 | bwd_inner_microstep: 4628.71 | bwd_allreduce_microstep: 5.16 | step_microstep: 49.98
-[2025-01-25 13:09:15,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.47 | bwd: 4634.00 | bwd_inner: 4628.71 | bwd_allreduce: 5.22 | step: 50.02
- 28%|██▊       | 1606/5800 [4:22:45<8:04:06,  6.93s/it]                                                       {'loss': 0.0479, 'grad_norm': 7.181713104248047, 'learning_rate': 3.393933383545269e-05, 'epoch': 13.84}
- 28%|██▊       | 1606/5800 [4:22:45<8:04:06,  6.93s/it]score1 tensor([[0.6250],
-        [0.3750],
-        [0.5508],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.3340, 0.5898, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:09:22,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 12.18 | optimizer_step: 4.82
-[2025-01-25 13:09:22,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.82 | bwd_microstep: 4632.90 | bwd_inner_microstep: 4624.80 | bwd_allreduce_microstep: 7.79 | step_microstep: 99.39
-[2025-01-25 13:09:22,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.78 | bwd: 4632.96 | bwd_inner: 4624.80 | bwd_allreduce: 7.98 | step: 99.40
- 28%|██▊       | 1607/5800 [4:22:52<8:06:09,  6.96s/it]                                                       {'loss': 0.0396, 'grad_norm': 1.1855683326721191, 'learning_rate': 3.393132296339033e-05, 'epoch': 13.85}
- 28%|██▊       | 1607/5800 [4:22:52<8:06:09,  6.96s/it]score1 tensor([[0.4824],
-        [0.3633],
-        [0.4805],
-        [0.2041]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.3555, 0.4922, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:09:29,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 13:09:29,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.71 | bwd_microstep: 4631.46 | bwd_inner_microstep: 4625.81 | bwd_allreduce_microstep: 5.51 | step_microstep: 45.78
-[2025-01-25 13:09:29,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.66 | bwd: 4631.49 | bwd_inner: 4625.81 | bwd_allreduce: 5.59 | step: 45.79
- 28%|██▊       | 1608/5800 [4:22:59<8:05:50,  6.95s/it]                                                       {'loss': 0.0234, 'grad_norm': 1.6864440441131592, 'learning_rate': 3.392330774729973e-05, 'epoch': 13.86}
- 28%|██▊       | 1608/5800 [4:22:59<8:05:50,  6.95s/it]score1 tensor([[0.5742],
-        [0.5312],
-        [0.3809],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4844, 0.3945, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:09:36,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 13:09:36,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.38 | bwd_microstep: 4629.87 | bwd_inner_microstep: 4624.18 | bwd_allreduce_microstep: 5.54 | step_microstep: 43.14
-[2025-01-25 13:09:36,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.33 | bwd: 4629.89 | bwd_inner: 4624.18 | bwd_allreduce: 5.62 | step: 43.15
- 28%|██▊       | 1609/5800 [4:23:05<8:04:48,  6.94s/it]                                                       {'loss': 0.0317, 'grad_norm': 4.459043502807617, 'learning_rate': 3.3915288189680185e-05, 'epoch': 13.87}
- 28%|██▊       | 1609/5800 [4:23:05<8:04:48,  6.94s/it]score1 tensor([[0.4766],
-        [0.3887],
-        [0.6016],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4473, 0.6797, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0576, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:09:42,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 13:09:42,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.42 | bwd_microstep: 4626.95 | bwd_inner_microstep: 4622.11 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.43
-[2025-01-25 13:09:42,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.39 | bwd: 4626.98 | bwd_inner: 4622.11 | bwd_allreduce: 4.79 | step: 42.44
- 28%|██▊       | 1610/5800 [4:23:12<8:04:06,  6.93s/it]                                                       {'loss': 0.0576, 'grad_norm': 4.823912143707275, 'learning_rate': 3.390726429303233e-05, 'epoch': 13.88}
- 28%|██▊       | 1610/5800 [4:23:12<8:04:06,  6.93s/it]evaluate!
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3789]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.8203]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3359]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1504, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3711]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2578, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3828]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3574]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3926]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3828]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3809]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3516]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3633]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3398]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3691]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3652]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1777, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3750]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1660, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3613]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3613]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3594]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.653794397514804
-PLCC_score: 0.648158888581871
-KRCC_score: 0.4646485660814035
-SRCC_level: 0.653794397514804
-PLCC_level: 0.648158888581871
-KRCC_level: 0.4646485660814035
-New best SRCC_score: 0.653794397514804. Saving model...
-[INFO|trainer.py:3705] 2025-01-25 13:20:14,878 >> Saving model checkpoint to /DATA/env/wjr/newtrain/stage2/mos3
-[INFO|configuration_utils.py:410] 2025-01-25 13:20:14,887 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/config.json
-[INFO|configuration_utils.py:868] 2025-01-25 13:20:14,888 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/generation_config.json
-[INFO|modeling_utils.py:2844] 2025-01-25 13:21:53,001 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /DATA/env/wjr/newtrain/stage2/mos3/model.safetensors.index.json.
-[INFO|tokenization_utils_base.py:2641] 2025-01-25 13:21:53,005 >> tokenizer config file saved in /DATA/env/wjr/newtrain/stage2/mos3/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2650] 2025-01-25 13:21:53,005 >> Special tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/special_tokens_map.json
-[INFO|tokenization_utils_base.py:2701] 2025-01-25 13:21:53,005 >> added tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/added_tokens.json
-01/25/2025 13:22:03 - INFO - __main__ - Saved LoRA weights to /DATA/env/wjr/newtrain/stage2/mos3/lora_weights.pth
-score1 tensor([[0.3926],
-        [0.5391],
-        [0.5664],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5742, 0.6016, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:10,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 13:22:10,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2125.72 | bwd_microstep: 4577.66 | bwd_inner_microstep: 4572.55 | bwd_allreduce_microstep: 5.01 | step_microstep: 49.82
-[2025-01-25 13:22:10,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2125.64 | bwd: 4577.68 | bwd_inner: 4572.55 | bwd_allreduce: 5.06 | step: 49.84
- 28%|██▊       | 1611/5800 [4:35:40<266:36:46, 229.13s/it]                                                          {'loss': 0.0503, 'grad_norm': 7.214425086975098, 'learning_rate': 3.389923605985816e-05, 'epoch': 13.89}
- 28%|██▊       | 1611/5800 [4:35:40<266:36:46, 229.13s/it]score1 tensor([[0.5820],
-        [0.3770],
-        [0.3965],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4492, 0.4453, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:17,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:22:17,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.33 | bwd_microstep: 4592.39 | bwd_inner_microstep: 4587.37 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.27
-[2025-01-25 13:22:17,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.29 | bwd: 4592.41 | bwd_inner: 4587.37 | bwd_allreduce: 4.96 | step: 43.27
- 28%|██▊       | 1612/5800 [4:35:47<188:58:22, 162.44s/it]                                                          {'loss': 0.0396, 'grad_norm': 3.479844331741333, 'learning_rate': 3.389120349266102e-05, 'epoch': 13.9}
- 28%|██▊       | 1612/5800 [4:35:47<188:58:22, 162.44s/it]score1 tensor([[0.3457],
-        [0.5547],
-        [0.4766],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.6172, 0.5195, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:24,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 13:22:24,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.33 | bwd_microstep: 4608.04 | bwd_inner_microstep: 4603.40 | bwd_allreduce_microstep: 4.53 | step_microstep: 42.45
-[2025-01-25 13:22:24,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.29 | bwd: 4608.07 | bwd_inner: 4603.40 | bwd_allreduce: 4.58 | step: 42.45
- 28%|██▊       | 1613/5800 [4:35:54<134:38:58, 115.77s/it]                                                          {'loss': 0.0454, 'grad_norm': 7.253718852996826, 'learning_rate': 3.388316659394558e-05, 'epoch': 13.91}
- 28%|██▊       | 1613/5800 [4:35:54<134:38:58, 115.77s/it]score1 tensor([[0.3926],
-        [0.5625],
-        [0.3672],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5508, 0.4297, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:31,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:22:31,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.30 | bwd_microstep: 4582.60 | bwd_inner_microstep: 4577.53 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.24
-[2025-01-25 13:22:31,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.02 | bwd: 4582.63 | bwd_inner: 4577.53 | bwd_allreduce: 5.02 | step: 46.25
- 28%|██▊       | 1614/5800 [4:36:01<96:37:12, 83.09s/it]                                                          {'loss': 0.0464, 'grad_norm': 2.9479098320007324, 'learning_rate': 3.387512536621792e-05, 'epoch': 13.91}
- 28%|██▊       | 1614/5800 [4:36:01<96:37:12, 83.09s/it]score1 tensor([[0.6055],
-        [0.5625],
-        [0.4902],
-        [0.3379]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5312, 0.4766, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:37,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 13:22:37,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.30 | bwd_microstep: 4589.44 | bwd_inner_microstep: 4584.13 | bwd_allreduce_microstep: 5.19 | step_microstep: 44.03
-[2025-01-25 13:22:37,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.27 | bwd: 4589.47 | bwd_inner: 4584.14 | bwd_allreduce: 5.26 | step: 44.04
- 28%|██▊       | 1615/5800 [4:36:07<70:00:02, 60.22s/it]                                                        {'loss': 0.0239, 'grad_norm': 4.440952301025391, 'learning_rate': 3.386707981198541e-05, 'epoch': 13.92}
- 28%|██▊       | 1615/5800 [4:36:07<70:00:02, 60.22s/it]score1 tensor([[0.5312],
-        [0.5703],
-        [0.6172],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.5703, 0.6094, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:44,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 13:22:44,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.83 | bwd_microstep: 4544.84 | bwd_inner_microstep: 4539.50 | bwd_allreduce_microstep: 5.25 | step_microstep: 43.08
-[2025-01-25 13:22:44,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.80 | bwd: 4544.87 | bwd_inner: 4539.50 | bwd_allreduce: 5.30 | step: 43.08
- 28%|██▊       | 1616/5800 [4:36:14<51:21:26, 44.19s/it]                                                        {'loss': 0.0298, 'grad_norm': 2.2378451824188232, 'learning_rate': 3.38590299337568e-05, 'epoch': 13.93}
- 28%|██▊       | 1616/5800 [4:36:14<51:21:26, 44.19s/it]score1 tensor([[0.2871],
-        [0.3730],
-        [0.5039],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.3750, 0.4395, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:51,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 13:22:51,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.63 | bwd_microstep: 4600.43 | bwd_inner_microstep: 4595.70 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.65
-[2025-01-25 13:22:51,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.60 | bwd: 4600.45 | bwd_inner: 4595.70 | bwd_allreduce: 4.68 | step: 42.65
- 28%|██▊       | 1617/5800 [4:36:21<38:20:01, 32.99s/it]                                                        {'loss': 0.0347, 'grad_norm': 1.1247966289520264, 'learning_rate': 3.385097573404218e-05, 'epoch': 13.94}
- 28%|██▊       | 1617/5800 [4:36:21<38:20:01, 32.99s/it]score1 tensor([[0.4766],
-        [0.5742],
-        [0.4316],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6484, 0.4551, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:22:58,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 13:22:58,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.35 | bwd_microstep: 4610.77 | bwd_inner_microstep: 4604.13 | bwd_allreduce_microstep: 6.49 | step_microstep: 56.59
-[2025-01-25 13:22:58,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.24 | bwd: 4610.82 | bwd_inner: 4604.13 | bwd_allreduce: 6.57 | step: 56.57
- 28%|██▊       | 1618/5800 [4:36:28<29:13:38, 25.16s/it]                                                        {'loss': 0.0381, 'grad_norm': 3.3858163356781006, 'learning_rate': 3.3842917215352984e-05, 'epoch': 13.95}
- 28%|██▊       | 1618/5800 [4:36:28<29:13:38, 25.16s/it]score1 tensor([[0.6406],
-        [0.5312],
-        [0.4746],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4727, 0.4375, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:23:05,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 13:23:05,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.27 | bwd_microstep: 4613.06 | bwd_inner_microstep: 4605.28 | bwd_allreduce_microstep: 7.61 | step_microstep: 55.55
-[2025-01-25 13:23:05,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.18 | bwd: 4613.11 | bwd_inner: 4605.28 | bwd_allreduce: 7.69 | step: 55.53
- 28%|██▊       | 1619/5800 [4:36:35<22:52:00, 19.69s/it]                                                        {'loss': 0.0493, 'grad_norm': 3.2833797931671143, 'learning_rate': 3.383485438020202e-05, 'epoch': 13.96}
- 28%|██▊       | 1619/5800 [4:36:35<22:52:00, 19.69s/it]score1 tensor([[0.6562],
-        [0.6289],
-        [0.5820],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5664, 0.5664, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:23:12,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 13:23:12,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.52 | bwd_microstep: 4617.14 | bwd_inner_microstep: 4612.00 | bwd_allreduce_microstep: 5.02 | step_microstep: 45.29
-[2025-01-25 13:23:12,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.48 | bwd: 4617.16 | bwd_inner: 4612.00 | bwd_allreduce: 5.09 | step: 45.30
- 28%|██▊       | 1620/5800 [4:36:42<18:23:59, 15.85s/it]                                                        {'loss': 0.0488, 'grad_norm': 8.257408142089844, 'learning_rate': 3.3826787231103396e-05, 'epoch': 13.97}
- 28%|██▊       | 1620/5800 [4:36:42<18:23:59, 15.85s/it]score1 tensor([[0.4609],
-        [0.5547],
-        [0.5430],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4863, 0.4941, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:23:19,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 13:23:19,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.52 | bwd_microstep: 4619.02 | bwd_inner_microstep: 4614.20 | bwd_allreduce_microstep: 4.71 | step_microstep: 54.71
-[2025-01-25 13:23:19,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.48 | bwd: 4619.05 | bwd_inner: 4614.20 | bwd_allreduce: 4.78 | step: 54.72
- 28%|██▊       | 1621/5800 [4:36:49<15:16:43, 13.16s/it]                                                        {'loss': 0.041, 'grad_norm': 4.064204216003418, 'learning_rate': 3.3818715770572594e-05, 'epoch': 13.97}
- 28%|██▊       | 1621/5800 [4:36:49<15:16:43, 13.16s/it]score1 tensor([[0.4531],
-        [0.4805],
-        [0.5391],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.4941, 0.6836, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:23:26,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 13:23:26,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.66 | bwd_microstep: 4629.56 | bwd_inner_microstep: 4624.72 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.82
-[2025-01-25 13:23:26,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.62 | bwd: 4629.58 | bwd_inner: 4624.72 | bwd_allreduce: 4.80 | step: 45.82
- 28%|██▊       | 1622/5800 [4:36:56<13:06:05, 11.29s/it]                                                        {'loss': 0.0732, 'grad_norm': 4.1220808029174805, 'learning_rate': 3.381064000112644e-05, 'epoch': 13.98}
- 28%|██▊       | 1622/5800 [4:36:56<13:06:05, 11.29s/it]score1 tensor([[0.4844],
-        [0.4902],
-        [0.5312],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5195, 0.6016, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:23:32,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 13:23:32,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.68 | bwd_microstep: 4618.27 | bwd_inner_microstep: 4613.18 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.15
-[2025-01-25 13:23:32,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.65 | bwd: 4618.29 | bwd_inner: 4613.18 | bwd_allreduce: 5.05 | step: 43.16
- 28%|██▊       | 1623/5800 [4:37:02<11:33:49,  9.97s/it]                                                        {'loss': 0.0566, 'grad_norm': 3.9467740058898926, 'learning_rate': 3.3802559925283106e-05, 'epoch': 13.99}
- 28%|██▊       | 1623/5800 [4:37:02<11:33:49,  9.97s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:23:37,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.38 | optimizer_step: 4.36
-[2025-01-25 13:23:37,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 578.73 | bwd_microstep: 1221.59 | bwd_inner_microstep: 1215.34 | bwd_allreduce_microstep: 6.09 | step_microstep: 61.49
-[2025-01-25 13:23:37,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 578.64 | bwd: 1221.65 | bwd_inner: 1215.34 | bwd_allreduce: 6.15 | step: 61.47
- 28%|██▊       | 1624/5800 [4:37:07<9:49:15,  8.47s/it]                                                        {'loss': 0.0098, 'grad_norm': 6.427048206329346, 'learning_rate': 3.379447554556209e-05, 'epoch': 14.0}
- 28%|██▊       | 1624/5800 [4:37:07<9:49:15,  8.47s/it][2025-01-25 13:23:42,288] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 13:23:51,735] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 13:24:01,524] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 13:24:11,296] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5195],
-        [0.4043],
-        [0.5742],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4492, 0.5469, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:24:25,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:24:25,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.75 | bwd_microstep: 4605.94 | bwd_inner_microstep: 4601.27 | bwd_allreduce_microstep: 4.58 | step_microstep: 43.46
-[2025-01-25 13:24:25,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.71 | bwd: 4605.96 | bwd_inner: 4601.27 | bwd_allreduce: 4.63 | step: 43.47
- 28%|██▊       | 1625/5800 [4:37:55<23:28:56, 20.25s/it]                                                        {'loss': 0.0332, 'grad_norm': 5.691874027252197, 'learning_rate': 3.378638686448424e-05, 'epoch': 14.01}
- 28%|██▊       | 1625/5800 [4:37:55<23:28:56, 20.25s/it]score1 tensor([[0.4805],
-        [0.3516],
-        [0.5273],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.3613, 0.5547, 0.3340], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:24:32,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 13:24:32,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.00 | bwd_microstep: 4589.10 | bwd_inner_microstep: 4583.98 | bwd_allreduce_microstep: 5.02 | step_microstep: 43.10
-[2025-01-25 13:24:32,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.97 | bwd: 4589.12 | bwd_inner: 4583.98 | bwd_allreduce: 5.07 | step: 43.11
- 28%|██▊       | 1626/5800 [4:38:02<18:48:51, 16.23s/it]                                                        {'loss': 0.0298, 'grad_norm': 3.4985108375549316, 'learning_rate': 3.3778293884571756e-05, 'epoch': 14.02}
- 28%|██▊       | 1626/5800 [4:38:02<18:48:51, 16.23s/it]score1 tensor([[0.4219],
-        [0.5781],
-        [0.5703],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.6172, 0.5391, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:24:39,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:24:39,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.04 | bwd_microstep: 4588.88 | bwd_inner_microstep: 4584.42 | bwd_allreduce_microstep: 4.40 | step_microstep: 51.81
-[2025-01-25 13:24:39,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.01 | bwd: 4588.91 | bwd_inner: 4584.42 | bwd_allreduce: 4.43 | step: 51.82
- 28%|██▊       | 1627/5800 [4:38:09<15:33:07, 13.42s/it]                                                        {'loss': 0.0312, 'grad_norm': 0.45718082785606384, 'learning_rate': 3.3770196608348156e-05, 'epoch': 14.03}
- 28%|██▊       | 1627/5800 [4:38:09<15:33:07, 13.42s/it]score1 tensor([[0.5039],
-        [0.5195],
-        [0.5430],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4980, 0.5352, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:24:46,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 13:24:46,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.36 | bwd_microstep: 4605.76 | bwd_inner_microstep: 4598.69 | bwd_allreduce_microstep: 6.88 | step_microstep: 48.88
-[2025-01-25 13:24:46,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.32 | bwd: 4605.82 | bwd_inner: 4598.69 | bwd_allreduce: 6.98 | step: 48.87
- 28%|██▊       | 1628/5800 [4:38:16<13:16:16, 11.45s/it]                                                        {'loss': 0.0488, 'grad_norm': 7.34169864654541, 'learning_rate': 3.376209503833833e-05, 'epoch': 14.03}
- 28%|██▊       | 1628/5800 [4:38:16<13:16:16, 11.45s/it]score1 tensor([[0.6328],
-        [0.5625],
-        [0.5430],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5469, 0.5430, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:24:53,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 13:24:53,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.90 | bwd_microstep: 4559.97 | bwd_inner_microstep: 4553.56 | bwd_allreduce_microstep: 6.24 | step_microstep: 51.13
-[2025-01-25 13:24:53,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.81 | bwd: 4560.03 | bwd_inner: 4553.56 | bwd_allreduce: 6.33 | step: 51.14
- 28%|██▊       | 1629/5800 [4:38:23<11:40:38, 10.08s/it]                                                        {'loss': 0.0342, 'grad_norm': 6.227017879486084, 'learning_rate': 3.375398917706847e-05, 'epoch': 14.04}
- 28%|���█▊       | 1629/5800 [4:38:23<11:40:38, 10.08s/it]score1 tensor([[0.5391],
-        [0.6016],
-        [0.4492],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5703, 0.4004, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:24:59,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.36
-[2025-01-25 13:24:59,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.51 | bwd_microstep: 4619.36 | bwd_inner_microstep: 4614.53 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.38
-[2025-01-25 13:24:59,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.48 | bwd: 4619.38 | bwd_inner: 4614.53 | bwd_allreduce: 4.78 | step: 42.38
- 28%|██▊       | 1630/5800 [4:38:29<10:33:59,  9.12s/it]                                                        {'loss': 0.0444, 'grad_norm': 0.6717785596847534, 'learning_rate': 3.374587902706613e-05, 'epoch': 14.05}
- 28%|██▊       | 1630/5800 [4:38:29<10:33:59,  9.12s/it]score1 tensor([[0.4648],
-        [0.5820],
-        [0.5352],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6484, 0.5312, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:06,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:25:06,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.86 | bwd_microstep: 4614.66 | bwd_inner_microstep: 4609.61 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.37
-[2025-01-25 13:25:06,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.83 | bwd: 4614.69 | bwd_inner: 4609.61 | bwd_allreduce: 5.00 | step: 44.38
- 28%|██▊       | 1631/5800 [4:38:36<9:47:02,  8.45s/it]                                                        {'loss': 0.042, 'grad_norm': 3.631749391555786, 'learning_rate': 3.37377645908602e-05, 'epoch': 14.06}
- 28%|██▊       | 1631/5800 [4:38:36<9:47:02,  8.45s/it]score1 tensor([[0.5273],
-        [0.4062],
-        [0.4746],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3477, 0.5039, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:13,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 13:25:13,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.80 | bwd_microstep: 4617.54 | bwd_inner_microstep: 4613.33 | bwd_allreduce_microstep: 4.13 | step_microstep: 42.18
-[2025-01-25 13:25:13,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.76 | bwd: 4617.56 | bwd_inner: 4613.33 | bwd_allreduce: 4.17 | step: 42.18
- 28%|██▊       | 1632/5800 [4:38:43<9:14:19,  7.98s/it]                                                       {'loss': 0.0259, 'grad_norm': 3.832899808883667, 'learning_rate': 3.3729645870980906e-05, 'epoch': 14.07}
- 28%|██▊       | 1632/5800 [4:38:43<9:14:19,  7.98s/it]score1 tensor([[0.5664],
-        [0.5117],
-        [0.4746],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4648, 0.4336, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:20,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 13:25:20,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.48 | bwd_microstep: 4609.22 | bwd_inner_microstep: 4602.93 | bwd_allreduce_microstep: 6.20 | step_microstep: 46.37
-[2025-01-25 13:25:20,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.45 | bwd: 4609.25 | bwd_inner: 4602.93 | bwd_allreduce: 6.25 | step: 46.38
- 28%|██▊       | 1633/5800 [4:38:50<8:51:07,  7.65s/it]                                                       {'loss': 0.0483, 'grad_norm': 7.650967121124268, 'learning_rate': 3.37215228699598e-05, 'epoch': 14.08}
- 28%|██▊       | 1633/5800 [4:38:50<8:51:07,  7.65s/it]score1 tensor([[0.4609],
-        [0.5039],
-        [0.4375],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4141, 0.4043, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:27,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 13:25:27,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.97 | bwd_microstep: 4617.18 | bwd_inner_microstep: 4612.31 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.57
-[2025-01-25 13:25:27,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.93 | bwd: 4617.21 | bwd_inner: 4612.31 | bwd_allreduce: 4.82 | step: 43.59
- 28%|██▊       | 1634/5800 [4:38:57<8:35:04,  7.42s/it]                                                       {'loss': 0.0488, 'grad_norm': 3.3131625652313232, 'learning_rate': 3.371339559032977e-05, 'epoch': 14.09}
- 28%|██▊       | 1634/5800 [4:38:57<8:35:04,  7.42s/it]score1 tensor([[0.5039],
-        [0.5234],
-        [0.5781],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6406, 0.6133, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:34,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 13:25:34,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.11 | bwd_microstep: 4612.30 | bwd_inner_microstep: 4607.22 | bwd_allreduce_microstep: 4.96 | step_microstep: 41.60
-[2025-01-25 13:25:34,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.08 | bwd: 4612.32 | bwd_inner: 4607.21 | bwd_allreduce: 5.03 | step: 41.61
- 28%|██▊       | 1635/5800 [4:39:04<8:23:36,  7.25s/it]                                                       {'loss': 0.0464, 'grad_norm': 0.8050893545150757, 'learning_rate': 3.3705264034625046e-05, 'epoch': 14.09}
- 28%|██▊       | 1635/5800 [4:39:04<8:23:36,  7.25s/it]score1 tensor([[0.5352],
-        [0.4082],
-        [0.5352],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4043, 0.5508, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:41,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 13:25:41,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.13 | bwd_microstep: 4616.02 | bwd_inner_microstep: 4610.93 | bwd_allreduce_microstep: 4.95 | step_microstep: 45.51
-[2025-01-25 13:25:41,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.10 | bwd: 4616.05 | bwd_inner: 4610.93 | bwd_allreduce: 5.01 | step: 45.53
- 28%|██▊       | 1636/5800 [4:39:11<8:15:46,  7.14s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.2965943813323975, 'learning_rate': 3.3697128205381186e-05, 'epoch': 14.1}
- 28%|██▊       | 1636/5800 [4:39:11<8:15:46,  7.14s/it]score1 tensor([[0.5117],
-        [0.5664],
-        [0.4648],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.6016, 0.4961, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:48,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 13:25:48,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.39 | bwd_microstep: 4616.51 | bwd_inner_microstep: 4610.51 | bwd_allreduce_microstep: 5.91 | step_microstep: 40.91
-[2025-01-25 13:25:48,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.36 | bwd: 4616.53 | bwd_inner: 4610.51 | bwd_allreduce: 5.96 | step: 40.91
- 28%|██▊       | 1637/5800 [4:39:18<8:10:20,  7.07s/it]                                                       {'loss': 0.0527, 'grad_norm': 3.7863152027130127, 'learning_rate': 3.3688988105135094e-05, 'epoch': 14.11}
- 28%|██▊       | 1637/5800 [4:39:18<8:10:20,  7.07s/it]score1 tensor([[0.4258],
-        [0.4785],
-        [0.4609],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5430, 0.4941, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:25:55,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.36 | optimizer_step: 4.37
-[2025-01-25 13:25:55,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.28 | bwd_microstep: 4627.87 | bwd_inner_microstep: 4621.42 | bwd_allreduce_microstep: 6.36 | step_microstep: 100.71
-[2025-01-25 13:25:55,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.26 | bwd: 4627.90 | bwd_inner: 4621.42 | bwd_allreduce: 6.41 | step: 100.77
- 28%|██▊       | 1638/5800 [4:39:25<8:07:56,  7.03s/it]                                                       {'loss': 0.0444, 'grad_norm': 6.8294267654418945, 'learning_rate': 3.368084373642498e-05, 'epoch': 14.12}
- 28%|██▊       | 1638/5800 [4:39:25<8:07:56,  7.03s/it]score1 tensor([[0.5781],
-        [0.4023],
-        [0.5234],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4688, 0.6289, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:02,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:26:02,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.19 | bwd_microstep: 4616.45 | bwd_inner_microstep: 4612.01 | bwd_allreduce_microstep: 4.36 | step_microstep: 41.79
-[2025-01-25 13:26:02,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.08 | bwd: 4616.47 | bwd_inner: 4612.01 | bwd_allreduce: 4.39 | step: 41.79
- 28%|██▊       | 1639/5800 [4:39:31<8:05:18,  7.00s/it]                                                       {'loss': 0.0674, 'grad_norm': 19.105712890625, 'learning_rate': 3.367269510179041e-05, 'epoch': 14.13}
- 28%|██▊       | 1639/5800 [4:39:32<8:05:18,  7.00s/it]score1 tensor([[0.3203],
-        [0.4180],
-        [0.3965],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.4785, 0.4414, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:08,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 13:26:08,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.78 | bwd_microstep: 4620.66 | bwd_inner_microstep: 4614.74 | bwd_allreduce_microstep: 5.78 | step_microstep: 43.55
-[2025-01-25 13:26:08,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.73 | bwd: 4620.69 | bwd_inner: 4614.74 | bwd_allreduce: 5.87 | step: 43.56
- 28%|██▊       | 1640/5800 [4:39:38<8:02:58,  6.97s/it]                                                       {'loss': 0.0479, 'grad_norm': 6.040486812591553, 'learning_rate': 3.366454220377226e-05, 'epoch': 14.14}
- 28%|██▊       | 1640/5800 [4:39:38<8:02:58,  6.97s/it]score1 tensor([[0.4414],
-        [0.8008],
-        [0.4629],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3984, 0.4961, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:15,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 13:26:15,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.95 | bwd_microstep: 4617.88 | bwd_inner_microstep: 4613.00 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.63
-[2025-01-25 13:26:15,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4617.90 | bwd_inner: 4613.00 | bwd_allreduce: 4.85 | step: 43.66
- 28%|██▊       | 1641/5800 [4:39:45<8:01:17,  6.94s/it]                                                       {'loss': 0.1348, 'grad_norm': 11.673755645751953, 'learning_rate': 3.3656385044912754e-05, 'epoch': 14.15}
- 28%|██▊       | 1641/5800 [4:39:45<8:01:17,  6.94s/it]score1 tensor([[0.6523],
-        [0.4609],
-        [0.4238],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5273, 0.4648, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:22,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:26:22,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.80 | bwd_microstep: 4628.25 | bwd_inner_microstep: 4622.41 | bwd_allreduce_microstep: 5.73 | step_microstep: 35.67
-[2025-01-25 13:26:22,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.76 | bwd: 4628.28 | bwd_inner: 4622.41 | bwd_allreduce: 5.78 | step: 35.66
- 28%|██▊       | 1642/5800 [4:39:52<8:00:14,  6.93s/it]                                                       {'loss': 0.082, 'grad_norm': 154.2752685546875, 'learning_rate': 3.3648223627755427e-05, 'epoch': 14.16}
- 28%|██▊       | 1642/5800 [4:39:52<8:00:14,  6.93s/it]score1 tensor([[2.2188],
-        [0.4766],
-        [0.5156],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4395, 0.5469, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.4473, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:29,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 13:26:29,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.38 | bwd_microstep: 4628.37 | bwd_inner_microstep: 4622.82 | bwd_allreduce_microstep: 5.43 | step_microstep: 48.38
-[2025-01-25 13:26:29,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.36 | bwd: 4628.39 | bwd_inner: 4622.82 | bwd_allreduce: 5.49 | step: 48.39
- 28%|██▊       | 1643/5800 [4:39:59<7:59:43,  6.92s/it]                                                       {'loss': 0.4473, 'grad_norm': 8.42727279663086, 'learning_rate': 3.364005795484516e-05, 'epoch': 14.16}
- 28%|██▊       | 1643/5800 [4:39:59<7:59:43,  6.92s/it]score1 tensor([[0.5938],
-        [1.4375],
-        [0.4688],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4785, 0.4902, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2539, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:36,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 13:26:36,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.13 | bwd_microstep: 4629.92 | bwd_inner_microstep: 4624.42 | bwd_allreduce_microstep: 5.39 | step_microstep: 47.40
-[2025-01-25 13:26:36,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.09 | bwd: 4629.95 | bwd_inner: 4624.42 | bwd_allreduce: 5.46 | step: 47.41
- 28%|██▊       | 1644/5800 [4:40:06<7:59:23,  6.92s/it]                                                       {'loss': 0.2539, 'grad_norm': 5.039490699768066, 'learning_rate': 3.3631888028728145e-05, 'epoch': 14.17}
- 28%|██▊       | 1644/5800 [4:40:06<7:59:23,  6.92s/it]score1 tensor([[0.4590],
-        [0.6055],
-        [0.6250],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.6055, 0.6445, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:43,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 13:26:43,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.34 | bwd_microstep: 4569.67 | bwd_inner_microstep: 4564.50 | bwd_allreduce_microstep: 5.07 | step_microstep: 43.73
-[2025-01-25 13:26:43,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.29 | bwd: 4569.70 | bwd_inner: 4564.50 | bwd_allreduce: 5.13 | step: 43.74
- 28%|██▊       | 1645/5800 [4:40:13<7:57:49,  6.90s/it]                                                       {'loss': 0.0107, 'grad_norm': 4.1183295249938965, 'learning_rate': 3.3623713851951915e-05, 'epoch': 14.18}
- 28%|██▊       | 1645/5800 [4:40:13<7:57:49,  6.90s/it]score1 tensor([[0.6172],
-        [0.4883],
-        [0.4922],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4707, 0.4805, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:50,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.82 | optimizer_step: 4.36
-[2025-01-25 13:26:50,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.53 | bwd_microstep: 4634.94 | bwd_inner_microstep: 4629.26 | bwd_allreduce_microstep: 5.45 | step_microstep: 54.11
-[2025-01-25 13:26:50,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.50 | bwd: 4635.01 | bwd_inner: 4629.26 | bwd_allreduce: 5.56 | step: 54.16
- 28%|██▊       | 1646/5800 [4:40:20<7:58:27,  6.91s/it]                                                       {'loss': 0.0259, 'grad_norm': 3.670391321182251, 'learning_rate': 3.361553542706531e-05, 'epoch': 14.19}
- 28%|██▊       | 1646/5800 [4:40:20<7:58:27,  6.91s/it]score1 tensor([[0.4180],
-        [0.5000],
-        [0.3926],
-        [0.7539]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.4629, 0.4082, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0718, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:26:57,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.83 | optimizer_step: 4.85
-[2025-01-25 13:26:57,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.22 | bwd_microstep: 4631.46 | bwd_inner_microstep: 4627.17 | bwd_allreduce_microstep: 4.22 | step_microstep: 58.06
-[2025-01-25 13:26:57,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4631.48 | bwd_inner: 4627.17 | bwd_allreduce: 4.26 | step: 58.08
- 28%|██▊       | 1647/5800 [4:40:27<7:59:25,  6.93s/it]                                                       {'loss': 0.0718, 'grad_norm': 6.896650314331055, 'learning_rate': 3.360735275661851e-05, 'epoch': 14.2}
- 28%|██▊       | 1647/5800 [4:40:27<7:59:25,  6.93s/it]score1 tensor([[0.6055],
-        [0.4844],
-        [0.5547],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4844, 0.5664, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:04,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.89 | optimizer_step: 4.59
-[2025-01-25 13:27:04,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2175.81 | bwd_microstep: 4583.57 | bwd_inner_microstep: 4573.71 | bwd_allreduce_microstep: 9.65 | step_microstep: 91.28
-[2025-01-25 13:27:04,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2175.71 | bwd: 4583.63 | bwd_inner: 4573.71 | bwd_allreduce: 9.75 | step: 91.29
- 28%|██▊       | 1648/5800 [4:40:34<8:00:07,  6.94s/it]                                                       {'loss': 0.0205, 'grad_norm': 2.536435127258301, 'learning_rate': 3.359916584316301e-05, 'epoch': 14.21}
- 28%|██▊       | 1648/5800 [4:40:34<8:00:07,  6.94s/it]score1 tensor([[0.4902],
-        [0.5117],
-        [0.6016],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4531, 0.6094, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:11,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 13:27:11,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.47 | bwd_microstep: 4635.14 | bwd_inner_microstep: 4630.01 | bwd_allreduce_microstep: 5.04 | step_microstep: 42.16
-[2025-01-25 13:27:11,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.38 | bwd: 4635.16 | bwd_inner: 4630.01 | bwd_allreduce: 5.08 | step: 42.16
- 28%|██▊       | 1649/5800 [4:40:41<7:59:54,  6.94s/it]                                                       {'loss': 0.0356, 'grad_norm': 3.6626007556915283, 'learning_rate': 3.3590974689251634e-05, 'epoch': 14.22}
- 28%|██▊       | 1649/5800 [4:40:41<7:59:54,  6.94s/it]score1 tensor([[0.6328],
-        [0.4414],
-        [0.4961],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4199, 0.4766, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:18,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 13:27:18,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.24 | bwd_microstep: 4634.29 | bwd_inner_microstep: 4628.59 | bwd_allreduce_microstep: 5.54 | step_microstep: 48.14
-[2025-01-25 13:27:18,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.21 | bwd: 4634.31 | bwd_inner: 4628.59 | bwd_allreduce: 5.60 | step: 48.16
- 28%|██▊       | 1650/5800 [4:40:48<7:59:44,  6.94s/it]                                                       {'loss': 0.0327, 'grad_norm': 0.6854590177536011, 'learning_rate': 3.358277929743853e-05, 'epoch': 14.22}
- 28%|██▊       | 1650/5800 [4:40:48<7:59:44,  6.94s/it]score1 tensor([[0.5117],
-        [0.4785],
-        [0.5039],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4941, 0.5312, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:25,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:27:25,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.08 | bwd_microstep: 4631.61 | bwd_inner_microstep: 4626.27 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.76
-[2025-01-25 13:27:25,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.02 | bwd: 4631.64 | bwd_inner: 4626.27 | bwd_allreduce: 5.29 | step: 45.77
- 28%|██▊       | 1651/5800 [4:40:55<7:59:18,  6.93s/it]                                                       {'loss': 0.0278, 'grad_norm': 0.6157549619674683, 'learning_rate': 3.357457967027915e-05, 'epoch': 14.23}
- 28%|██▊       | 1651/5800 [4:40:55<7:59:18,  6.93s/it]score1 tensor([[0.5859],
-        [0.5742],
-        [0.0000],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5117, 0.4609, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1504, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:31,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:27:31,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4580.69 | bwd_inner_microstep: 4575.46 | bwd_allreduce_microstep: 5.13 | step_microstep: 44.31
-[2025-01-25 13:27:31,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.78 | bwd: 4580.71 | bwd_inner: 4575.46 | bwd_allreduce: 5.18 | step: 44.31
- 28%|██▊       | 1652/5800 [4:41:01<7:57:42,  6.91s/it]                                                       {'loss': 0.1504, 'grad_norm': 2.0982213020324707, 'learning_rate': 3.3566375810330294e-05, 'epoch': 14.24}
- 28%|██▊       | 1652/5800 [4:41:01<7:57:42,  6.91s/it]score1 tensor([[0.5156],
-        [0.4043],
-        [0.4902],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4141, 0.5117, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:38,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 13:27:38,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.93 | bwd_microstep: 4623.57 | bwd_inner_microstep: 4618.04 | bwd_allreduce_microstep: 5.44 | step_microstep: 41.99
-[2025-01-25 13:27:38,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.90 | bwd: 4623.59 | bwd_inner: 4618.04 | bwd_allreduce: 5.48 | step: 42.00
- 28%|██▊       | 1653/5800 [4:41:08<7:57:34,  6.91s/it]                                                       {'loss': 0.0146, 'grad_norm': 5.64644718170166, 'learning_rate': 3.3558167720150064e-05, 'epoch': 14.25}
- 28%|██▊       | 1653/5800 [4:41:08<7:57:34,  6.91s/it]score1 tensor([[0.4922],
-        [0.4883],
-        [0.5078],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4473, 0.4629, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:45,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 13:27:45,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4625.26 | bwd_inner_microstep: 4619.70 | bwd_allreduce_microstep: 5.44 | step_microstep: 44.04
-[2025-01-25 13:27:45,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.07 | bwd: 4625.28 | bwd_inner: 4619.70 | bwd_allreduce: 5.51 | step: 44.04
- 29%|██▊       | 1654/5800 [4:41:15<7:57:22,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.8758697509765625, 'learning_rate': 3.354995540229789e-05, 'epoch': 14.26}
- 29%|██▊       | 1654/5800 [4:41:15<7:57:22,  6.91s/it]score1 tensor([[0.0000],
-        [0.4629],
-        [0.1245],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4395, 0.4297, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2041, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:52,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 13:27:52,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.88 | bwd_microstep: 4568.95 | bwd_inner_microstep: 4563.04 | bwd_allreduce_microstep: 5.73 | step_microstep: 46.48
-[2025-01-25 13:27:52,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.84 | bwd: 4568.98 | bwd_inner: 4563.04 | bwd_allreduce: 5.85 | step: 46.49
- 29%|██▊       | 1655/5800 [4:41:22<7:56:12,  6.89s/it]                                                       {'loss': 0.2041, 'grad_norm': 25.18466567993164, 'learning_rate': 3.3541738859334497e-05, 'epoch': 14.27}
- 29%|██▊       | 1655/5800 [4:41:22<7:56:12,  6.89s/it]score1 tensor([[0.6211],
-        [0.4746],
-        [0.6289],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4922, 0.5898, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:27:59,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.80 | optimizer_step: 4.36
-[2025-01-25 13:27:59,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.48 | bwd_microstep: 4635.50 | bwd_inner_microstep: 4629.12 | bwd_allreduce_microstep: 6.26 | step_microstep: 58.89
-[2025-01-25 13:27:59,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.44 | bwd: 4635.53 | bwd_inner: 4629.12 | bwd_allreduce: 6.34 | step: 58.93
- 29%|██▊       | 1656/5800 [4:41:29<7:57:39,  6.92s/it]                                                       {'loss': 0.021, 'grad_norm': 1.1103209257125854, 'learning_rate': 3.353351809382197e-05, 'epoch': 14.28}
- 29%|██▊       | 1656/5800 [4:41:29<7:57:39,  6.92s/it]score1 tensor([[0.4746],
-        [0.4316],
-        [0.6016],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4160, 0.6406, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:06,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:28:06,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.79 | bwd_microstep: 4633.59 | bwd_inner_microstep: 4625.64 | bwd_allreduce_microstep: 7.75 | step_microstep: 49.95
-[2025-01-25 13:28:06,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.75 | bwd: 4633.64 | bwd_inner: 4625.64 | bwd_allreduce: 7.86 | step: 49.92
- 29%|██▊       | 1657/5800 [4:41:36<7:58:54,  6.94s/it]                                                       {'loss': 0.0293, 'grad_norm': 4.058741092681885, 'learning_rate': 3.3525293108323666e-05, 'epoch': 14.28}
- 29%|██▊       | 1657/5800 [4:41:36<7:58:54,  6.94s/it]score1 tensor([[0.6016],
-        [0.4883],
-        [0.4375],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5430, 0.4551, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:13,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:28:13,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.27 | bwd_microstep: 4637.61 | bwd_inner_microstep: 4632.22 | bwd_allreduce_microstep: 5.27 | step_microstep: 44.65
-[2025-01-25 13:28:13,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.21 | bwd: 4637.64 | bwd_inner: 4632.22 | bwd_allreduce: 5.32 | step: 44.66
- 29%|██▊       | 1658/5800 [4:41:43<7:59:06,  6.94s/it]                                                       {'loss': 0.0347, 'grad_norm': 3.966977119445801, 'learning_rate': 3.3517063905404295e-05, 'epoch': 14.29}
- 29%|██▊       | 1658/5800 [4:41:43<7:59:06,  6.94s/it]score1 tensor([[0.4395],
-        [0.4023],
-        [0.0000],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.3809, 0.4277, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:20,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 13:28:20,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4576.99 | bwd_inner_microstep: 4569.44 | bwd_allreduce_microstep: 7.44 | step_microstep: 46.15
-[2025-01-25 13:28:20,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4577.03 | bwd_inner: 4569.44 | bwd_allreduce: 7.50 | step: 46.16
- 29%|██▊       | 1659/5800 [4:41:50<7:57:38,  6.92s/it]                                                       {'loss': 0.1211, 'grad_norm': 1.8684048652648926, 'learning_rate': 3.350883048762985e-05, 'epoch': 14.3}
- 29%|██▊       | 1659/5800 [4:41:50<7:57:38,  6.92s/it]score1 tensor([[0.5156],
-        [0.4258],
-        [0.5039],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4512, 0.5078, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:27,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 13:28:27,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.29 | bwd_microstep: 4621.83 | bwd_inner_microstep: 4617.04 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.79
-[2025-01-25 13:28:27,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.25 | bwd: 4621.85 | bwd_inner: 4617.04 | bwd_allreduce: 4.73 | step: 42.79
- 29%|██▊       | 1660/5800 [4:41:57<7:57:03,  6.91s/it]                                                       {'loss': 0.0239, 'grad_norm': 3.5324838161468506, 'learning_rate': 3.350059285756766e-05, 'epoch': 14.31}
- 29%|██▊       | 1660/5800 [4:41:57<7:57:03,  6.91s/it]score1 tensor([[0.3750],
-        [0.1260],
-        [0.4512],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.1787, 0.4180, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:34,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:28:34,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.32 | bwd_microstep: 4625.16 | bwd_inner_microstep: 4620.42 | bwd_allreduce_microstep: 4.64 | step_microstep: 40.94
-[2025-01-25 13:28:34,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.28 | bwd: 4625.19 | bwd_inner: 4620.42 | bwd_allreduce: 4.69 | step: 40.94
- 29%|██▊       | 1661/5800 [4:42:04<7:56:35,  6.91s/it]                                                       {'loss': 0.0249, 'grad_norm': 2.0637292861938477, 'learning_rate': 3.349235101778637e-05, 'epoch': 14.32}
- 29%|██▊       | 1661/5800 [4:42:04<7:56:35,  6.91s/it]score1 tensor([[0.0000],
-        [0.4961],
-        [0.4668],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.5156, 0.4004, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1660, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:40,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 13:28:40,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.44 | bwd_microstep: 4572.08 | bwd_inner_microstep: 4566.76 | bwd_allreduce_microstep: 5.22 | step_microstep: 47.12
-[2025-01-25 13:28:40,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.41 | bwd: 4572.10 | bwd_inner: 4566.76 | bwd_allreduce: 5.27 | step: 47.13
- 29%|██▊       | 1662/5800 [4:42:10<7:55:13,  6.89s/it]                                                       {'loss': 0.166, 'grad_norm': 1.787499189376831, 'learning_rate': 3.348410497085591e-05, 'epoch': 14.33}
- 29%|██▊       | 1662/5800 [4:42:10<7:55:13,  6.89s/it]score1 tensor([[0.5430],
-        [0.4102],
-        [0.4727],
-        [0.0000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3730, 0.5039, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1758, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:47,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 13:28:47,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.24 | bwd_microstep: 4579.65 | bwd_inner_microstep: 4574.73 | bwd_allreduce_microstep: 4.84 | step_microstep: 40.67
-[2025-01-25 13:28:47,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.20 | bwd: 4579.68 | bwd_inner: 4574.72 | bwd_allreduce: 4.88 | step: 40.68
- 29%|██▊       | 1663/5800 [4:42:17<7:54:26,  6.88s/it]                                                       {'loss': 0.1758, 'grad_norm': 2.099141836166382, 'learning_rate': 3.347585471934756e-05, 'epoch': 14.34}
- 29%|██▊       | 1663/5800 [4:42:17<7:54:26,  6.88s/it]score1 tensor([[0.4883],
-        [0.4883],
-        [0.5156],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4551, 0.4844, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:28:54,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 13:28:54,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.98 | bwd_microstep: 4628.09 | bwd_inner_microstep: 4623.37 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.19
-[2025-01-25 13:28:54,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4628.11 | bwd_inner: 4623.37 | bwd_allreduce: 4.68 | step: 41.19
- 29%|██▊       | 1664/5800 [4:42:24<7:54:53,  6.89s/it]                                                       {'loss': 0.0288, 'grad_norm': 3.734694480895996, 'learning_rate': 3.346760026583387e-05, 'epoch': 14.34}
- 29%|██▊       | 1664/5800 [4:42:24<7:54:53,  6.89s/it]score1 tensor([[0.4551],
-        [0.4336],
-        [0.5586],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4492, 0.5391, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:01,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:29:01,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.69 | bwd_microstep: 4621.05 | bwd_inner_microstep: 4615.08 | bwd_allreduce_microstep: 5.76 | step_microstep: 51.20
-[2025-01-25 13:29:01,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.64 | bwd: 4621.11 | bwd_inner: 4615.08 | bwd_allreduce: 5.87 | step: 51.23
- 29%|██▊       | 1665/5800 [4:42:31<7:55:21,  6.90s/it]                                                       {'loss': 0.02, 'grad_norm': 0.5050510168075562, 'learning_rate': 3.345934161288874e-05, 'epoch': 14.35}
- 29%|██▊       | 1665/5800 [4:42:31<7:55:21,  6.90s/it]score1 tensor([[0.4219],
-        [0.5117],
-        [0.5234],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5625, 0.5508, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:08,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 13:29:08,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.37 | bwd_microstep: 4627.16 | bwd_inner_microstep: 4622.57 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.41
-[2025-01-25 13:29:08,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.33 | bwd: 4627.19 | bwd_inner: 4622.57 | bwd_allreduce: 4.54 | step: 42.42
- 29%|██▊       | 1666/5800 [4:42:38<7:55:37,  6.90s/it]                                                       {'loss': 0.0288, 'grad_norm': 7.108630180358887, 'learning_rate': 3.3451078763087356e-05, 'epoch': 14.36}
- 29%|██▊       | 1666/5800 [4:42:38<7:55:37,  6.90s/it]score1 tensor([[0.6133],
-        [0.4023],
-        [0.5156],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3887, 0.5039, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:15,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 13:29:15,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.61 | bwd_microstep: 4622.57 | bwd_inner_microstep: 4618.00 | bwd_allreduce_microstep: 4.45 | step_microstep: 55.83
-[2025-01-25 13:29:15,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.56 | bwd: 4622.59 | bwd_inner: 4618.00 | bwd_allreduce: 4.52 | step: 55.84
- 29%|██▊       | 1667/5800 [4:42:45<7:56:09,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 0.5739309191703796, 'learning_rate': 3.3442811719006216e-05, 'epoch': 14.37}
- 29%|██▊       | 1667/5800 [4:42:45<7:56:09,  6.91s/it]score1 tensor([[0.4941],
-        [0.0000],
-        [0.4922],
-        [0.2539]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4961, 0.5156, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2031, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:22,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 13:29:22,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.65 | bwd_microstep: 4571.05 | bwd_inner_microstep: 4566.31 | bwd_allreduce_microstep: 4.66 | step_microstep: 44.31
-[2025-01-25 13:29:22,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.62 | bwd: 4571.07 | bwd_inner: 4566.31 | bwd_allreduce: 4.69 | step: 44.32
- 29%|██▉       | 1668/5800 [4:42:52<7:54:37,  6.89s/it]                                                       {'loss': 0.2031, 'grad_norm': 14.44214153289795, 'learning_rate': 3.343454048322313e-05, 'epoch': 14.38}
- 29%|██▉       | 1668/5800 [4:42:52<7:54:37,  6.89s/it]score1 tensor([[0.6172],
-        [0.4180],
-        [0.5039],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.3906, 0.4941, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:29,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 13:29:29,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4625.71 | bwd_inner_microstep: 4620.43 | bwd_allreduce_microstep: 5.20 | step_microstep: 41.08
-[2025-01-25 13:29:29,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4625.74 | bwd_inner: 4620.43 | bwd_allreduce: 5.23 | step: 41.09
- 29%|██▉       | 1669/5800 [4:42:59<7:54:58,  6.90s/it]                                                       {'loss': 0.0112, 'grad_norm': 7.520633220672607, 'learning_rate': 3.342626505831721e-05, 'epoch': 14.39}
- 29%|██▉       | 1669/5800 [4:42:59<7:54:58,  6.90s/it]score1 tensor([[0.5391],
-        [0.4668],
-        [0.5039],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4512, 0.5430, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:36,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.91 | optimizer_step: 4.36
-[2025-01-25 13:29:36,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.18 | bwd_microstep: 4629.02 | bwd_inner_microstep: 4621.85 | bwd_allreduce_microstep: 6.96 | step_microstep: 84.49
-[2025-01-25 13:29:36,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.15 | bwd: 4629.08 | bwd_inner: 4621.85 | bwd_allreduce: 7.06 | step: 84.51
- 29%|██▉       | 1670/5800 [4:43:06<7:56:16,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 4.095832824707031, 'learning_rate': 3.3417985446868884e-05, 'epoch': 14.4}
- 29%|██▉       | 1670/5800 [4:43:06<7:56:16,  6.92s/it]score1 tensor([[0.3906],
-        [0.6719],
-        [0.5039],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.6875, 0.4844, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:43,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 13:29:43,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.13 | bwd_microstep: 4617.05 | bwd_inner_microstep: 4612.06 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.94
-[2025-01-25 13:29:43,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4617.07 | bwd_inner: 4612.06 | bwd_allreduce: 4.94 | step: 42.95
- 29%|██▉       | 1671/5800 [4:43:13<7:55:45,  6.91s/it]                                                       {'loss': 0.0249, 'grad_norm': 3.2064807415008545, 'learning_rate': 3.3409701651459866e-05, 'epoch': 14.41}
- 29%|██▉       | 1671/5800 [4:43:13<7:55:45,  6.91s/it]score1 tensor([[0.4824],
-        [0.4551],
-        [0.4844],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4707, 0.4648, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:50,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 13:29:50,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.21 | bwd_microstep: 4616.12 | bwd_inner_microstep: 4611.37 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.93
-[2025-01-25 13:29:50,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.17 | bwd: 4616.14 | bwd_inner: 4611.37 | bwd_allreduce: 4.70 | step: 41.94
- 29%|██▉       | 1672/5800 [4:43:20<7:55:01,  6.90s/it]                                                       {'loss': 0.0264, 'grad_norm': 0.4515450894832611, 'learning_rate': 3.34014136746732e-05, 'epoch': 14.41}
- 29%|██▉       | 1672/5800 [4:43:20<7:55:01,  6.90s/it]score1 tensor([[0.6602],
-        [0.4062],
-        [0.5273],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.4043, 0.5820, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:29:56,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 13:29:56,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.32 | bwd_microstep: 4628.50 | bwd_inner_microstep: 4623.80 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.51
-[2025-01-25 13:29:56,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.28 | bwd: 4628.52 | bwd_inner: 4623.80 | bwd_allreduce: 4.66 | step: 42.52
- 29%|██▉       | 1673/5800 [4:43:26<7:55:04,  6.91s/it]                                                       {'loss': 0.02, 'grad_norm': 4.468202590942383, 'learning_rate': 3.339312151909321e-05, 'epoch': 14.42}
- 29%|██▉       | 1673/5800 [4:43:26<7:55:04,  6.91s/it]score1 tensor([[0.5000],
-        [0.4492],
-        [0.5742],
-        [0.0000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4961, 0.6172, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1758, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:03,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 13:30:03,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.72 | bwd_microstep: 4585.28 | bwd_inner_microstep: 4580.12 | bwd_allreduce_microstep: 5.07 | step_microstep: 43.13
-[2025-01-25 13:30:03,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.69 | bwd: 4585.31 | bwd_inner: 4580.12 | bwd_allreduce: 5.12 | step: 43.13
- 29%|██▉       | 1674/5800 [4:43:33<7:54:08,  6.90s/it]                                                       {'loss': 0.1758, 'grad_norm': 1.9395629167556763, 'learning_rate': 3.338482518730555e-05, 'epoch': 14.43}
- 29%|██▉       | 1674/5800 [4:43:33<7:54:08,  6.90s/it]score1 tensor([[0.6328],
-        [0.4668],
-        [0.5156],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4238, 0.5234, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:10,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 13:30:10,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.56 | bwd_microstep: 4624.91 | bwd_inner_microstep: 4619.84 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.62
-[2025-01-25 13:30:10,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.53 | bwd: 4624.94 | bwd_inner: 4619.84 | bwd_allreduce: 5.02 | step: 44.61
- 29%|██▉       | 1675/5800 [4:43:40<7:54:06,  6.90s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.5352509021759033, 'learning_rate': 3.337652468189714e-05, 'epoch': 14.44}
- 29%|██▉       | 1675/5800 [4:43:40<7:54:06,  6.90s/it]score1 tensor([[0.3848],
-        [0.4180],
-        [0.4688],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.3789, 0.4609, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:17,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 13:30:17,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.20 | bwd_microstep: 4625.38 | bwd_inner_microstep: 4620.76 | bwd_allreduce_microstep: 4.52 | step_microstep: 43.25
-[2025-01-25 13:30:17,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.17 | bwd: 4625.41 | bwd_inner: 4620.76 | bwd_allreduce: 4.57 | step: 43.25
- 29%|██▉       | 1676/5800 [4:43:47<7:54:12,  6.90s/it]                                                       {'loss': 0.0259, 'grad_norm': 0.8996525406837463, 'learning_rate': 3.336822000545623e-05, 'epoch': 14.45}
- 29%|██▉       | 1676/5800 [4:43:47<7:54:12,  6.90s/it]score1 tensor([[0.6953],
-        [0.5469],
-        [0.4727],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5391, 0.4609, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:24,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 13:30:24,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.11 | bwd_microstep: 4624.51 | bwd_inner_microstep: 4617.13 | bwd_allreduce_microstep: 7.27 | step_microstep: 49.68
-[2025-01-25 13:30:24,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4624.57 | bwd_inner: 4617.13 | bwd_allreduce: 7.32 | step: 49.66
- 29%|██▉       | 1677/5800 [4:43:54<7:54:54,  6.91s/it]                                                       {'loss': 0.0283, 'grad_norm': 7.9356489181518555, 'learning_rate': 3.335991116057237e-05, 'epoch': 14.46}
- 29%|██▉       | 1677/5800 [4:43:54<7:54:54,  6.91s/it]score1 tensor([[0.6367],
-        [0.4355],
-        [0.5039],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4707, 0.5273, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:31,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.28 | optimizer_step: 4.36
-[2025-01-25 13:30:31,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.30 | bwd_microstep: 4634.23 | bwd_inner_microstep: 4626.25 | bwd_allreduce_microstep: 7.72 | step_microstep: 71.69
-[2025-01-25 13:30:31,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.27 | bwd: 4634.29 | bwd_inner: 4626.25 | bwd_allreduce: 7.85 | step: 71.70
- 29%|██▉       | 1678/5800 [4:44:01<7:55:57,  6.93s/it]                                                       {'loss': 0.0361, 'grad_norm': 3.3242976665496826, 'learning_rate': 3.335159814983639e-05, 'epoch': 14.47}
- 29%|██▉       | 1678/5800 [4:44:01<7:55:57,  6.93s/it]score1 tensor([[0.5195],
-        [0.5820],
-        [0.5273],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5391, 0.4434, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:38,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 13:30:38,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.37 | bwd_microstep: 4620.01 | bwd_inner_microstep: 4615.03 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.51
-[2025-01-25 13:30:38,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.33 | bwd: 4620.03 | bwd_inner: 4615.03 | bwd_allreduce: 4.93 | step: 44.52
- 29%|██▉       | 1679/5800 [4:44:08<7:55:20,  6.92s/it]                                                       {'loss': 0.0425, 'grad_norm': 1.1397336721420288, 'learning_rate': 3.334328097584044e-05, 'epoch': 14.47}
- 29%|██▉       | 1679/5800 [4:44:08<7:55:20,  6.92s/it]score1 tensor([[0.4707],
-        [0.5234],
-        [0.5508],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.3262, 0.5430, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:45,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 13:30:45,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.94 | bwd_microstep: 4633.44 | bwd_inner_microstep: 4626.76 | bwd_allreduce_microstep: 6.60 | step_microstep: 49.63
-[2025-01-25 13:30:45,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.91 | bwd: 4633.46 | bwd_inner: 4626.76 | bwd_allreduce: 6.64 | step: 49.65
- 29%|██▉       | 1680/5800 [4:44:15<7:54:58,  6.92s/it]                                                       {'loss': 0.0664, 'grad_norm': 4.084677219390869, 'learning_rate': 3.333495964117796e-05, 'epoch': 14.48}
- 29%|██▉       | 1680/5800 [4:44:15<7:54:58,  6.92s/it]score1 tensor([[0.6289],
-        [0.5586],
-        [0.4902],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5430, 0.4668, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:52,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 13:30:52,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4620.57 | bwd_inner_microstep: 4615.90 | bwd_allreduce_microstep: 4.60 | step_microstep: 41.50
-[2025-01-25 13:30:52,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.65 | bwd: 4620.60 | bwd_inner: 4615.90 | bwd_allreduce: 4.63 | step: 41.51
- 29%|██▉       | 1681/5800 [4:44:22<7:55:15,  6.92s/it]                                                       {'loss': 0.0425, 'grad_norm': 7.926888942718506, 'learning_rate': 3.3326634148443676e-05, 'epoch': 14.49}
- 29%|██▉       | 1681/5800 [4:44:22<7:55:15,  6.92s/it]score1 tensor([[0.3809],
-        [0.4336],
-        [0.4121],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.4531, 0.4160, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:30:59,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 13:30:59,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.30 | bwd_microstep: 4622.06 | bwd_inner_microstep: 4617.52 | bwd_allreduce_microstep: 4.46 | step_microstep: 47.23
-[2025-01-25 13:30:59,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.25 | bwd: 4622.09 | bwd_inner: 4617.52 | bwd_allreduce: 4.50 | step: 47.24
- 29%|██▉       | 1682/5800 [4:44:29<7:54:46,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.003078937530518, 'learning_rate': 3.331830450023362e-05, 'epoch': 14.5}
- 29%|██▉       | 1682/5800 [4:44:29<7:54:46,  6.92s/it]score1 tensor([[0.6016],
-        [0.4648],
-        [0.4902],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4141, 0.4473, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:06,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 13:31:06,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.32 | bwd_microstep: 4634.40 | bwd_inner_microstep: 4629.33 | bwd_allreduce_microstep: 4.92 | step_microstep: 47.65
-[2025-01-25 13:31:06,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.29 | bwd: 4634.43 | bwd_inner: 4629.33 | bwd_allreduce: 5.00 | step: 47.65
- 29%|██▉       | 1683/5800 [4:44:36<7:54:43,  6.92s/it]                                                       {'loss': 0.0332, 'grad_norm': 7.738142967224121, 'learning_rate': 3.3309970699145144e-05, 'epoch': 14.51}
- 29%|██▉       | 1683/5800 [4:44:36<7:54:43,  6.92s/it]score1 tensor([[0.5703],
-        [0.5352],
-        [0.5039],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5469, 0.5430, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:12,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 13:31:12,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.70 | bwd_microstep: 4623.78 | bwd_inner_microstep: 4619.13 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.99
-[2025-01-25 13:31:12,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.67 | bwd: 4623.80 | bwd_inner: 4619.13 | bwd_allreduce: 4.60 | step: 41.99
- 29%|██▉       | 1684/5800 [4:44:42<7:54:15,  6.91s/it]                                                       {'loss': 0.0342, 'grad_norm': 4.269383907318115, 'learning_rate': 3.330163274777685e-05, 'epoch': 14.52}
- 29%|██▉       | 1684/5800 [4:44:42<7:54:15,  6.91s/it]score1 tensor([[0.6016],
-        [0.5195],
-        [0.6289],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4902, 0.6094, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:19,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 13:31:19,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.75 | bwd_microstep: 4623.46 | bwd_inner_microstep: 4618.92 | bwd_allreduce_microstep: 4.45 | step_microstep: 42.47
-[2025-01-25 13:31:19,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4623.51 | bwd_inner: 4618.92 | bwd_allreduce: 4.49 | step: 42.48
- 29%|██▉       | 1685/5800 [4:44:49<7:53:52,  6.91s/it]                                                       {'loss': 0.0356, 'grad_norm': 8.323565483093262, 'learning_rate': 3.329329064872866e-05, 'epoch': 14.53}
- 29%|██▉       | 1685/5800 [4:44:49<7:53:52,  6.91s/it]score1 tensor([[0.5234],
-        [0.4805],
-        [0.6328],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4160, 0.5781, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:26,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.58 | optimizer_step: 4.37
-[2025-01-25 13:31:26,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.77 | bwd_microstep: 4631.25 | bwd_inner_microstep: 4626.04 | bwd_allreduce_microstep: 5.04 | step_microstep: 60.24
-[2025-01-25 13:31:26,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.74 | bwd: 4631.30 | bwd_inner: 4626.04 | bwd_allreduce: 5.12 | step: 60.23
- 29%|██▉       | 1686/5800 [4:44:56<7:54:10,  6.92s/it]                                                       {'loss': 0.0337, 'grad_norm': 8.139405250549316, 'learning_rate': 3.328494440460178e-05, 'epoch': 14.53}
- 29%|██▉       | 1686/5800 [4:44:56<7:54:10,  6.92s/it]score1 tensor([[0.6289],
-        [0.5000],
-        [0.6172],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4688, 0.6328, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:33,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.84 | optimizer_step: 4.36
-[2025-01-25 13:31:33,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.78 | bwd_microstep: 4624.16 | bwd_inner_microstep: 4616.40 | bwd_allreduce_microstep: 7.55 | step_microstep: 86.04
-[2025-01-25 13:31:33,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.75 | bwd: 4624.21 | bwd_inner: 4616.40 | bwd_allreduce: 7.65 | step: 86.06
- 29%|██▉       | 1687/5800 [4:45:03<7:54:59,  6.93s/it]                                                       {'loss': 0.0356, 'grad_norm': 0.5733634829521179, 'learning_rate': 3.3276594017998735e-05, 'epoch': 14.54}
- 29%|██▉       | 1687/5800 [4:45:03<7:54:59,  6.93s/it]score1 tensor([[0.4883],
-        [0.0000],
-        [0.4746],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4688, 0.3730, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1748, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:40,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.41 | optimizer_step: 4.37
-[2025-01-25 13:31:40,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.10 | bwd_microstep: 4583.81 | bwd_inner_microstep: 4578.52 | bwd_allreduce_microstep: 5.22 | step_microstep: 56.10
-[2025-01-25 13:31:40,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.06 | bwd: 4583.84 | bwd_inner: 4578.52 | bwd_allreduce: 5.26 | step: 56.14
- 29%|██▉       | 1688/5800 [4:45:10<7:54:09,  6.92s/it]                                                       {'loss': 0.1748, 'grad_norm': 2.2268149852752686, 'learning_rate': 3.326823949152329e-05, 'epoch': 14.55}
- 29%|██▉       | 1688/5800 [4:45:10<7:54:09,  6.92s/it]score1 tensor([[0.5117],
-        [0.3301],
-        [0.5703],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5781, 0.5664, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:47,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 13:31:47,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.80 | bwd_microstep: 4629.11 | bwd_inner_microstep: 4624.44 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.64
-[2025-01-25 13:31:47,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.77 | bwd: 4629.13 | bwd_inner: 4624.44 | bwd_allreduce: 4.62 | step: 41.65
- 29%|██▉       | 1689/5800 [4:45:17<7:53:56,  6.92s/it]                                                       {'loss': 0.0737, 'grad_norm': 5.509603500366211, 'learning_rate': 3.325988082778056e-05, 'epoch': 14.56}
- 29%|██▉       | 1689/5800 [4:45:17<7:53:56,  6.92s/it]score1 tensor([[0.4531],
-        [0.4707],
-        [0.4023],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4023, 0.4414, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:31:54,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 13:31:54,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.55 | bwd_microstep: 4620.61 | bwd_inner_microstep: 4615.44 | bwd_allreduce_microstep: 5.06 | step_microstep: 46.62
-[2025-01-25 13:31:54,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.52 | bwd: 4620.63 | bwd_inner: 4615.44 | bwd_allreduce: 5.12 | step: 46.64
- 29%|██▉       | 1690/5800 [4:45:24<7:53:23,  6.91s/it]                                                       {'loss': 0.0474, 'grad_norm': 10.403751373291016, 'learning_rate': 3.3251518029376906e-05, 'epoch': 14.57}
- 29%|██▉       | 1690/5800 [4:45:24<7:53:23,  6.91s/it]score1 tensor([[0.5234],
-        [0.6562],
-        [0.4512],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.7031, 0.4609, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:01,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.66 | optimizer_step: 4.36
-[2025-01-25 13:32:01,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.12 | bwd_microstep: 4626.83 | bwd_inner_microstep: 4621.95 | bwd_allreduce_microstep: 4.79 | step_microstep: 48.86
-[2025-01-25 13:32:01,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.06 | bwd: 4626.85 | bwd_inner: 4621.95 | bwd_allreduce: 4.83 | step: 48.87
- 29%|██▉       | 1691/5800 [4:45:31<7:53:10,  6.91s/it]                                                       {'loss': 0.0273, 'grad_norm': 7.790441513061523, 'learning_rate': 3.3243151098920004e-05, 'epoch': 14.58}
- 29%|██▉       | 1691/5800 [4:45:31<7:53:10,  6.91s/it]score1 tensor([[0.5156],
-        [0.5195],
-        [0.6172],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6016, 0.6133, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0854, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:08,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 13:32:08,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.09 | bwd_microstep: 4627.41 | bwd_inner_microstep: 4622.36 | bwd_allreduce_microstep: 4.96 | step_microstep: 49.02
-[2025-01-25 13:32:08,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.05 | bwd: 4627.43 | bwd_inner: 4622.36 | bwd_allreduce: 5.01 | step: 49.03
- 29%|██▉       | 1692/5800 [4:45:38<7:53:03,  6.91s/it]                                                       {'loss': 0.0854, 'grad_norm': 38.62544631958008, 'learning_rate': 3.323478003901879e-05, 'epoch': 14.59}
- 29%|██▉       | 1692/5800 [4:45:38<7:53:03,  6.91s/it]score1 tensor([[0.3945],
-        [0.6367],
-        [0.4883],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.6133, 0.5039, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:15,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 13:32:15,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.88 | bwd_microstep: 4623.08 | bwd_inner_microstep: 4618.03 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.81
-[2025-01-25 13:32:15,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.85 | bwd: 4623.10 | bwd_inner: 4618.03 | bwd_allreduce: 5.01 | step: 42.80
- 29%|██▉       | 1693/5800 [4:45:45<7:52:42,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 0.7705693244934082, 'learning_rate': 3.322640485228352e-05, 'epoch': 14.59}
- 29%|██▉       | 1693/5800 [4:45:45<7:52:42,  6.91s/it]score1 tensor([[0.5859],
-        [0.4492],
-        [0.5156],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4551, 0.5117, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:22,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 13:32:22,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.94 | bwd_microstep: 4634.77 | bwd_inner_microstep: 4629.30 | bwd_allreduce_microstep: 5.39 | step_microstep: 44.88
-[2025-01-25 13:32:22,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.90 | bwd: 4634.80 | bwd_inner: 4629.30 | bwd_allreduce: 5.43 | step: 44.89
- 29%|██▉       | 1694/5800 [4:45:52<7:52:54,  6.91s/it]                                                       {'loss': 0.0298, 'grad_norm': 1.20513916015625, 'learning_rate': 3.321802554132572e-05, 'epoch': 14.6}
- 29%|██▉       | 1694/5800 [4:45:52<7:52:54,  6.91s/it]score1 tensor([[0.5430],
-        [0.6445],
-        [0.5000],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6367, 0.5195, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:29,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 13:32:29,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.69 | bwd_microstep: 4623.30 | bwd_inner_microstep: 4618.41 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.20
-[2025-01-25 13:32:29,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.65 | bwd: 4623.33 | bwd_inner: 4618.41 | bwd_allreduce: 4.84 | step: 41.21
- 29%|██▉       | 1695/5800 [4:45:58<7:52:32,  6.91s/it]                                                       {'loss': 0.021, 'grad_norm': 0.6271003484725952, 'learning_rate': 3.32096421087582e-05, 'epoch': 14.61}
- 29%|██▉       | 1695/5800 [4:45:59<7:52:32,  6.91s/it]score1 tensor([[0.4941],
-        [0.7656],
-        [0.5938],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4180, 0.7070, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1260, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:35,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:32:35,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.27 | bwd_microstep: 4620.43 | bwd_inner_microstep: 4615.84 | bwd_allreduce_microstep: 4.50 | step_microstep: 52.09
-[2025-01-25 13:32:35,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.23 | bwd: 4620.45 | bwd_inner: 4615.84 | bwd_allreduce: 4.54 | step: 52.10
- 29%|██▉       | 1696/5800 [4:46:05<7:52:25,  6.91s/it]                                                       {'loss': 0.126, 'grad_norm': 7.35801362991333, 'learning_rate': 3.320125455719507e-05, 'epoch': 14.62}
- 29%|██▉       | 1696/5800 [4:46:05<7:52:25,  6.91s/it]score1 tensor([[0.5039],
-        [0.4746],
-        [0.4922],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4062, 0.4238, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:42,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:32:42,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.08 | bwd_microstep: 4625.20 | bwd_inner_microstep: 4620.27 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.18
-[2025-01-25 13:32:42,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.04 | bwd: 4625.22 | bwd_inner: 4620.27 | bwd_allreduce: 4.88 | step: 42.22
- 29%|██▉       | 1697/5800 [4:46:12<7:53:03,  6.92s/it]                                                       {'loss': 0.0386, 'grad_norm': 3.582728385925293, 'learning_rate': 3.31928628892517e-05, 'epoch': 14.63}
- 29%|██▉       | 1697/5800 [4:46:12<7:53:03,  6.92s/it]score1 tensor([[0.5664],
-        [0.5391],
-        [0.5430],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5391, 0.5703, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:49,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:32:49,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.85 | bwd_microstep: 4586.42 | bwd_inner_microstep: 4578.58 | bwd_allreduce_microstep: 7.57 | step_microstep: 57.54
-[2025-01-25 13:32:49,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.77 | bwd: 4586.49 | bwd_inner: 4578.58 | bwd_allreduce: 7.71 | step: 57.52
- 29%|██▉       | 1698/5800 [4:46:19<7:52:52,  6.92s/it]                                                       {'loss': 0.0239, 'grad_norm': 1.8570183515548706, 'learning_rate': 3.318446710754477e-05, 'epoch': 14.64}
- 29%|██▉       | 1698/5800 [4:46:19<7:52:52,  6.92s/it]score1 tensor([[0.4844],
-        [0.5078],
-        [0.4727],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4863, 0.4727, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:32:56,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 13:32:56,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.40 | bwd_microstep: 4579.19 | bwd_inner_microstep: 4574.63 | bwd_allreduce_microstep: 4.46 | step_microstep: 50.40
-[2025-01-25 13:32:56,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.36 | bwd: 4579.22 | bwd_inner: 4574.63 | bwd_allreduce: 4.51 | step: 50.39
- 29%|██▉       | 1699/5800 [4:46:26<7:51:39,  6.90s/it]                                                       {'loss': 0.0146, 'grad_norm': 1.7609020471572876, 'learning_rate': 3.317606721469222e-05, 'epoch': 14.65}
- 29%|██▉       | 1699/5800 [4:46:26<7:51:39,  6.90s/it]score1 tensor([[0.4668],
-        [0.4512],
-        [0.4980],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4277, 0.6055, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:03,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 13:33:03,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.75 | bwd_microstep: 4634.44 | bwd_inner_microstep: 4630.04 | bwd_allreduce_microstep: 4.32 | step_microstep: 43.99
-[2025-01-25 13:33:03,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.71 | bwd: 4634.46 | bwd_inner: 4630.04 | bwd_allreduce: 4.36 | step: 44.00
- 29%|██▉       | 1700/5800 [4:46:33<7:51:50,  6.90s/it]                                                       {'loss': 0.0508, 'grad_norm': 0.49610385298728943, 'learning_rate': 3.316766321331329e-05, 'epoch': 14.66}
- 29%|██▉       | 1700/5800 [4:46:33<7:51:50,  6.90s/it]score1 tensor([[0.0000],
-        [0.4980],
-        [0.5664],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5469, 0.6562, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:10,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 13:33:10,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.02 | bwd_microstep: 4581.13 | bwd_inner_microstep: 4576.37 | bwd_allreduce_microstep: 4.61 | step_microstep: 45.65
-[2025-01-25 13:33:10,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.99 | bwd: 4581.15 | bwd_inner: 4576.37 | bwd_allreduce: 4.67 | step: 45.66
- 29%|██▉       | 1701/5800 [4:46:40<7:50:52,  6.89s/it]                                                       {'loss': 0.1816, 'grad_norm': 2.137876033782959, 'learning_rate': 3.315925510602849e-05, 'epoch': 14.66}
- 29%|██▉       | 1701/5800 [4:46:40<7:50:52,  6.89s/it]score1 tensor([[0.3457],
-        [0.4414],
-        [0.5234],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.4863, 0.5586, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:17,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 13:33:17,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.29 | bwd_microstep: 4628.22 | bwd_inner_microstep: 4623.61 | bwd_allreduce_microstep: 4.50 | step_microstep: 41.28
-[2025-01-25 13:33:17,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.26 | bwd: 4628.26 | bwd_inner: 4623.61 | bwd_allreduce: 4.54 | step: 41.29
- 29%|██▉       | 1702/5800 [4:46:47<7:50:51,  6.89s/it]                                                       {'loss': 0.0483, 'grad_norm': 4.193480014801025, 'learning_rate': 3.3150842895459626e-05, 'epoch': 14.67}
- 29%|██▉       | 1702/5800 [4:46:47<7:50:51,  6.89s/it]score1 tensor([[0.4336],
-        [0.5391],
-        [0.5469],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5508, 0.6172, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:24,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 13:33:24,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.06 | bwd_microstep: 4636.03 | bwd_inner_microstep: 4630.98 | bwd_allreduce_microstep: 4.94 | step_microstep: 48.21
-[2025-01-25 13:33:24,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.02 | bwd: 4636.06 | bwd_inner: 4630.98 | bwd_allreduce: 5.00 | step: 48.22
- 29%|██▉       | 1703/5800 [4:46:54<7:51:03,  6.90s/it]                                                       {'loss': 0.0381, 'grad_norm': 0.6974416971206665, 'learning_rate': 3.314242658422974e-05, 'epoch': 14.68}
- 29%|██▉       | 1703/5800 [4:46:54<7:51:03,  6.90s/it]score1 tensor([[0.5000],
-        [0.6172],
-        [0.5625],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6484, 0.5625, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:31,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 13:33:31,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.41 | bwd_microstep: 4591.78 | bwd_inner_microstep: 4586.67 | bwd_allreduce_microstep: 4.99 | step_microstep: 44.16
-[2025-01-25 13:33:31,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.37 | bwd: 4591.81 | bwd_inner: 4586.67 | bwd_allreduce: 5.04 | step: 44.16
- 29%|██▉       | 1704/5800 [4:47:01<7:50:20,  6.89s/it]                                                       {'loss': 0.0161, 'grad_norm': 1.6719027757644653, 'learning_rate': 3.313400617496322e-05, 'epoch': 14.69}
- 29%|██▉       | 1704/5800 [4:47:01<7:50:20,  6.89s/it]score1 tensor([[0.5039],
-        [0.5039],
-        [0.5469],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4844, 0.5469, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:37,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:33:37,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.98 | bwd_microstep: 4577.07 | bwd_inner_microstep: 4572.33 | bwd_allreduce_microstep: 4.65 | step_microstep: 42.05
-[2025-01-25 13:33:37,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.95 | bwd: 4577.09 | bwd_inner: 4572.33 | bwd_allreduce: 4.69 | step: 42.05
- 29%|██▉       | 1705/5800 [4:47:07<7:49:25,  6.88s/it]                                                       {'loss': 0.04, 'grad_norm': 5.586382865905762, 'learning_rate': 3.312558167028568e-05, 'epoch': 14.7}
- 29%|██▉       | 1705/5800 [4:47:07<7:49:25,  6.88s/it]score1 tensor([[0.6406],
-        [0.5508],
-        [0.5195],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.5352, 0.5000, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:44,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.34 | optimizer_step: 4.36
-[2025-01-25 13:33:44,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.18 | bwd_microstep: 4625.09 | bwd_inner_microstep: 4618.77 | bwd_allreduce_microstep: 6.24 | step_microstep: 48.36
-[2025-01-25 13:33:44,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.15 | bwd: 4625.11 | bwd_inner: 4618.77 | bwd_allreduce: 6.28 | step: 48.37
- 29%|██▉       | 1706/5800 [4:47:14<7:50:07,  6.89s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.6690571308135986, 'learning_rate': 3.311715307282402e-05, 'epoch': 14.71}
- 29%|██▉       | 1706/5800 [4:47:14<7:50:07,  6.89s/it]score1 tensor([[0.4941],
-        [0.4473],
-        [0.5430],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4492, 0.6016, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:51,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 13:33:51,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.68 | bwd_microstep: 4629.36 | bwd_inner_microstep: 4622.51 | bwd_allreduce_microstep: 6.68 | step_microstep: 67.73
-[2025-01-25 13:33:51,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.64 | bwd: 4629.42 | bwd_inner: 4622.51 | bwd_allreduce: 6.77 | step: 67.73
- 29%|██▉       | 1707/5800 [4:47:21<7:51:03,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 7.349471569061279, 'learning_rate': 3.310872038520643e-05, 'epoch': 14.72}
- 29%|██▉       | 1707/5800 [4:47:21<7:51:03,  6.91s/it]score1 tensor([[0.4902],
-        [0.4629],
-        [0.4570],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4844, 0.4277, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:33:58,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 13:33:58,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4618.85 | bwd_inner_microstep: 4614.03 | bwd_allreduce_microstep: 4.74 | step_microstep: 41.68
-[2025-01-25 13:33:58,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4618.87 | bwd_inner: 4614.03 | bwd_allreduce: 4.78 | step: 41.69
- 29%|██▉       | 1708/5800 [4:47:28<7:50:58,  6.91s/it]                                                       {'loss': 0.0513, 'grad_norm': 3.594825029373169, 'learning_rate': 3.3100283610062374e-05, 'epoch': 14.72}
- 29%|██▉       | 1708/5800 [4:47:28<7:50:58,  6.91s/it]score1 tensor([[0.4727],
-        [0.5234],
-        [0.5000],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4883, 0.5039, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:05,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 13:34:05,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.34 | bwd_microstep: 4628.96 | bwd_inner_microstep: 4623.64 | bwd_allreduce_microstep: 5.19 | step_microstep: 40.48
-[2025-01-25 13:34:05,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.24 | bwd: 4628.99 | bwd_inner: 4623.64 | bwd_allreduce: 5.26 | step: 40.49
- 29%|██▉       | 1709/5800 [4:47:35<7:51:07,  6.91s/it]                                                       {'loss': 0.0142, 'grad_norm': 3.7374677658081055, 'learning_rate': 3.309184275002257e-05, 'epoch': 14.73}
- 29%|██▉       | 1709/5800 [4:47:35<7:51:07,  6.91s/it]score1 tensor([[0.5547],
-        [0.6289],
-        [0.5703],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.6797, 0.5898, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:12,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 13:34:12,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.43 | bwd_microstep: 4626.52 | bwd_inner_microstep: 4621.92 | bwd_allreduce_microstep: 4.52 | step_microstep: 41.33
-[2025-01-25 13:34:12,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.40 | bwd: 4626.54 | bwd_inner: 4621.92 | bwd_allreduce: 4.56 | step: 41.33
- 29%|██▉       | 1710/5800 [4:47:42<7:50:57,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 4.400872707366943, 'learning_rate': 3.308339780771904e-05, 'epoch': 14.74}
- 29%|██▉       | 1710/5800 [4:47:42<7:50:57,  6.91s/it]score1 tensor([[0.4805],
-        [0.5234],
-        [0.4766],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5391, 0.3750, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:19,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 13:34:19,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4624.53 | bwd_inner_microstep: 4619.55 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.14
-[2025-01-25 13:34:19,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.09 | bwd: 4624.55 | bwd_inner: 4619.55 | bwd_allreduce: 4.92 | step: 44.15
- 30%|██▉       | 1711/5800 [4:47:49<7:50:44,  6.91s/it]                                                       {'loss': 0.0439, 'grad_norm': 3.976407527923584, 'learning_rate': 3.3074948785785054e-05, 'epoch': 14.75}
- 30%|██▉       | 1711/5800 [4:47:49<7:50:44,  6.91s/it]score1 tensor([[0.5977],
-        [0.4395],
-        [0.5625],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4121, 0.5664, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:26,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 13:34:26,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4633.00 | bwd_inner_microstep: 4628.14 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.32
-[2025-01-25 13:34:26,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.82 | bwd: 4633.02 | bwd_inner: 4628.14 | bwd_allreduce: 4.81 | step: 41.32
- 30%|██▉       | 1712/5800 [4:47:56<7:50:31,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 3.979419469833374, 'learning_rate': 3.306649568685517e-05, 'epoch': 14.76}
- 30%|██▉       | 1712/5800 [4:47:56<7:50:31,  6.91s/it]score1 tensor([[0.5078],
-        [0.5859],
-        [0.5078],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5508, 0.4473, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:33,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 13:34:33,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.53 | bwd_microstep: 4623.22 | bwd_inner_microstep: 4618.54 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.80
-[2025-01-25 13:34:33,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.50 | bwd: 4623.25 | bwd_inner: 4618.54 | bwd_allreduce: 4.64 | step: 43.80
- 30%|██▉       | 1713/5800 [4:48:03<7:50:07,  6.90s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.5534338355064392, 'learning_rate': 3.3058038513565195e-05, 'epoch': 14.77}
- 30%|██▉       | 1713/5800 [4:48:03<7:50:07,  6.90s/it]score1 tensor([[0.4922],
-        [0.4648],
-        [0.4629],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.3457, 0.4023, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:40,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:34:40,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.93 | bwd_microstep: 4624.08 | bwd_inner_microstep: 4619.62 | bwd_allreduce_microstep: 4.40 | step_microstep: 42.35
-[2025-01-25 13:34:40,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.89 | bwd: 4624.11 | bwd_inner: 4619.62 | bwd_allreduce: 4.43 | step: 42.36
- 30%|██▉       | 1714/5800 [4:48:10<7:49:57,  6.90s/it]                                                       {'loss': 0.0488, 'grad_norm': 7.702733039855957, 'learning_rate': 3.304957726855225e-05, 'epoch': 14.78}
- 30%|██▉       | 1714/5800 [4:48:10<7:49:57,  6.90s/it]score1 tensor([[0.0000],
-        [0.6484],
-        [0.4824],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6836, 0.4453, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:46,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.27 | optimizer_step: 4.36
-[2025-01-25 13:34:46,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.92 | bwd_microstep: 4570.39 | bwd_inner_microstep: 4566.04 | bwd_allreduce_microstep: 4.28 | step_microstep: 35.27
-[2025-01-25 13:34:46,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.88 | bwd: 4570.41 | bwd_inner: 4566.04 | bwd_allreduce: 4.31 | step: 35.28
- 30%|██▉       | 1715/5800 [4:48:16<7:48:20,  6.88s/it]                                                       {'loss': 0.1641, 'grad_norm': 2.3355064392089844, 'learning_rate': 3.3041111954454677e-05, 'epoch': 14.78}
- 30%|██▉       | 1715/5800 [4:48:16<7:48:20,  6.88s/it]score1 tensor([[0.4688],
-        [0.5547],
-        [0.3594],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4648, 0.3652, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:34:53,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.37
-[2025-01-25 13:34:53,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.87 | bwd_microstep: 4624.55 | bwd_inner_microstep: 4618.90 | bwd_allreduce_microstep: 5.52 | step_microstep: 48.86
-[2025-01-25 13:34:53,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.84 | bwd: 4624.58 | bwd_inner: 4618.90 | bwd_allreduce: 5.60 | step: 48.89
- 30%|██▉       | 1716/5800 [4:48:23<7:48:44,  6.89s/it]                                                       {'loss': 0.0537, 'grad_norm': 2.1684651374816895, 'learning_rate': 3.3032642573912114e-05, 'epoch': 14.79}
- 30%|██▉       | 1716/5800 [4:48:23<7:48:44,  6.89s/it]score1 tensor([[0.6602],
-        [0.4062],
-        [0.5117],
-        [0.0000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4258, 0.4180, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1660, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:00,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:35:00,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4584.89 | bwd_inner_microstep: 4580.33 | bwd_allreduce_microstep: 4.48 | step_microstep: 44.06
-[2025-01-25 13:35:00,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.49 | bwd: 4584.93 | bwd_inner: 4580.33 | bwd_allreduce: 4.52 | step: 44.07
- 30%|██▉       | 1717/5800 [4:48:30<7:48:27,  6.88s/it]                                                       {'loss': 0.166, 'grad_norm': 3.7519516944885254, 'learning_rate': 3.3024169129565456e-05, 'epoch': 14.8}
- 30%|██▉       | 1717/5800 [4:48:30<7:48:27,  6.88s/it]score1 tensor([[0.4375],
-        [0.5703],
-        [0.4375],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5547, 0.3926, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:07,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 13:35:07,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.16 | bwd_microstep: 4622.43 | bwd_inner_microstep: 4614.57 | bwd_allreduce_microstep: 7.70 | step_microstep: 49.73
-[2025-01-25 13:35:07,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.06 | bwd: 4622.49 | bwd_inner: 4614.57 | bwd_allreduce: 7.78 | step: 49.71
- 30%|██▉       | 1718/5800 [4:48:37<7:49:20,  6.90s/it]                                                       {'loss': 0.0259, 'grad_norm': 16.06961441040039, 'learning_rate': 3.301569162405688e-05, 'epoch': 14.81}
- 30%|██▉       | 1718/5800 [4:48:37<7:49:20,  6.90s/it]score1 tensor([[0.5156],
-        [0.4414],
-        [0.2852],
-        [0.1055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4453, 0.4766, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1416, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:14,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 13:35:14,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.41 | bwd_microstep: 4626.10 | bwd_inner_microstep: 4621.27 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.84
-[2025-01-25 13:35:14,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.38 | bwd: 4626.12 | bwd_inner: 4621.27 | bwd_allreduce: 4.79 | step: 41.84
- 30%|██▉       | 1719/5800 [4:48:44<7:49:13,  6.90s/it]                                                       {'loss': 0.1416, 'grad_norm': 26.96866798400879, 'learning_rate': 3.3007210060029806e-05, 'epoch': 14.82}
- 30%|██▉       | 1719/5800 [4:48:44<7:49:13,  6.90s/it]score1 tensor([[0.],
-        [0.],
-        [0.],
-        [0.]], device='cuda:0', dtype=torch.bfloat16, grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5664, 0.4590, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.5039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:21,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 13:35:21,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.37 | bwd_microstep: 4487.48 | bwd_inner_microstep: 4482.89 | bwd_allreduce_microstep: 4.50 | step_microstep: 41.06
-[2025-01-25 13:35:21,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.33 | bwd: 4487.52 | bwd_inner: 4482.89 | bwd_allreduce: 4.55 | step: 41.07
- 30%|██▉       | 1720/5800 [4:48:51<7:46:16,  6.86s/it]                                                       {'loss': 0.5039, 'grad_norm': 0.0, 'learning_rate': 3.299872444012895e-05, 'epoch': 14.83}
- 30%|██▉       | 1720/5800 [4:48:51<7:46:16,  6.86s/it]score1 tensor([[0.0649],
-        [0.0000],
-        [0.0000],
-        [0.0000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.4570, 0.5820, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.4180, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:28,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 13:35:28,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4501.72 | bwd_inner_microstep: 4497.05 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.72
-[2025-01-25 13:35:28,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4501.75 | bwd_inner: 4497.05 | bwd_allreduce: 4.63 | step: 41.73
- 30%|██▉       | 1721/5800 [4:48:58<7:44:25,  6.83s/it]                                                       {'loss': 0.418, 'grad_norm': 1.9926207065582275, 'learning_rate': 3.299023476700026e-05, 'epoch': 14.84}
- 30%|██▉       | 1721/5800 [4:48:58<7:44:25,  6.83s/it]score1 tensor([[0.],
-        [0.],
-        [0.],
-        [0.]], device='cuda:0', dtype=torch.bfloat16, grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4004, 0.6445, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.4805, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:34,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 13:35:34,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4487.40 | bwd_inner_microstep: 4482.72 | bwd_allreduce_microstep: 4.59 | step_microstep: 41.81
-[2025-01-25 13:35:34,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.63 | bwd: 4487.43 | bwd_inner: 4482.72 | bwd_allreduce: 4.64 | step: 41.82
- 30%|██▉       | 1722/5800 [4:49:04<7:42:47,  6.81s/it]                                                       {'loss': 0.4805, 'grad_norm': 0.0, 'learning_rate': 3.2981741043290975e-05, 'epoch': 14.84}
- 30%|██▉       | 1722/5800 [4:49:04<7:42:47,  6.81s/it]score1 tensor([[0.],
-        [0.],
-        [0.],
-        [0.]], device='cuda:0', dtype=torch.bfloat16, grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.3984, 0.6094, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.4961, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:41,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:35:41,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.06 | bwd_microstep: 4482.50 | bwd_inner_microstep: 4477.83 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.39
-[2025-01-25 13:35:41,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.02 | bwd: 4482.52 | bwd_inner: 4477.83 | bwd_allreduce: 4.62 | step: 41.39
- 30%|██▉       | 1723/5800 [4:49:11<7:41:35,  6.79s/it]                                                       {'loss': 0.4961, 'grad_norm': 0.0, 'learning_rate': 3.297324327164958e-05, 'epoch': 14.85}
- 30%|██▉       | 1723/5800 [4:49:11<7:41:35,  6.79s/it]score1 tensor([[0.],
-        [0.],
-        [0.],
-        [0.]], device='cuda:0', dtype=torch.bfloat16, grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4766, 0.5938, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.5469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:48,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:35:48,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.78 | bwd_microstep: 4489.40 | bwd_inner_microstep: 4484.63 | bwd_allreduce_microstep: 4.67 | step_microstep: 43.65
-[2025-01-25 13:35:48,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.73 | bwd: 4489.42 | bwd_inner: 4484.63 | bwd_allreduce: 4.72 | step: 43.66
- 30%|██▉       | 1724/5800 [4:49:18<7:40:57,  6.79s/it]                                                       {'loss': 0.5469, 'grad_norm': 0.0, 'learning_rate': 3.296474145472583e-05, 'epoch': 14.86}
- 30%|██▉       | 1724/5800 [4:49:18<7:40:57,  6.79s/it]score1 tensor([[0.0000],
-        [0.1523],
-        [0.0000],
-        [0.0000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4844, 0.3945, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.4434, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:35:55,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 13:35:55,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.80 | bwd_microstep: 4538.39 | bwd_inner_microstep: 4533.87 | bwd_allreduce_microstep: 4.42 | step_microstep: 44.57
-[2025-01-25 13:35:55,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.76 | bwd: 4538.41 | bwd_inner: 4533.87 | bwd_allreduce: 4.47 | step: 44.57
- 30%|██▉       | 1725/5800 [4:49:25<7:41:26,  6.79s/it]                                                       {'loss': 0.4434, 'grad_norm': 4.590134620666504, 'learning_rate': 3.2956235595170726e-05, 'epoch': 14.87}
- 30%|██▉       | 1725/5800 [4:49:25<7:41:26,  6.79s/it]score1 tensor([[0.0645],
-        [0.0000],
-        [0.0850],
-        [0.0000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5625, 0.5312, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.5273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:02,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 13:36:02,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.00 | bwd_microstep: 4538.58 | bwd_inner_microstep: 4534.09 | bwd_allreduce_microstep: 4.40 | step_microstep: 58.22
-[2025-01-25 13:36:02,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.96 | bwd: 4538.61 | bwd_inner: 4534.09 | bwd_allreduce: 4.45 | step: 58.23
- 30%|██▉       | 1726/5800 [4:49:32<7:42:04,  6.81s/it]                                                       {'loss': 0.5273, 'grad_norm': 6.8493499755859375, 'learning_rate': 3.294772569563656e-05, 'epoch': 14.88}
- 30%|██▉       | 1726/5800 [4:49:32<7:42:04,  6.81s/it]score1 tensor([[0.3320],
-        [0.5000],
-        [0.3066],
-        [0.2393]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5352, 0.4727, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1299, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:08,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 13:36:08,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.18 | bwd_microstep: 4620.85 | bwd_inner_microstep: 4615.31 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.58
-[2025-01-25 13:36:08,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.14 | bwd: 4620.92 | bwd_inner: 4615.31 | bwd_allreduce: 5.43 | step: 62.56
- 30%|██▉       | 1727/5800 [4:49:38<7:44:22,  6.84s/it]                                                       {'loss': 0.1299, 'grad_norm': 10.658013343811035, 'learning_rate': 3.293921175877685e-05, 'epoch': 14.89}
- 30%|██▉       | 1727/5800 [4:49:38<7:44:22,  6.84s/it]score1 tensor([[0.4707],
-        [0.7461],
-        [0.3809],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4941, 0.4453, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:15,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 13:36:15,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.14 | bwd_microstep: 4619.57 | bwd_inner_microstep: 4614.58 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.56
-[2025-01-25 13:36:15,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.07 | bwd: 4619.59 | bwd_inner: 4614.58 | bwd_allreduce: 4.94 | step: 44.57
- 30%|██▉       | 1728/5800 [4:49:45<7:45:46,  6.86s/it]                                                       {'loss': 0.1094, 'grad_norm': 5.290804862976074, 'learning_rate': 3.293069378724641e-05, 'epoch': 14.9}
- 30%|██▉       | 1728/5800 [4:49:45<7:45:46,  6.86s/it]score1 tensor([[0.6523],
-        [0.3164],
-        [0.5898],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6523, 0.4570, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1855, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:22,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 13:36:22,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.63 | bwd_microstep: 4625.51 | bwd_inner_microstep: 4620.92 | bwd_allreduce_microstep: 4.51 | step_microstep: 46.85
-[2025-01-25 13:36:22,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.59 | bwd: 4625.53 | bwd_inner: 4620.92 | bwd_allreduce: 4.55 | step: 46.86
- 30%|██▉       | 1729/5800 [4:49:52<7:46:32,  6.88s/it]                                                       {'loss': 0.1855, 'grad_norm': 4.77406120300293, 'learning_rate': 3.292217178370128e-05, 'epoch': 14.91}
- 30%|██▉       | 1729/5800 [4:49:52<7:46:32,  6.88s/it]score1 tensor([[0.6367],
-        [0.5977],
-        [0.5117],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4277, 0.4785, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:29,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 13:36:29,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.04 | bwd_microstep: 4621.69 | bwd_inner_microstep: 4613.65 | bwd_allreduce_microstep: 7.86 | step_microstep: 49.38
-[2025-01-25 13:36:29,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.01 | bwd: 4621.75 | bwd_inner: 4613.65 | bwd_allreduce: 7.95 | step: 49.36
- 30%|██▉       | 1730/5800 [4:49:59<7:47:22,  6.89s/it]                                                       {'loss': 0.1035, 'grad_norm': 11.53789234161377, 'learning_rate': 3.291364575079876e-05, 'epoch': 14.91}
- 30%|██▉       | 1730/5800 [4:49:59<7:47:22,  6.89s/it]score1 tensor([[0.5312],
-        [0.3867],
-        [0.5586],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.4297, 0.6875, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1030, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:36,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 13:36:36,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.93 | bwd_microstep: 4619.64 | bwd_inner_microstep: 4614.28 | bwd_allreduce_microstep: 5.23 | step_microstep: 44.22
-[2025-01-25 13:36:36,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.90 | bwd: 4619.68 | bwd_inner: 4614.28 | bwd_allreduce: 5.32 | step: 44.22
- 30%|██▉       | 1731/5800 [4:50:06<7:47:34,  6.89s/it]                                                       {'loss': 0.103, 'grad_norm': 2.4105780124664307, 'learning_rate': 3.290511569119743e-05, 'epoch': 14.92}
- 30%|██▉       | 1731/5800 [4:50:06<7:47:34,  6.89s/it]score1 tensor([[0.3926],
-        [0.3438],
-        [0.3379],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5195, 0.4570, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:43,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 13:36:43,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.97 | bwd_microstep: 4622.66 | bwd_inner_microstep: 4616.59 | bwd_allreduce_microstep: 5.99 | step_microstep: 43.50
-[2025-01-25 13:36:43,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.93 | bwd: 4622.68 | bwd_inner: 4616.59 | bwd_allreduce: 6.03 | step: 43.51
- 30%|██▉       | 1732/5800 [4:50:13<7:47:33,  6.90s/it]                                                       {'loss': 0.1172, 'grad_norm': 8.254839897155762, 'learning_rate': 3.28965816075571e-05, 'epoch': 14.93}
- 30%|██▉       | 1732/5800 [4:50:13<7:47:33,  6.90s/it]score1 tensor([[0.3652],
-        [0.7617],
-        [0.4434],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.6797, 0.4980, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:50,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.44 | optimizer_step: 4.37
-[2025-01-25 13:36:50,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.79 | bwd_microstep: 4618.51 | bwd_inner_microstep: 4613.34 | bwd_allreduce_microstep: 5.07 | step_microstep: 55.03
-[2025-01-25 13:36:50,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.76 | bwd: 4618.53 | bwd_inner: 4613.34 | bwd_allreduce: 5.12 | step: 55.04
- 30%|██▉       | 1733/5800 [4:50:20<7:47:54,  6.90s/it]                                                       {'loss': 0.0674, 'grad_norm': 6.331440448760986, 'learning_rate': 3.288804350253885e-05, 'epoch': 14.94}
- 30%|██▉       | 1733/5800 [4:50:20<7:47:54,  6.90s/it]score1 tensor([[0.4980],
-        [0.5469],
-        [0.5234],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4336, 0.6094, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:36:57,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 13:36:57,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.71 | bwd_microstep: 4582.05 | bwd_inner_microstep: 4577.70 | bwd_allreduce_microstep: 4.27 | step_microstep: 41.47
-[2025-01-25 13:36:57,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.68 | bwd: 4582.07 | bwd_inner: 4577.70 | bwd_allreduce: 4.31 | step: 41.47
- 30%|██▉       | 1734/5800 [4:50:27<7:47:04,  6.89s/it]                                                       {'loss': 0.0547, 'grad_norm': 2.7083747386932373, 'learning_rate': 3.287950137880502e-05, 'epoch': 14.95}
- 30%|██▉       | 1734/5800 [4:50:27<7:47:04,  6.89s/it]score1 tensor([[0.5977],
-        [0.7109],
-        [0.6562],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4941, 0.6953, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1914, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:37:04,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 13:37:04,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.22 | bwd_microstep: 4634.48 | bwd_inner_microstep: 4629.74 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.79
-[2025-01-25 13:37:04,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.19 | bwd: 4634.50 | bwd_inner: 4629.73 | bwd_allreduce: 4.70 | step: 41.80
- 30%|██▉       | 1735/5800 [4:50:34<7:47:14,  6.90s/it]                                                       {'loss': 0.1914, 'grad_norm': 5.739887714385986, 'learning_rate': 3.2870955239019173e-05, 'epoch': 14.96}
- 30%|██▉       | 1735/5800 [4:50:34<7:47:14,  6.90s/it]score1 tensor([[0.7656],
-        [0.7617],
-        [0.7266],
-        [0.7461]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.5000, 0.5156, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2109, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:37:11,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 13:37:11,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.57 | bwd_microstep: 4641.26 | bwd_inner_microstep: 4636.71 | bwd_allreduce_microstep: 4.46 | step_microstep: 40.91
-[2025-01-25 13:37:11,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.54 | bwd: 4641.28 | bwd_inner: 4636.71 | bwd_allreduce: 4.51 | step: 40.92
- 30%|██▉       | 1736/5800 [4:50:41<7:47:41,  6.90s/it]                                                       {'loss': 0.2109, 'grad_norm': 11.76147174835205, 'learning_rate': 3.286240508584615e-05, 'epoch': 14.97}
- 30%|██▉       | 1736/5800 [4:50:41<7:47:41,  6.90s/it]score1 tensor([[0.7539],
-        [0.7305],
-        [0.7070],
-        [0.7266]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.2812, 0.5508, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.3066, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:37:18,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 13:37:18,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.49 | bwd_microstep: 4633.72 | bwd_inner_microstep: 4628.65 | bwd_allreduce_microstep: 4.97 | step_microstep: 42.86
-[2025-01-25 13:37:18,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.45 | bwd: 4633.75 | bwd_inner: 4628.65 | bwd_allreduce: 5.02 | step: 42.87
- 30%|██▉       | 1737/5800 [4:50:48<7:47:44,  6.91s/it]                                                       {'loss': 0.3066, 'grad_norm': 11.637667655944824, 'learning_rate': 3.2853850921952046e-05, 'epoch': 14.97}
- 30%|██▉       | 1737/5800 [4:50:48<7:47:44,  6.91s/it]score1 tensor([[0.6484],
-        [0.7305],
-        [0.6836],
-        [0.7383]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5430, 0.5039, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:37:24,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 13:37:24,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.43 | bwd_microstep: 4637.64 | bwd_inner_microstep: 4632.88 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.34
-[2025-01-25 13:37:24,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.39 | bwd: 4637.67 | bwd_inner: 4632.88 | bwd_allreduce: 4.72 | step: 42.35
- 30%|██▉       | 1738/5800 [4:50:54<7:47:47,  6.91s/it]                                                       {'loss': 0.1396, 'grad_norm': 11.30223274230957, 'learning_rate': 3.2845292750004185e-05, 'epoch': 14.98}
- 30%|██▉       | 1738/5800 [4:50:54<7:47:47,  6.91s/it]score1 tensor([[0.6133],
-        [0.6445],
-        [0.5898],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.4375, 0.6211, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1138, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:37:31,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 13:37:31,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.73 | bwd_microstep: 4639.62 | bwd_inner_microstep: 4634.71 | bwd_allreduce_microstep: 4.80 | step_microstep: 52.43
-[2025-01-25 13:37:31,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.69 | bwd: 4639.64 | bwd_inner: 4634.71 | bwd_allreduce: 4.85 | step: 52.44
- 30%|██▉       | 1739/5800 [4:51:01<7:48:06,  6.92s/it]                                                       {'loss': 0.1138, 'grad_norm': 5.471914768218994, 'learning_rate': 3.2836730572671155e-05, 'epoch': 14.99}
- 30%|██▉       | 1739/5800 [4:51:01<7:48:06,  6.92s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:37:35,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 13:37:35,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 572.60 | bwd_microstep: 1221.01 | bwd_inner_microstep: 1216.22 | bwd_allreduce_microstep: 4.71 | step_microstep: 40.91
-[2025-01-25 13:37:35,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 572.57 | bwd: 1221.03 | bwd_inner: 1216.21 | bwd_allreduce: 4.75 | step: 40.91
- 30%|███       | 1740/5800 [4:51:05<6:46:36,  6.01s/it]                                                       {'loss': 0.0254, 'grad_norm': 9.3744535446167, 'learning_rate': 3.2828164392622804e-05, 'epoch': 15.0}
- 30%|███       | 1740/5800 [4:51:05<6:46:36,  6.01s/it][2025-01-25 13:37:40,432] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 13:37:50,300] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 13:38:00,095] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 13:38:09,343] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.3711],
-        [0.4102],
-        [0.3984],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5547, 0.5547, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1758, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:38:23,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.37
-[2025-01-25 13:38:23,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.01 | bwd_microstep: 4571.48 | bwd_inner_microstep: 4566.63 | bwd_allreduce_microstep: 4.76 | step_microstep: 47.50
-[2025-01-25 13:38:23,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.96 | bwd: 4571.50 | bwd_inner: 4566.63 | bwd_allreduce: 4.81 | step: 47.51
- 30%|███       | 1741/5800 [4:51:53<20:52:26, 18.51s/it]                                                        {'loss': 0.1758, 'grad_norm': 8.514272689819336, 'learning_rate': 3.28195942125302e-05, 'epoch': 15.01}
- 30%|███       | 1741/5800 [4:51:53<20:52:26, 18.51s/it]score1 tensor([[0.2695],
-        [0.2500],
-        [0.2949],
-        [0.2275]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4941, 0.5195, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2012, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:38:30,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 13:38:30,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.06 | bwd_microstep: 4585.72 | bwd_inner_microstep: 4580.99 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.95
-[2025-01-25 13:38:30,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.02 | bwd: 4585.74 | bwd_inner: 4580.99 | bwd_allreduce: 4.69 | step: 41.96
- 30%|███       | 1742/5800 [4:52:00<16:55:14, 15.01s/it]                                                        {'loss': 0.2012, 'grad_norm': 7.5994977951049805, 'learning_rate': 3.281102003506569e-05, 'epoch': 15.02}
- 30%|███       | 1742/5800 [4:52:00<16:55:14, 15.01s/it]score1 tensor([[0.2812],
-        [0.2969],
-        [0.2637],
-        [0.2734]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4961, 0.4863, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2168, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:38:37,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 13:38:37,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.07 | bwd_microstep: 4589.19 | bwd_inner_microstep: 4583.64 | bwd_allreduce_microstep: 5.44 | step_microstep: 45.73
-[2025-01-25 13:38:37,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.04 | bwd: 4589.21 | bwd_inner: 4583.65 | bwd_allreduce: 5.49 | step: 45.74
- 30%|███       | 1743/5800 [4:52:07<14:09:30, 12.56s/it]                                                        {'loss': 0.2168, 'grad_norm': 7.665403842926025, 'learning_rate': 3.2802441862902836e-05, 'epoch': 15.03}
- 30%|███       | 1743/5800 [4:52:07<14:09:30, 12.56s/it]score1 tensor([[0.3730],
-        [0.3848],
-        [0.3262],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6016, 0.4980, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:38:44,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 13:38:44,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.38 | bwd_microstep: 4597.90 | bwd_inner_microstep: 4592.42 | bwd_allreduce_microstep: 5.38 | step_microstep: 47.49
-[2025-01-25 13:38:44,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.35 | bwd: 4597.92 | bwd_inner: 4592.42 | bwd_allreduce: 5.43 | step: 47.50
- 30%|███       | 1744/5800 [4:52:14<12:13:47, 10.85s/it]                                                        {'loss': 0.1348, 'grad_norm': 8.255167007446289, 'learning_rate': 3.279385969871647e-05, 'epoch': 15.03}
- 30%|███       | 1744/5800 [4:52:14<12:13:47, 10.85s/it]score1 tensor([[0.4141],
-        [0.3887],
-        [0.4238],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4023, 0.6055, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1167, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:38:50,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.39 | optimizer_step: 4.36
-[2025-01-25 13:38:50,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.97 | bwd_microstep: 4601.04 | bwd_inner_microstep: 4594.89 | bwd_allreduce_microstep: 5.97 | step_microstep: 54.41
-[2025-01-25 13:38:50,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.91 | bwd: 4601.12 | bwd_inner: 4594.89 | bwd_allreduce: 6.07 | step: 54.44
- 30%|███       | 1745/5800 [4:52:20<10:53:09,  9.66s/it]                                                        {'loss': 0.1167, 'grad_norm': 8.406033515930176, 'learning_rate': 3.278527354518265e-05, 'epoch': 15.04}
- 30%|███       | 1745/5800 [4:52:20<10:53:09,  9.66s/it]score1 tensor([[0.5469],
-        [0.4395],
-        [0.4629],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4805, 0.6055, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0835, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:38:57,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 13:38:57,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.60 | bwd_microstep: 4605.25 | bwd_inner_microstep: 4599.51 | bwd_allreduce_microstep: 5.63 | step_microstep: 69.01
-[2025-01-25 13:38:57,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.50 | bwd: 4605.29 | bwd_inner: 4599.51 | bwd_allreduce: 5.67 | step: 69.01
- 30%|███       | 1746/5800 [4:52:27<9:57:31,  8.84s/it]                                                        {'loss': 0.0835, 'grad_norm': 4.641445159912109, 'learning_rate': 3.2776683404978705e-05, 'epoch': 15.05}
- 30%|███       | 1746/5800 [4:52:27<9:57:31,  8.84s/it]score1 tensor([[0.5117],
-        [0.5273],
-        [0.5625],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.3730, 0.5664, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:04,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 10.36 | optimizer_step: 4.71
-[2025-01-25 13:39:04,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.07 | bwd_microstep: 4602.80 | bwd_inner_microstep: 4597.24 | bwd_allreduce_microstep: 5.47 | step_microstep: 58.04
-[2025-01-25 13:39:04,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.03 | bwd: 4602.83 | bwd_inner: 4597.24 | bwd_allreduce: 5.50 | step: 58.05
- 30%|███       | 1747/5800 [4:52:34<9:18:39,  8.27s/it]                                                       {'loss': 0.0723, 'grad_norm': 4.799535751342773, 'learning_rate': 3.2768089280783174e-05, 'epoch': 15.06}
- 30%|███       | 1747/5800 [4:52:34<9:18:39,  8.27s/it]score1 tensor([[0.6133],
-        [0.6094],
-        [0.6953],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.4043, 0.6836, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:11,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:39:11,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.45 | bwd_microstep: 4610.11 | bwd_inner_microstep: 4604.38 | bwd_allreduce_microstep: 5.57 | step_microstep: 47.46
-[2025-01-25 13:39:11,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.40 | bwd: 4610.15 | bwd_inner: 4604.38 | bwd_allreduce: 5.65 | step: 47.44
- 30%|███       | 1748/5800 [4:52:41<8:50:40,  7.86s/it]                                                       {'loss': 0.1328, 'grad_norm': 10.47259521484375, 'learning_rate': 3.275949117527586e-05, 'epoch': 15.07}
- 30%|███       | 1748/5800 [4:52:41<8:50:40,  7.86s/it]score1 tensor([[0.6484],
-        [0.6367],
-        [0.6562],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4297, 0.5156, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:18,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 13:39:18,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.98 | bwd_microstep: 4615.27 | bwd_inner_microstep: 4609.73 | bwd_allreduce_microstep: 5.47 | step_microstep: 43.35
-[2025-01-25 13:39:18,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.92 | bwd: 4615.30 | bwd_inner: 4609.73 | bwd_allreduce: 5.50 | step: 43.36
- 30%|███       | 1749/5800 [4:52:48<8:30:44,  7.56s/it]                                                       {'loss': 0.1621, 'grad_norm': 10.413982391357422, 'learning_rate': 3.275088909113781e-05, 'epoch': 15.08}
- 30%|███       | 1749/5800 [4:52:48<8:30:44,  7.56s/it]score1 tensor([[0.6406],
-        [0.7031],
-        [0.6406],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4570, 0.4551, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1953, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:25,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:39:25,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.47 | bwd_microstep: 4614.84 | bwd_inner_microstep: 4609.78 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.76
-[2025-01-25 13:39:25,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.43 | bwd: 4614.87 | bwd_inner: 4609.78 | bwd_allreduce: 5.02 | step: 43.77
- 30%|███       | 1750/5800 [4:52:55<8:16:55,  7.36s/it]                                                       {'loss': 0.1953, 'grad_norm': 10.641377449035645, 'learning_rate': 3.2742283031051286e-05, 'epoch': 15.09}
- 30%|███       | 1750/5800 [4:52:55<8:16:55,  7.36s/it]score1 tensor([[0.6328],
-        [0.6523],
-        [0.5742],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4492, 0.4004, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1855, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:32,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 13:39:32,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.51 | bwd_microstep: 4610.46 | bwd_inner_microstep: 4604.07 | bwd_allreduce_microstep: 6.26 | step_microstep: 45.80
-[2025-01-25 13:39:32,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.47 | bwd: 4610.49 | bwd_inner: 4604.07 | bwd_allreduce: 6.33 | step: 45.80
- 30%|███       | 1751/5800 [4:53:02<8:07:10,  7.22s/it]                                                       {'loss': 0.1855, 'grad_norm': 10.14184856414795, 'learning_rate': 3.273367299769982e-05, 'epoch': 15.09}
- 30%|███       | 1751/5800 [4:53:02<8:07:10,  7.22s/it]score1 tensor([[0.5781],
-        [0.6992],
-        [0.5664],
-        [0.7461]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.3809, 0.4707, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:39,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 13:39:39,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.00 | bwd_microstep: 4615.63 | bwd_inner_microstep: 4610.58 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.03
-[2025-01-25 13:39:39,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.97 | bwd: 4615.65 | bwd_inner: 4610.58 | bwd_allreduce: 5.00 | step: 44.03
- 30%|███       | 1752/5800 [4:53:09<8:00:19,  7.12s/it]                                                       {'loss': 0.1387, 'grad_norm': 10.56175422668457, 'learning_rate': 3.272505899376816e-05, 'epoch': 15.1}
- 30%|███       | 1752/5800 [4:53:09<8:00:19,  7.12s/it]score1 tensor([[0.5273],
-        [0.6172],
-        [0.6055],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4844, 0.6172, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:46,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 13:39:46,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.50 | bwd_microstep: 4613.97 | bwd_inner_microstep: 4608.76 | bwd_allreduce_microstep: 5.11 | step_microstep: 44.44
-[2025-01-25 13:39:46,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.47 | bwd: 4613.99 | bwd_inner: 4608.76 | bwd_allreduce: 5.16 | step: 44.45
- 30%|███       | 1753/5800 [4:53:16<7:55:34,  7.05s/it]                                                       {'loss': 0.0562, 'grad_norm': 0.8990932106971741, 'learning_rate': 3.27164410219423e-05, 'epoch': 15.11}
- 30%|███       | 1753/5800 [4:53:16<7:55:34,  7.05s/it]score1 tensor([[0.4785],
-        [0.5195],
-        [0.3965],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4492, 0.4727, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0659, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:53,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.09 | optimizer_step: 4.43
-[2025-01-25 13:39:53,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.95 | bwd_microstep: 4619.32 | bwd_inner_microstep: 4613.99 | bwd_allreduce_microstep: 5.24 | step_microstep: 57.27
-[2025-01-25 13:39:53,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.91 | bwd: 4619.34 | bwd_inner: 4614.00 | bwd_allreduce: 5.28 | step: 57.36
- 30%|███       | 1754/5800 [4:53:23<7:53:13,  7.02s/it]                                                       {'loss': 0.0659, 'grad_norm': 4.702240467071533, 'learning_rate': 3.270781908490949e-05, 'epoch': 15.12}
- 30%|███       | 1754/5800 [4:53:23<7:53:13,  7.02s/it]score1 tensor([[0.4277],
-        [0.3770],
-        [0.3750],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5547, 0.4844, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:39:59,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 13:39:59,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.19 | bwd_microstep: 4615.81 | bwd_inner_microstep: 4607.33 | bwd_allreduce_microstep: 8.21 | step_microstep: 67.30
-[2025-01-25 13:39:59,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.15 | bwd: 4615.87 | bwd_inner: 4607.33 | bwd_allreduce: 8.35 | step: 67.29
- 30%|███       | 1755/5800 [4:53:29<7:51:47,  7.00s/it]                                                       {'loss': 0.1328, 'grad_norm': 8.02151107788086, 'learning_rate': 3.269919318535819e-05, 'epoch': 15.13}
- 30%|███       | 1755/5800 [4:53:29<7:51:47,  7.00s/it]score1 tensor([[0.3691],
-        [0.3789],
-        [0.3438],
-        [0.3203]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3926, 0.3750, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1045, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:06,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.79 | optimizer_step: 4.40
-[2025-01-25 13:40:06,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.92 | bwd_microstep: 4625.76 | bwd_inner_microstep: 4618.96 | bwd_allreduce_microstep: 6.61 | step_microstep: 86.44
-[2025-01-25 13:40:06,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.88 | bwd: 4625.80 | bwd_inner: 4618.96 | bwd_allreduce: 6.71 | step: 86.47
- 30%|███       | 1756/5800 [4:53:36<7:51:07,  6.99s/it]                                                       {'loss': 0.1045, 'grad_norm': 7.73003625869751, 'learning_rate': 3.26905633259781e-05, 'epoch': 15.14}
- 30%|███       | 1756/5800 [4:53:36<7:51:07,  6.99s/it]score1 tensor([[0.2969],
-        [0.3281],
-        [0.3496],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5508, 0.5156, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2061, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:13,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 13:40:13,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4612.57 | bwd_inner_microstep: 4607.14 | bwd_allreduce_microstep: 5.31 | step_microstep: 44.73
-[2025-01-25 13:40:13,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4612.59 | bwd_inner: 4607.15 | bwd_allreduce: 5.37 | step: 44.74
- 30%|███       | 1757/5800 [4:53:43<7:49:04,  6.96s/it]                                                       {'loss': 0.2061, 'grad_norm': 7.666561603546143, 'learning_rate': 3.2681929509460166e-05, 'epoch': 15.15}
- 30%|███       | 1757/5800 [4:53:43<7:49:04,  6.96s/it]score1 tensor([[0.4102],
-        [0.4902],
-        [0.4727],
-        [0.3379]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5898, 0.5977, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0991, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:20,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 13:40:20,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.74 | bwd_microstep: 4610.45 | bwd_inner_microstep: 4605.51 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.40
-[2025-01-25 13:40:20,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.71 | bwd: 4610.47 | bwd_inner: 4605.51 | bwd_allreduce: 4.89 | step: 43.41
- 30%|███       | 1758/5800 [4:53:50<7:47:27,  6.94s/it]                                                       {'loss': 0.0991, 'grad_norm': 8.347312927246094, 'learning_rate': 3.267329173849656e-05, 'epoch': 15.16}
- 30%|███       | 1758/5800 [4:53:50<7:47:27,  6.94s/it]score1 tensor([[0.4199],
-        [0.4180],
-        [0.4473],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4863, 0.5781, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0864, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:27,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:40:27,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.47 | bwd_microstep: 4610.09 | bwd_inner_microstep: 4604.80 | bwd_allreduce_microstep: 5.18 | step_microstep: 51.76
-[2025-01-25 13:40:27,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.43 | bwd: 4610.11 | bwd_inner: 4604.80 | bwd_allreduce: 5.24 | step: 51.76
- 30%|███       | 1759/5800 [4:53:57<7:46:23,  6.92s/it]                                                       {'loss': 0.0864, 'grad_norm': 8.230609893798828, 'learning_rate': 3.266465001578069e-05, 'epoch': 15.16}
- 30%|███       | 1759/5800 [4:53:57<7:46:23,  6.92s/it]score1 tensor([[0.4648],
-        [0.4746],
-        [0.3809],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5391, 0.4863, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:34,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 13:40:34,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.51 | bwd_microstep: 4615.38 | bwd_inner_microstep: 4609.48 | bwd_allreduce_microstep: 5.80 | step_microstep: 49.67
-[2025-01-25 13:40:34,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.47 | bwd: 4615.40 | bwd_inner: 4609.48 | bwd_allreduce: 5.85 | step: 49.67
- 30%|███       | 1760/5800 [4:54:04<7:45:43,  6.92s/it]                                                       {'loss': 0.0537, 'grad_norm': 7.82038688659668, 'learning_rate': 3.265600434400719e-05, 'epoch': 15.17}
- 30%|███       | 1760/5800 [4:54:04<7:45:43,  6.92s/it]score1 tensor([[0.5312],
-        [0.4512],
-        [0.5430],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4023, 0.5273, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:41,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:40:41,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4622.48 | bwd_inner_microstep: 4617.54 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.18
-[2025-01-25 13:40:41,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4622.51 | bwd_inner: 4617.53 | bwd_allreduce: 4.90 | step: 43.18
- 30%|███       | 1761/5800 [4:54:11<7:45:14,  6.91s/it]                                                       {'loss': 0.063, 'grad_norm': 4.633824348449707, 'learning_rate': 3.264735472587195e-05, 'epoch': 15.18}
- 30%|███       | 1761/5800 [4:54:11<7:45:14,  6.91s/it]score1 tensor([[0.6094],
-        [0.6680],
-        [0.5664],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6211, 0.4746, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0869, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:48,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 13:40:48,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.99 | bwd_microstep: 4617.42 | bwd_inner_microstep: 4612.34 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.41
-[2025-01-25 13:40:48,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.94 | bwd: 4617.44 | bwd_inner: 4612.34 | bwd_allreduce: 5.03 | step: 43.41
- 30%|███       | 1762/5800 [4:54:18<7:44:46,  6.91s/it]                                                       {'loss': 0.0869, 'grad_norm': 10.003738403320312, 'learning_rate': 3.263870116407204e-05, 'epoch': 15.19}
- 30%|███       | 1762/5800 [4:54:18<7:44:46,  6.91s/it]score1 tensor([[0.5586],
-        [0.6406],
-        [0.5938],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5547, 0.5117, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1025, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:40:55,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 13:40:55,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.18 | bwd_microstep: 4623.49 | bwd_inner_microstep: 4618.50 | bwd_allreduce_microstep: 4.91 | step_microstep: 46.79
-[2025-01-25 13:40:55,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.14 | bwd: 4623.51 | bwd_inner: 4618.50 | bwd_allreduce: 4.94 | step: 46.81
- 30%|███       | 1763/5800 [4:54:25<7:44:31,  6.90s/it]                                                       {'loss': 0.1025, 'grad_norm': 9.927833557128906, 'learning_rate': 3.263004366130581e-05, 'epoch': 15.2}
- 30%|███       | 1763/5800 [4:54:25<7:44:31,  6.90s/it]score1 tensor([[0.6562],
-        [0.5156],
-        [0.6211],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3086, 0.6055, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1416, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:02,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 13:41:02,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.13 | bwd_microstep: 4625.72 | bwd_inner_microstep: 4620.36 | bwd_allreduce_microstep: 5.23 | step_microstep: 46.25
-[2025-01-25 13:41:02,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.08 | bwd: 4625.75 | bwd_inner: 4620.36 | bwd_allreduce: 5.31 | step: 46.27
- 30%|███       | 1764/5800 [4:54:32<7:44:32,  6.91s/it]                                                       {'loss': 0.1416, 'grad_norm': 7.605889797210693, 'learning_rate': 3.262138222027281e-05, 'epoch': 15.21}
- 30%|███       | 1764/5800 [4:54:32<7:44:32,  6.91s/it]score1 tensor([[0.5742],
-        [0.6289],
-        [0.5195],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6172, 0.5156, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:09,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.87 | optimizer_step: 4.87
-[2025-01-25 13:41:09,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.46 | bwd_microstep: 4618.38 | bwd_inner_microstep: 4612.63 | bwd_allreduce_microstep: 5.67 | step_microstep: 70.01
-[2025-01-25 13:41:09,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.41 | bwd: 4618.40 | bwd_inner: 4612.63 | bwd_allreduce: 5.71 | step: 70.05
- 30%|███       | 1765/5800 [4:54:39<7:45:23,  6.92s/it]                                                       {'loss': 0.0166, 'grad_norm': 0.7937525510787964, 'learning_rate': 3.261271684367385e-05, 'epoch': 15.22}
- 30%|███       | 1765/5800 [4:54:39<7:45:23,  6.92s/it]score1 tensor([[0.5859],
-        [0.5703],
-        [0.5586],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5156, 0.5273, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:16,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 13:41:16,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.90 | bwd_microstep: 4619.45 | bwd_inner_microstep: 4614.38 | bwd_allreduce_microstep: 4.97 | step_microstep: 54.37
-[2025-01-25 13:41:16,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.86 | bwd: 4619.47 | bwd_inner: 4614.38 | bwd_allreduce: 5.02 | step: 54.38
- 30%|███       | 1766/5800 [4:54:46<7:45:26,  6.92s/it]                                                       {'loss': 0.0342, 'grad_norm': 4.613661289215088, 'learning_rate': 3.260404753421092e-05, 'epoch': 15.22}
- 30%|███       | 1766/5800 [4:54:46<7:45:26,  6.92s/it]score1 tensor([[0.4883],
-        [0.4883],
-        [0.5234],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.4551, 0.4922, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:22,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 13:41:22,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.85 | bwd_microstep: 4628.05 | bwd_inner_microstep: 4622.64 | bwd_allreduce_microstep: 5.31 | step_microstep: 44.88
-[2025-01-25 13:41:22,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.80 | bwd: 4628.08 | bwd_inner: 4622.64 | bwd_allreduce: 5.36 | step: 44.89
- 30%|███       | 1767/5800 [4:54:52<7:45:12,  6.92s/it]                                                       {'loss': 0.0615, 'grad_norm': 4.160454273223877, 'learning_rate': 3.259537429458726e-05, 'epoch': 15.23}
- 30%|███       | 1767/5800 [4:54:52<7:45:12,  6.92s/it]score1 tensor([[0.4414],
-        [0.4727],
-        [0.4863],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4648, 0.5938, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:29,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 13:41:29,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.95 | bwd_microstep: 4616.39 | bwd_inner_microstep: 4610.88 | bwd_allreduce_microstep: 5.40 | step_microstep: 43.74
-[2025-01-25 13:41:29,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.90 | bwd: 4616.42 | bwd_inner: 4610.88 | bwd_allreduce: 5.45 | step: 43.74
- 30%|███       | 1768/5800 [4:54:59<7:44:34,  6.91s/it]                                                       {'loss': 0.0513, 'grad_norm': 0.7448033690452576, 'learning_rate': 3.258669712750736e-05, 'epoch': 15.24}
- 30%|███       | 1768/5800 [4:54:59<7:44:34,  6.91s/it]score1 tensor([[0.4434],
-        [0.4277],
-        [0.4414],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.4668, 0.5391, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:36,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 13:41:36,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.12 | bwd_microstep: 4623.75 | bwd_inner_microstep: 4618.77 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.77
-[2025-01-25 13:41:36,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4623.78 | bwd_inner: 4618.77 | bwd_allreduce: 4.94 | step: 43.78
- 30%|███       | 1769/5800 [4:55:06<7:44:15,  6.91s/it]                                                       {'loss': 0.0649, 'grad_norm': 4.075119495391846, 'learning_rate': 3.2578016035676895e-05, 'epoch': 15.25}
- 30%|███       | 1769/5800 [4:55:06<7:44:15,  6.91s/it]score1 tensor([[0.3906],
-        [0.3945],
-        [0.4434],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4043, 0.6875, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0869, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:43,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 13:41:43,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.24 | bwd_microstep: 4616.89 | bwd_inner_microstep: 4611.94 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.01
-[2025-01-25 13:41:43,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.19 | bwd: 4616.91 | bwd_inner: 4611.94 | bwd_allreduce: 4.89 | step: 45.01
- 31%|███       | 1770/5800 [4:55:13<7:43:51,  6.91s/it]                                                       {'loss': 0.0869, 'grad_norm': 7.972666263580322, 'learning_rate': 3.256933102180278e-05, 'epoch': 15.26}
- 31%|███       | 1770/5800 [4:55:13<7:43:51,  6.91s/it]score1 tensor([[0.4453],
-        [0.5156],
-        [0.3516],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5391, 0.4238, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:50,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 13:41:50,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.41 | bwd_microstep: 4619.02 | bwd_inner_microstep: 4613.73 | bwd_allreduce_microstep: 5.11 | step_microstep: 43.27
-[2025-01-25 13:41:50,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.37 | bwd: 4619.04 | bwd_inner: 4613.73 | bwd_allreduce: 5.23 | step: 43.28
- 31%|███       | 1771/5800 [4:55:20<7:43:46,  6.91s/it]                                                       {'loss': 0.0674, 'grad_norm': 8.215972900390625, 'learning_rate': 3.256064208859315e-05, 'epoch': 15.27}
- 31%|███       | 1771/5800 [4:55:20<7:43:46,  6.91s/it]score1 tensor([[0.3555],
-        [0.4043],
-        [0.4297],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4316, 0.6289, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1030, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:41:57,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 13:41:57,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.15 | bwd_microstep: 4616.84 | bwd_inner_microstep: 4611.17 | bwd_allreduce_microstep: 5.56 | step_microstep: 43.82
-[2025-01-25 13:41:57,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.11 | bwd: 4616.87 | bwd_inner: 4611.17 | bwd_allreduce: 5.62 | step: 43.83
- 31%|███       | 1772/5800 [4:55:27<7:43:43,  6.91s/it]                                                       {'loss': 0.103, 'grad_norm': 7.839386463165283, 'learning_rate': 3.255194923875738e-05, 'epoch': 15.28}
- 31%|███       | 1772/5800 [4:55:27<7:43:43,  6.91s/it]score1 tensor([[0.4824],
-        [0.5273],
-        [0.4375],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6445, 0.4395, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:04,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 13:42:04,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4630.50 | bwd_inner_microstep: 4624.91 | bwd_allreduce_microstep: 5.50 | step_microstep: 47.45
-[2025-01-25 13:42:04,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4630.53 | bwd_inner: 4624.91 | bwd_allreduce: 5.55 | step: 47.46
- 31%|███       | 1773/5800 [4:55:34<7:44:01,  6.91s/it]                                                       {'loss': 0.0615, 'grad_norm': 0.9695403575897217, 'learning_rate': 3.2543252475006034e-05, 'epoch': 15.28}
- 31%|███       | 1773/5800 [4:55:34<7:44:01,  6.91s/it]score1 tensor([[0.5117],
-        [0.4902],
-        [0.5117],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4336, 0.5352, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:11,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 13:42:11,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.07 | bwd_microstep: 4622.96 | bwd_inner_microstep: 4614.21 | bwd_allreduce_microstep: 8.52 | step_microstep: 77.34
-[2025-01-25 13:42:11,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.01 | bwd: 4623.04 | bwd_inner: 4614.21 | bwd_allreduce: 8.65 | step: 77.32
- 31%|███       | 1774/5800 [4:55:41<7:45:00,  6.93s/it]                                                       {'loss': 0.0474, 'grad_norm': 4.70314884185791, 'learning_rate': 3.253455180005093e-05, 'epoch': 15.29}
- 31%|███       | 1774/5800 [4:55:41<7:45:00,  6.93s/it]score1 tensor([[0.5703],
-        [0.4844],
-        [0.5508],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4336, 0.5742, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:18,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.10 | optimizer_step: 4.36
-[2025-01-25 13:42:18,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.08 | bwd_microstep: 4640.75 | bwd_inner_microstep: 4630.87 | bwd_allreduce_microstep: 9.70 | step_microstep: 53.01
-[2025-01-25 13:42:18,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.97 | bwd: 4640.80 | bwd_inner: 4630.87 | bwd_allreduce: 9.79 | step: 52.99
- 31%|███       | 1775/5800 [4:55:48<7:45:10,  6.93s/it]                                                       {'loss': 0.0474, 'grad_norm': 0.8977437615394592, 'learning_rate': 3.252584721660508e-05, 'epoch': 15.3}
- 31%|███       | 1775/5800 [4:55:48<7:45:10,  6.93s/it]score1 tensor([[0.5391],
-        [0.5742],
-        [0.5859],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5117, 0.4922, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:25,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 13:42:25,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.72 | bwd_microstep: 4594.86 | bwd_inner_microstep: 4589.13 | bwd_allreduce_microstep: 5.65 | step_microstep: 47.09
-[2025-01-25 13:42:25,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.67 | bwd: 4594.88 | bwd_inner: 4589.13 | bwd_allreduce: 5.69 | step: 47.10
- 31%|███       | 1776/5800 [4:55:55<7:44:02,  6.92s/it]                                                       {'loss': 0.0493, 'grad_norm': 7.186075687408447, 'learning_rate': 3.2517138727382725e-05, 'epoch': 15.31}
- 31%|███       | 1776/5800 [4:55:55<7:44:02,  6.92s/it]score1 tensor([[0.6211],
-        [0.5703],
-        [0.5781],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.4297, 0.5703, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1138, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:32,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:42:32,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.70 | bwd_microstep: 4634.97 | bwd_inner_microstep: 4630.17 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.51
-[2025-01-25 13:42:32,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.67 | bwd: 4635.00 | bwd_inner: 4630.17 | bwd_allreduce: 4.74 | step: 43.56
- 31%|███       | 1777/5800 [4:56:02<7:44:04,  6.92s/it]                                                       {'loss': 0.1138, 'grad_norm': 4.933861255645752, 'learning_rate': 3.250842633509931e-05, 'epoch': 15.32}
- 31%|███       | 1777/5800 [4:56:02<7:44:04,  6.92s/it]score1 tensor([[0.6133],
-        [0.5547],
-        [0.5547],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4902, 0.4980, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0728, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:39,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 13:42:39,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.65 | bwd_microstep: 4634.82 | bwd_inner_microstep: 4629.95 | bwd_allreduce_microstep: 4.78 | step_microstep: 47.69
-[2025-01-25 13:42:39,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.59 | bwd: 4634.84 | bwd_inner: 4629.95 | bwd_allreduce: 4.82 | step: 47.70
- 31%|███       | 1778/5800 [4:56:09<7:43:54,  6.92s/it]                                                       {'loss': 0.0728, 'grad_norm': 9.681526184082031, 'learning_rate': 3.2499710042471544e-05, 'epoch': 15.33}
- 31%|███       | 1778/5800 [4:56:09<7:43:54,  6.92s/it]score1 tensor([[0.5469],
-        [0.5273],
-        [0.5039],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.5625, 0.4824, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:45,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 13:42:45,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.85 | bwd_microstep: 4638.66 | bwd_inner_microstep: 4632.92 | bwd_allreduce_microstep: 5.58 | step_microstep: 52.30
-[2025-01-25 13:42:45,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.81 | bwd: 4638.69 | bwd_inner: 4632.92 | bwd_allreduce: 5.66 | step: 52.31
- 31%|███       | 1779/5800 [4:56:15<7:44:21,  6.93s/it]                                                       {'loss': 0.042, 'grad_norm': 0.7429052591323853, 'learning_rate': 3.2490989852217285e-05, 'epoch': 15.34}
- 31%|███       | 1779/5800 [4:56:15<7:44:21,  6.93s/it]score1 tensor([[0.4082],
-        [0.4707],
-        [0.5156],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4785, 0.4883, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:52,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 13:42:52,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.33 | bwd_microstep: 4639.60 | bwd_inner_microstep: 4634.54 | bwd_allreduce_microstep: 4.99 | step_microstep: 45.47
-[2025-01-25 13:42:52,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.29 | bwd: 4639.63 | bwd_inner: 4634.54 | bwd_allreduce: 5.03 | step: 45.47
- 31%|███       | 1780/5800 [4:56:22<7:44:08,  6.93s/it]                                                       {'loss': 0.0273, 'grad_norm': 0.691450297832489, 'learning_rate': 3.248226576705566e-05, 'epoch': 15.34}
- 31%|███       | 1780/5800 [4:56:22<7:44:08,  6.93s/it]score1 tensor([[0.4941],
-        [0.4629],
-        [0.3984],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.3730, 0.4141, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:42:59,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 13:42:59,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.39 | bwd_microstep: 4637.44 | bwd_inner_microstep: 4631.88 | bwd_allreduce_microstep: 5.40 | step_microstep: 45.90
-[2025-01-25 13:42:59,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.35 | bwd: 4637.47 | bwd_inner: 4631.88 | bwd_allreduce: 5.49 | step: 45.91
- 31%|███       | 1781/5800 [4:56:29<7:43:59,  6.93s/it]                                                       {'loss': 0.0684, 'grad_norm': 0.8008298277854919, 'learning_rate': 3.247353778970698e-05, 'epoch': 15.35}
- 31%|███       | 1781/5800 [4:56:29<7:43:59,  6.93s/it]score1 tensor([[0.4590],
-        [0.4609],
-        [0.3926],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3652, 0.4219, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:06,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 13:43:06,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.26 | bwd_microstep: 4636.88 | bwd_inner_microstep: 4631.40 | bwd_allreduce_microstep: 5.38 | step_microstep: 46.47
-[2025-01-25 13:43:06,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.23 | bwd: 4636.92 | bwd_inner: 4631.40 | bwd_allreduce: 5.43 | step: 46.48
- 31%|███       | 1782/5800 [4:56:36<7:43:52,  6.93s/it]                                                       {'loss': 0.0762, 'grad_norm': 0.522159218788147, 'learning_rate': 3.2464805922892786e-05, 'epoch': 15.36}
- 31%|███       | 1782/5800 [4:56:36<7:43:52,  6.93s/it]score1 tensor([[0.4297],
-        [0.4668],
-        [0.3691],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4668, 0.5039, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0786, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:13,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 13:43:13,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.04 | bwd_microstep: 4583.68 | bwd_inner_microstep: 4577.95 | bwd_allreduce_microstep: 5.60 | step_microstep: 48.06
-[2025-01-25 13:43:13,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.02 | bwd: 4583.72 | bwd_inner: 4577.95 | bwd_allreduce: 5.66 | step: 48.09
- 31%|███       | 1783/5800 [4:56:43<7:43:01,  6.92s/it]                                                       {'loss': 0.0786, 'grad_norm': 6.008127212524414, 'learning_rate': 3.245607016933582e-05, 'epoch': 15.37}
- 31%|███       | 1783/5800 [4:56:43<7:43:01,  6.92s/it]score1 tensor([[0.4414],
-        [0.4629],
-        [0.3809],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5078, 0.4395, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:20,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.12 | optimizer_step: 4.47
-[2025-01-25 13:43:20,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.97 | bwd_microstep: 4653.63 | bwd_inner_microstep: 4644.48 | bwd_allreduce_microstep: 8.86 | step_microstep: 93.64
-[2025-01-25 13:43:20,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.93 | bwd: 4653.70 | bwd_inner: 4644.48 | bwd_allreduce: 9.01 | step: 93.62
- 31%|███       | 1784/5800 [4:56:50<7:44:47,  6.94s/it]                                                       {'loss': 0.0796, 'grad_norm': 8.432671546936035, 'learning_rate': 3.244733053176003e-05, 'epoch': 15.38}
- 31%|███       | 1784/5800 [4:56:50<7:44:47,  6.94s/it]score1 tensor([[0.3750],
-        [0.4961],
-        [0.4219],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5781, 0.3652, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:27,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 13:43:27,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.20 | bwd_microstep: 4633.24 | bwd_inner_microstep: 4628.48 | bwd_allreduce_microstep: 4.67 | step_microstep: 45.04
-[2025-01-25 13:43:27,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.13 | bwd: 4633.26 | bwd_inner: 4628.48 | bwd_allreduce: 4.71 | step: 45.05
- 31%|███       | 1785/5800 [4:56:57<7:44:32,  6.94s/it]                                                       {'loss': 0.0664, 'grad_norm': 4.088449954986572, 'learning_rate': 3.243858701289061e-05, 'epoch': 15.39}
- 31%|███       | 1785/5800 [4:56:57<7:44:32,  6.94s/it]score1 tensor([[0.5469],
-        [0.3984],
-        [0.4531],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3438, 0.5039, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:34,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 13:43:34,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.00 | bwd_microstep: 4640.85 | bwd_inner_microstep: 4635.02 | bwd_allreduce_microstep: 5.73 | step_microstep: 41.74
-[2025-01-25 13:43:34,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.82 | bwd: 4640.87 | bwd_inner: 4635.02 | bwd_allreduce: 5.78 | step: 41.74
- 31%|███       | 1786/5800 [4:57:04<7:43:57,  6.94s/it]                                                       {'loss': 0.0361, 'grad_norm': 2.0355043411254883, 'learning_rate': 3.242983961545394e-05, 'epoch': 15.4}
- 31%|███       | 1786/5800 [4:57:04<7:43:57,  6.94s/it]score1 tensor([[0.5039],
-        [0.4238],
-        [0.4609],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.3340, 0.4160, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:41,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 13:43:41,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.54 | bwd_microstep: 4588.98 | bwd_inner_microstep: 4583.80 | bwd_allreduce_microstep: 5.02 | step_microstep: 48.46
-[2025-01-25 13:43:41,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.50 | bwd: 4589.02 | bwd_inner: 4583.80 | bwd_allreduce: 5.10 | step: 48.47
- 31%|███       | 1787/5800 [4:57:11<7:42:40,  6.92s/it]                                                       {'loss': 0.061, 'grad_norm': 2.002364158630371, 'learning_rate': 3.2421088342177595e-05, 'epoch': 15.41}
- 31%|███       | 1787/5800 [4:57:11<7:42:40,  6.92s/it]score1 tensor([[0.4824],
-        [0.4746],
-        [0.5273],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4746, 0.5508, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:48,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:43:48,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.37 | bwd_microstep: 4579.36 | bwd_inner_microstep: 4574.59 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.42
-[2025-01-25 13:43:48,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.35 | bwd: 4579.39 | bwd_inner: 4574.59 | bwd_allreduce: 4.73 | step: 42.43
- 31%|███       | 1788/5800 [4:57:18<7:41:25,  6.90s/it]                                                       {'loss': 0.0376, 'grad_norm': 2.28433895111084, 'learning_rate': 3.2412333195790377e-05, 'epoch': 15.41}
- 31%|███       | 1788/5800 [4:57:18<7:41:25,  6.90s/it]score1 tensor([[0.5664],
-        [0.5352],
-        [0.4902],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4941, 0.4629, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:43:55,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 13:43:55,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.17 | bwd_microstep: 4633.99 | bwd_inner_microstep: 4628.32 | bwd_allreduce_microstep: 5.56 | step_microstep: 46.12
-[2025-01-25 13:43:55,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.11 | bwd: 4634.02 | bwd_inner: 4628.32 | bwd_allreduce: 5.62 | step: 46.12
- 31%|███       | 1789/5800 [4:57:25<7:41:45,  6.91s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.6861523985862732, 'learning_rate': 3.24035741790223e-05, 'epoch': 15.42}
- 31%|███       | 1789/5800 [4:57:25<7:41:45,  6.91s/it]score1 tensor([[0.6289],
-        [0.5156],
-        [0.4551],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.5508, 0.4375, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:02,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:44:02,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.25 | bwd_microstep: 4639.52 | bwd_inner_microstep: 4634.12 | bwd_allreduce_microstep: 5.31 | step_microstep: 43.57
-[2025-01-25 13:44:02,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.20 | bwd: 4639.55 | bwd_inner: 4634.12 | bwd_allreduce: 5.36 | step: 43.58
- 31%|███       | 1790/5800 [4:57:32<7:42:02,  6.91s/it]                                                       {'loss': 0.0444, 'grad_norm': 0.8099661469459534, 'learning_rate': 3.239481129460457e-05, 'epoch': 15.43}
- 31%|███       | 1790/5800 [4:57:32<7:42:02,  6.91s/it]score1 tensor([[0.5195],
-        [0.6445],
-        [0.4902],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6445, 0.5586, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:08,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 13:44:08,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.18 | bwd_microstep: 4580.48 | bwd_inner_microstep: 4575.19 | bwd_allreduce_microstep: 5.15 | step_microstep: 45.94
-[2025-01-25 13:44:08,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.13 | bwd: 4580.51 | bwd_inner: 4575.19 | bwd_allreduce: 5.23 | step: 45.96
- 31%|███       | 1791/5800 [4:57:38<7:41:02,  6.90s/it]                                                       {'loss': 0.0547, 'grad_norm': 2.316786527633667, 'learning_rate': 3.238604454526961e-05, 'epoch': 15.44}
- 31%|███       | 1791/5800 [4:57:38<7:41:02,  6.90s/it]score1 tensor([[0.5820],
-        [0.5508],
-        [0.5312],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5391, 0.4941, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:15,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.64 | optimizer_step: 4.36
-[2025-01-25 13:44:15,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.10 | bwd_microstep: 4643.47 | bwd_inner_microstep: 4635.74 | bwd_allreduce_microstep: 7.48 | step_microstep: 62.58
-[2025-01-25 13:44:15,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.08 | bwd: 4643.55 | bwd_inner: 4635.74 | bwd_allreduce: 7.60 | step: 62.61
- 31%|███       | 1792/5800 [4:57:45<7:41:47,  6.91s/it]                                                       {'loss': 0.0552, 'grad_norm': 9.701289176940918, 'learning_rate': 3.237727393375105e-05, 'epoch': 15.45}
- 31%|███       | 1792/5800 [4:57:45<7:41:47,  6.91s/it]score1 tensor([[0.5586],
-        [0.5352],
-        [0.4570],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6406, 0.4199, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:22,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 13:44:22,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.35 | bwd_microstep: 4638.81 | bwd_inner_microstep: 4630.04 | bwd_allreduce_microstep: 8.55 | step_microstep: 47.97
-[2025-01-25 13:44:22,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.31 | bwd: 4638.88 | bwd_inner: 4630.04 | bwd_allreduce: 8.65 | step: 47.94
- 31%|███       | 1793/5800 [4:57:52<7:42:36,  6.93s/it]                                                       {'loss': 0.0405, 'grad_norm': 4.810572624206543, 'learning_rate': 3.236849946278371e-05, 'epoch': 15.46}
- 31%|███       | 1793/5800 [4:57:52<7:42:36,  6.93s/it]score1 tensor([[0.5039],
-        [0.5469],
-        [0.6172],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5039, 0.5742, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:29,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 13:44:29,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.57 | bwd_microstep: 4642.36 | bwd_inner_microstep: 4637.10 | bwd_allreduce_microstep: 5.11 | step_microstep: 63.56
-[2025-01-25 13:44:29,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.53 | bwd: 4642.40 | bwd_inner: 4637.10 | bwd_allreduce: 5.18 | step: 63.57
- 31%|███       | 1794/5800 [4:57:59<7:44:17,  6.95s/it]                                                       {'loss': 0.0605, 'grad_norm': 4.822843551635742, 'learning_rate': 3.2359721135103624e-05, 'epoch': 15.47}
- 31%|███       | 1794/5800 [4:57:59<7:44:17,  6.95s/it]score1 tensor([[0.5078],
-        [0.6562],
-        [0.4512],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.6094, 0.4141, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:36,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 13:44:36,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.48 | bwd_microstep: 4636.98 | bwd_inner_microstep: 4629.69 | bwd_allreduce_microstep: 7.07 | step_microstep: 48.28
-[2025-01-25 13:44:36,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.41 | bwd: 4637.04 | bwd_inner: 4629.69 | bwd_allreduce: 7.18 | step: 48.29
- 31%|███       | 1795/5800 [4:58:06<7:44:01,  6.95s/it]                                                       {'loss': 0.061, 'grad_norm': 5.444936275482178, 'learning_rate': 3.235093895344803e-05, 'epoch': 15.47}
- 31%|███       | 1795/5800 [4:58:06<7:44:01,  6.95s/it]score1 tensor([[0.5664],
-        [0.5625],
-        [0.4434],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5625, 0.3711, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:43,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 13:44:43,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.62 | bwd_microstep: 4583.27 | bwd_inner_microstep: 4578.27 | bwd_allreduce_microstep: 4.92 | step_microstep: 45.88
-[2025-01-25 13:44:43,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.56 | bwd: 4583.30 | bwd_inner: 4578.26 | bwd_allreduce: 4.97 | step: 45.88
- 31%|███       | 1796/5800 [4:58:13<7:42:19,  6.93s/it]                                                       {'loss': 0.0269, 'grad_norm': 6.557787895202637, 'learning_rate': 3.234215292055535e-05, 'epoch': 15.48}
- 31%|███       | 1796/5800 [4:58:13<7:42:19,  6.93s/it]score1 tensor([[0.4941],
-        [0.4609],
-        [0.5820],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5625, 0.5664, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:50,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 13:44:50,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.15 | bwd_microstep: 4637.13 | bwd_inner_microstep: 4631.68 | bwd_allreduce_microstep: 5.36 | step_microstep: 45.47
-[2025-01-25 13:44:50,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.12 | bwd: 4637.15 | bwd_inner: 4631.68 | bwd_allreduce: 5.41 | step: 45.48
- 31%|███       | 1797/5800 [4:58:20<7:42:00,  6.92s/it]                                                       {'loss': 0.0508, 'grad_norm': 4.162106990814209, 'learning_rate': 3.233336303916525e-05, 'epoch': 15.49}
- 31%|███       | 1797/5800 [4:58:20<7:42:00,  6.92s/it]score1 tensor([[0.4434],
-        [0.4590],
-        [0.3633],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5469, 0.4922, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:44:57,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 13:44:57,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.43 | bwd_microstep: 4634.24 | bwd_inner_microstep: 4629.31 | bwd_allreduce_microstep: 4.83 | step_microstep: 46.83
-[2025-01-25 13:44:57,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.41 | bwd: 4634.26 | bwd_inner: 4629.31 | bwd_allreduce: 4.88 | step: 46.85
- 31%|███       | 1798/5800 [4:58:27<7:41:42,  6.92s/it]                                                       {'loss': 0.0918, 'grad_norm': 29.446874618530273, 'learning_rate': 3.232456931201855e-05, 'epoch': 15.5}
- 31%|███       | 1798/5800 [4:58:27<7:41:42,  6.92s/it]score1 tensor([[0.4160],
-        [0.5977],
-        [0.3281],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5547, 0.4004, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:04,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 13:45:04,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.66 | bwd_microstep: 4640.76 | bwd_inner_microstep: 4635.98 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.48
-[2025-01-25 13:45:04,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4640.78 | bwd_inner: 4635.98 | bwd_allreduce: 4.73 | step: 41.49
- 31%|███       | 1799/5800 [4:58:34<7:41:27,  6.92s/it]                                                       {'loss': 0.0918, 'grad_norm': 3.715151786804199, 'learning_rate': 3.231577174185729e-05, 'epoch': 15.51}
- 31%|███       | 1799/5800 [4:58:34<7:41:27,  6.92s/it]score1 tensor([[0.4434],
-        [0.4297],
-        [0.5703],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4941, 0.6094, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0972, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:11,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 13:45:11,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.58 | bwd_microstep: 4634.39 | bwd_inner_microstep: 4629.77 | bwd_allreduce_microstep: 4.53 | step_microstep: 40.51
-[2025-01-25 13:45:11,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.55 | bwd: 4634.42 | bwd_inner: 4629.77 | bwd_allreduce: 4.58 | step: 40.52
- 31%|███       | 1800/5800 [4:58:41<7:41:06,  6.92s/it]                                                       {'loss': 0.0972, 'grad_norm': 8.720622062683105, 'learning_rate': 3.23069703314247e-05, 'epoch': 15.52}
- 31%|███       | 1800/5800 [4:58:41<7:41:06,  6.92s/it]score1 tensor([[0.4219],
-        [0.4688],
-        [0.4414],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5391, 0.5781, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1084, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:18,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 13:45:18,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.48 | bwd_microstep: 4631.61 | bwd_inner_microstep: 4626.66 | bwd_allreduce_microstep: 4.87 | step_microstep: 46.96
-[2025-01-25 13:45:18,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.44 | bwd: 4631.64 | bwd_inner: 4626.66 | bwd_allreduce: 4.91 | step: 46.97
- 31%|███       | 1801/5800 [4:58:48<7:41:09,  6.92s/it]                                                       {'loss': 0.1084, 'grad_norm': 8.947983741760254, 'learning_rate': 3.229816508346521e-05, 'epoch': 15.53}
- 31%|███       | 1801/5800 [4:58:48<7:41:09,  6.92s/it]score1 tensor([[0.4590],
-        [0.3867],
-        [0.6992],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4434, 0.6797, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:25,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 13:45:25,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.43 | bwd_microstep: 4634.15 | bwd_inner_microstep: 4629.60 | bwd_allreduce_microstep: 4.46 | step_microstep: 57.87
-[2025-01-25 13:45:25,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.39 | bwd: 4634.18 | bwd_inner: 4629.60 | bwd_allreduce: 4.51 | step: 57.88
- 31%|███       | 1802/5800 [4:58:55<7:41:16,  6.92s/it]                                                       {'loss': 0.0547, 'grad_norm': 100.9085922241211, 'learning_rate': 3.228935600072446e-05, 'epoch': 15.53}
- 31%|███       | 1802/5800 [4:58:55<7:41:16,  6.92s/it]score1 tensor([[0.4609],
-        [0.5078],
-        [0.5508],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4805, 0.4844, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:32,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:45:32,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.83 | bwd_microstep: 4648.03 | bwd_inner_microstep: 4640.57 | bwd_allreduce_microstep: 7.25 | step_microstep: 73.17
-[2025-01-25 13:45:32,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.79 | bwd: 4648.08 | bwd_inner: 4640.57 | bwd_allreduce: 7.36 | step: 73.18
- 31%|███       | 1803/5800 [4:59:02<7:42:47,  6.95s/it]                                                       {'loss': 0.0303, 'grad_norm': 0.9589237570762634, 'learning_rate': 3.228054308594927e-05, 'epoch': 15.54}
- 31%|███       | 1803/5800 [4:59:02<7:42:47,  6.95s/it]score1 tensor([[0.5195],
-        [0.5469],
-        [0.4785],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4277, 0.3867, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:39,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 13:45:39,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.29 | bwd_microstep: 4639.46 | bwd_inner_microstep: 4634.41 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.41
-[2025-01-25 13:45:39,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.24 | bwd: 4639.49 | bwd_inner: 4634.41 | bwd_allreduce: 5.01 | step: 43.41
- 31%|███       | 1804/5800 [4:59:09<7:42:16,  6.94s/it]                                                       {'loss': 0.1094, 'grad_norm': 9.322770118713379, 'learning_rate': 3.227172634188766e-05, 'epoch': 15.55}
- 31%|███       | 1804/5800 [4:59:09<7:42:16,  6.94s/it]score1 tensor([[0.7266],
-        [0.5469],
-        [0.6289],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5078, 0.5781, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:46,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 13:45:46,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.52 | bwd_microstep: 4637.93 | bwd_inner_microstep: 4629.59 | bwd_allreduce_microstep: 8.14 | step_microstep: 85.52
-[2025-01-25 13:45:46,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.48 | bwd: 4637.99 | bwd_inner: 4629.59 | bwd_allreduce: 8.25 | step: 85.51
- 31%|███       | 1805/5800 [4:59:16<7:43:11,  6.96s/it]                                                       {'loss': 0.0605, 'grad_norm': 5.466519355773926, 'learning_rate': 3.226290577128885e-05, 'epoch': 15.56}
- 31%|███       | 1805/5800 [4:59:16<7:43:11,  6.96s/it]score1 tensor([[0.5469],
-        [0.6797],
-        [0.5820],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5625, 0.5352, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:45:53,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 13:45:53,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.18 | bwd_microstep: 4633.58 | bwd_inner_microstep: 4627.83 | bwd_allreduce_microstep: 5.58 | step_microstep: 43.82
-[2025-01-25 13:45:53,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.14 | bwd: 4633.60 | bwd_inner: 4627.83 | bwd_allreduce: 5.67 | step: 43.83
- 31%|███       | 1806/5800 [4:59:23<7:42:32,  6.95s/it]                                                       {'loss': 0.0918, 'grad_norm': 10.09577751159668, 'learning_rate': 3.2254081376903236e-05, 'epoch': 15.57}
- 31%|███       | 1806/5800 [4:59:23<7:42:32,  6.95s/it]score1 tensor([[0.6406],
-        [0.5742],
-        [0.4629],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4258, 0.5039, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:00,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 13:46:00,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.30 | bwd_microstep: 4639.50 | bwd_inner_microstep: 4633.98 | bwd_allreduce_microstep: 5.43 | step_microstep: 48.59
-[2025-01-25 13:46:00,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.26 | bwd: 4639.52 | bwd_inner: 4633.98 | bwd_allreduce: 5.47 | step: 48.59
- 31%|███       | 1807/5800 [4:59:29<7:42:06,  6.94s/it]                                                       {'loss': 0.0776, 'grad_norm': 5.20902156829834, 'learning_rate': 3.224525316148242e-05, 'epoch': 15.58}
- 31%|███       | 1807/5800 [4:59:29<7:42:06,  6.94s/it]score1 tensor([[0.5156],
-        [0.3945],
-        [0.6016],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4512, 0.4805, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:06,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 13:46:06,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.82 | bwd_microstep: 4637.26 | bwd_inner_microstep: 4632.18 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.34
-[2025-01-25 13:46:06,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.77 | bwd: 4637.28 | bwd_inner: 4632.18 | bwd_allreduce: 5.02 | step: 42.36
- 31%|███       | 1808/5800 [4:59:36<7:41:28,  6.94s/it]                                                       {'loss': 0.0737, 'grad_norm': 3.837731122970581, 'learning_rate': 3.2236421127779195e-05, 'epoch': 15.59}
- 31%|███       | 1808/5800 [4:59:36<7:41:28,  6.94s/it]score1 tensor([[0.5039],
-        [0.5234],
-        [0.5273],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4375, 0.6445, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0962, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:13,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 13:46:13,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.93 | bwd_microstep: 4640.62 | bwd_inner_microstep: 4635.48 | bwd_allreduce_microstep: 5.02 | step_microstep: 43.87
-[2025-01-25 13:46:13,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.89 | bwd: 4640.65 | bwd_inner: 4635.48 | bwd_allreduce: 5.10 | step: 43.87
- 31%|███       | 1809/5800 [4:59:43<7:41:04,  6.93s/it]                                                       {'loss': 0.0962, 'grad_norm': 0.7584691047668457, 'learning_rate': 3.222758527854754e-05, 'epoch': 15.59}
- 31%|███       | 1809/5800 [4:59:43<7:41:04,  6.93s/it]score1 tensor([[0.4297],
-        [0.5234],
-        [0.5117],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.6172, 0.4941, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:20,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 13:46:20,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.42 | bwd_microstep: 4637.24 | bwd_inner_microstep: 4632.05 | bwd_allreduce_microstep: 5.08 | step_microstep: 46.41
-[2025-01-25 13:46:20,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.39 | bwd: 4637.27 | bwd_inner: 4632.05 | bwd_allreduce: 5.14 | step: 46.42
- 31%|███       | 1810/5800 [4:59:50<7:40:38,  6.93s/it]                                                       {'loss': 0.0664, 'grad_norm': 4.574215412139893, 'learning_rate': 3.221874561654263e-05, 'epoch': 15.6}
- 31%|███       | 1810/5800 [4:59:50<7:40:38,  6.93s/it]score1 tensor([[0.4180],
-        [0.4473],
-        [0.4512],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4473, 0.6367, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0830, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:27,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 13:46:27,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4576.49 | bwd_inner_microstep: 4570.76 | bwd_allreduce_microstep: 5.62 | step_microstep: 43.84
-[2025-01-25 13:46:27,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4576.51 | bwd_inner: 4570.76 | bwd_allreduce: 5.67 | step: 43.85
- 31%|███       | 1811/5800 [4:59:57<7:39:12,  6.91s/it]                                                       {'loss': 0.083, 'grad_norm': 6.178772449493408, 'learning_rate': 3.2209902144520826e-05, 'epoch': 15.61}
- 31%|███       | 1811/5800 [4:59:57<7:39:12,  6.91s/it]score1 tensor([[0.4043],
-        [0.4355],
-        [0.4727],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4922, 0.6875, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1162, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:34,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.41 | optimizer_step: 4.37
-[2025-01-25 13:46:34,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2174.86 | bwd_microstep: 4647.36 | bwd_inner_microstep: 4638.85 | bwd_allreduce_microstep: 8.32 | step_microstep: 76.83
-[2025-01-25 13:46:34,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2174.82 | bwd: 4647.42 | bwd_inner: 4638.85 | bwd_allreduce: 8.42 | step: 76.82
- 31%|███       | 1812/5800 [5:00:04<7:40:47,  6.93s/it]                                                       {'loss': 0.1162, 'grad_norm': 8.077237129211426, 'learning_rate': 3.2201054865239676e-05, 'epoch': 15.62}
- 31%|███       | 1812/5800 [5:00:04<7:40:47,  6.93s/it]score1 tensor([[0.5586],
-        [0.3574],
-        [0.2715],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.3789, 0.1787, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0559, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:41,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 13:46:41,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.47 | bwd_microstep: 4637.39 | bwd_inner_microstep: 4630.53 | bwd_allreduce_microstep: 6.54 | step_microstep: 57.65
-[2025-01-25 13:46:41,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.43 | bwd: 4637.47 | bwd_inner: 4630.53 | bwd_allreduce: 6.72 | step: 57.64
- 31%|███▏      | 1813/5800 [5:00:11<7:41:10,  6.94s/it]                                                       {'loss': 0.0559, 'grad_norm': 4.520226955413818, 'learning_rate': 3.219220378145791e-05, 'epoch': 15.63}
- 31%|███▏      | 1813/5800 [5:00:11<7:41:10,  6.94s/it]score1 tensor([[0.4395],
-        [0.5664],
-        [0.3945],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6875, 0.4492, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0747, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:48,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:46:48,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4634.46 | bwd_inner_microstep: 4629.17 | bwd_allreduce_microstep: 5.18 | step_microstep: 43.79
-[2025-01-25 13:46:48,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.58 | bwd: 4634.49 | bwd_inner: 4629.17 | bwd_allreduce: 5.24 | step: 43.79
- 31%|███▏      | 1814/5800 [5:00:18<7:40:37,  6.93s/it]                                                       {'loss': 0.0747, 'grad_norm': 8.629796981811523, 'learning_rate': 3.218334889593544e-05, 'epoch': 15.64}
- 31%|███▏      | 1814/5800 [5:00:18<7:40:37,  6.93s/it]score1 tensor([[0.5312],
-        [0.4355],
-        [0.5078],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.5781, 0.4570, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0874, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:46:55,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 13:46:55,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.07 | bwd_microstep: 4628.56 | bwd_inner_microstep: 4623.15 | bwd_allreduce_microstep: 5.31 | step_microstep: 45.55
-[2025-01-25 13:46:55,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.03 | bwd: 4628.59 | bwd_inner: 4623.15 | bwd_allreduce: 5.37 | step: 45.56
- 31%|███▏      | 1815/5800 [5:00:25<7:40:08,  6.93s/it]                                                       {'loss': 0.0874, 'grad_norm': 1.1118355989456177, 'learning_rate': 3.217449021143339e-05, 'epoch': 15.65}
- 31%|███▏      | 1815/5800 [5:00:25<7:40:08,  6.93s/it]score1 tensor([[0.4727],
-        [0.4297],
-        [0.5078],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.3984, 0.4277, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:02,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 13:47:02,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.11 | bwd_microstep: 4632.99 | bwd_inner_microstep: 4626.70 | bwd_allreduce_microstep: 6.10 | step_microstep: 47.85
-[2025-01-25 13:47:02,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.06 | bwd: 4633.01 | bwd_inner: 4626.70 | bwd_allreduce: 6.23 | step: 47.86
- 31%|███▏      | 1816/5800 [5:00:32<7:39:52,  6.93s/it]                                                       {'loss': 0.0415, 'grad_norm': 8.33837604522705, 'learning_rate': 3.216562773071405e-05, 'epoch': 15.66}
- 31%|███▏      | 1816/5800 [5:00:32<7:39:52,  6.93s/it]score1 tensor([[0.5352],
-        [0.4883],
-        [0.5430],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5430, 0.6250, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:09,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 13:47:09,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.19 | bwd_microstep: 4637.10 | bwd_inner_microstep: 4631.91 | bwd_allreduce_microstep: 5.09 | step_microstep: 44.61
-[2025-01-25 13:47:09,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.16 | bwd: 4637.12 | bwd_inner: 4631.91 | bwd_allreduce: 5.14 | step: 44.62
- 31%|███▏      | 1817/5800 [5:00:39<7:39:35,  6.92s/it]                                                       {'loss': 0.04, 'grad_norm': 0.7714766263961792, 'learning_rate': 3.2156761456540896e-05, 'epoch': 15.66}
- 31%|███▏      | 1817/5800 [5:00:39<7:39:35,  6.92s/it]score1 tensor([[0.4199],
-        [0.3848],
-        [0.5156],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4336, 0.4453, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:16,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:47:16,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.34 | bwd_microstep: 4627.59 | bwd_inner_microstep: 4622.97 | bwd_allreduce_microstep: 4.53 | step_microstep: 42.35
-[2025-01-25 13:47:16,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.30 | bwd: 4627.61 | bwd_inner: 4622.97 | bwd_allreduce: 4.57 | step: 42.35
- 31%|███▏      | 1818/5800 [5:00:46<7:39:04,  6.92s/it]                                                       {'loss': 0.0439, 'grad_norm': 0.7243660688400269, 'learning_rate': 3.214789139167858e-05, 'epoch': 15.67}
- 31%|███▏      | 1818/5800 [5:00:46<7:39:04,  6.92s/it]score1 tensor([[0.5352],
-        [0.4238],
-        [0.5078],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4375, 0.5352, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:23,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 13:47:23,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4637.75 | bwd_inner_microstep: 4631.38 | bwd_allreduce_microstep: 6.27 | step_microstep: 54.29
-[2025-01-25 13:47:23,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.81 | bwd: 4637.78 | bwd_inner: 4631.38 | bwd_allreduce: 6.32 | step: 54.30
- 31%|███▏      | 1819/5800 [5:00:53<7:39:01,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.5893488526344299, 'learning_rate': 3.2139017538892944e-05, 'epoch': 15.68}
- 31%|███▏      | 1819/5800 [5:00:53<7:39:01,  6.92s/it]score1 tensor([[0.4707],
-        [0.4141],
-        [0.4473],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4492, 0.5391, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:30,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.08 | optimizer_step: 4.36
-[2025-01-25 13:47:30,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.06 | bwd_microstep: 4635.86 | bwd_inner_microstep: 4631.34 | bwd_allreduce_microstep: 4.44 | step_microstep: 51.38
-[2025-01-25 13:47:30,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.02 | bwd: 4635.88 | bwd_inner: 4631.34 | bwd_allreduce: 4.47 | step: 51.38
- 31%|███▏      | 1820/5800 [5:00:59<7:39:27,  6.93s/it]                                                       {'loss': 0.0776, 'grad_norm': 4.562873363494873, 'learning_rate': 3.2130139900951e-05, 'epoch': 15.69}
- 31%|███▏      | 1820/5800 [5:00:59<7:39:27,  6.93s/it]score1 tensor([[0.4297],
-        [0.3516],
-        [0.4473],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5117, 0.5469, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0991, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:36,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:47:36,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.63 | bwd_microstep: 4630.00 | bwd_inner_microstep: 4622.51 | bwd_allreduce_microstep: 7.27 | step_microstep: 50.03
-[2025-01-25 13:47:36,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.58 | bwd: 4630.05 | bwd_inner: 4622.51 | bwd_allreduce: 7.40 | step: 50.01
- 31%|███▏      | 1821/5800 [5:01:06<7:39:29,  6.93s/it]                                                       {'loss': 0.0991, 'grad_norm': 8.401921272277832, 'learning_rate': 3.212125848062097e-05, 'epoch': 15.7}
- 31%|███▏      | 1821/5800 [5:01:06<7:39:29,  6.93s/it]score1 tensor([[0.4258],
-        [0.5781],
-        [0.3770],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5664, 0.4590, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0923, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:43,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 13:47:43,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.71 | bwd_microstep: 4630.88 | bwd_inner_microstep: 4626.11 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.03
-[2025-01-25 13:47:43,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.67 | bwd: 4630.90 | bwd_inner: 4626.11 | bwd_allreduce: 4.72 | step: 43.04
- 31%|███▏      | 1822/5800 [5:01:13<7:38:53,  6.92s/it]                                                       {'loss': 0.0923, 'grad_norm': 4.272563457489014, 'learning_rate': 3.2112373280672215e-05, 'epoch': 15.71}
- 31%|███▏      | 1822/5800 [5:01:13<7:38:53,  6.92s/it]score1 tensor([[0.5312],
-        [0.4434],
-        [0.5039],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4551, 0.4160, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0962, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:50,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 13:47:50,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.04 | bwd_microstep: 4634.53 | bwd_inner_microstep: 4629.70 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.52
-[2025-01-25 13:47:50,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.00 | bwd: 4634.55 | bwd_inner: 4629.70 | bwd_allreduce: 4.78 | step: 42.53
- 31%|███▏      | 1823/5800 [5:01:20<7:38:34,  6.92s/it]                                                       {'loss': 0.0962, 'grad_norm': 4.496738910675049, 'learning_rate': 3.210348430387531e-05, 'epoch': 15.72}
- 31%|███▏      | 1823/5800 [5:01:20<7:38:34,  6.92s/it]score1 tensor([[0.5195],
-        [0.5977],
-        [0.6250],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4531, 0.4961, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:47:57,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 13:47:57,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.80 | bwd_microstep: 4632.35 | bwd_inner_microstep: 4627.39 | bwd_allreduce_microstep: 4.83 | step_microstep: 52.15
-[2025-01-25 13:47:57,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.75 | bwd: 4632.39 | bwd_inner: 4627.39 | bwd_allreduce: 4.90 | step: 52.16
- 31%|███▏      | 1824/5800 [5:01:27<7:38:32,  6.92s/it]                                                       {'loss': 0.0898, 'grad_norm': 6.349094867706299, 'learning_rate': 3.209459155300198e-05, 'epoch': 15.72}
- 31%|███▏      | 1824/5800 [5:01:27<7:38:32,  6.92s/it]score1 tensor([[0.5547],
-        [0.6250],
-        [0.6172],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5234, 0.6055, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1221, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:04,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 13:48:04,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.68 | bwd_microstep: 4629.37 | bwd_inner_microstep: 4624.40 | bwd_allreduce_microstep: 4.88 | step_microstep: 56.58
-[2025-01-25 13:48:04,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.65 | bwd: 4629.40 | bwd_inner: 4624.40 | bwd_allreduce: 4.93 | step: 56.59
- 31%|███▏      | 1825/5800 [5:01:34<7:38:45,  6.92s/it]                                                       {'loss': 0.1221, 'grad_norm': 11.638300895690918, 'learning_rate': 3.2085695030825145e-05, 'epoch': 15.73}
- 31%|███▏      | 1825/5800 [5:01:34<7:38:45,  6.92s/it]score1 tensor([[0.5039],
-        [0.5859],
-        [0.5000],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3887, 0.4922, 0.4648, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0747, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:11,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.27 | optimizer_step: 4.37
-[2025-01-25 13:48:11,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.50 | bwd_microstep: 4638.41 | bwd_inner_microstep: 4632.05 | bwd_allreduce_microstep: 6.14 | step_microstep: 45.14
-[2025-01-25 13:48:11,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.36 | bwd: 4638.43 | bwd_inner: 4632.06 | bwd_allreduce: 6.29 | step: 45.15
- 31%|███▏      | 1826/5800 [5:01:41<7:39:02,  6.93s/it]                                                       {'loss': 0.0747, 'grad_norm': 10.076868057250977, 'learning_rate': 3.207679474011889e-05, 'epoch': 15.74}
- 31%|███▏      | 1826/5800 [5:01:41<7:39:02,  6.93s/it]score1 tensor([[0.5430],
-        [0.5273],
-        [0.5898],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5664, 0.4023, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:18,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 13:48:18,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.34 | bwd_microstep: 4630.57 | bwd_inner_microstep: 4624.94 | bwd_allreduce_microstep: 5.53 | step_microstep: 48.88
-[2025-01-25 13:48:18,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4630.59 | bwd_inner: 4624.94 | bwd_allreduce: 5.58 | step: 48.89
- 32%|███▏      | 1827/5800 [5:01:48<7:38:43,  6.93s/it]                                                       {'loss': 0.0723, 'grad_norm': 4.895623207092285, 'learning_rate': 3.20678906836585e-05, 'epoch': 15.75}
- 32%|███▏      | 1827/5800 [5:01:48<7:38:43,  6.93s/it]score1 tensor([[0.5977],
-        [0.5469],
-        [0.5859],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4785, 0.4141, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:25,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 13:48:25,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.26 | bwd_microstep: 4628.88 | bwd_inner_microstep: 4623.82 | bwd_allreduce_microstep: 4.97 | step_microstep: 42.01
-[2025-01-25 13:48:25,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4628.91 | bwd_inner: 4623.82 | bwd_allreduce: 5.02 | step: 42.02
- 32%|███▏      | 1828/5800 [5:01:55<7:38:13,  6.92s/it]                                                       {'loss': 0.123, 'grad_norm': 9.8250732421875, 'learning_rate': 3.205898286422038e-05, 'epoch': 15.76}
- 32%|███▏      | 1828/5800 [5:01:55<7:38:13,  6.92s/it]score1 tensor([[0.5938],
-        [0.5352],
-        [0.5312],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4941, 0.4512, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:32,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:48:32,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.68 | bwd_microstep: 4627.53 | bwd_inner_microstep: 4622.79 | bwd_allreduce_microstep: 4.65 | step_microstep: 43.13
-[2025-01-25 13:48:32,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4627.56 | bwd_inner: 4622.79 | bwd_allreduce: 4.70 | step: 43.14
- 32%|███▏      | 1829/5800 [5:02:02<7:37:39,  6.91s/it]                                                       {'loss': 0.0654, 'grad_norm': 4.86682653427124, 'learning_rate': 3.205007128458218e-05, 'epoch': 15.77}
- 32%|███▏      | 1829/5800 [5:02:02<7:37:39,  6.91s/it]score1 tensor([[0.3438],
-        [0.4355],
-        [0.3574],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.4688, 0.5195, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1504, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:39,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 13:48:39,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.11 | bwd_microstep: 4631.50 | bwd_inner_microstep: 4626.57 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.39
-[2025-01-25 13:48:39,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.07 | bwd: 4631.53 | bwd_inner: 4626.57 | bwd_allreduce: 4.88 | step: 43.41
- 32%|███▏      | 1830/5800 [5:02:09<7:37:26,  6.91s/it]                                                       {'loss': 0.1504, 'grad_norm': 8.868001937866211, 'learning_rate': 3.204115594752265e-05, 'epoch': 15.78}
- 32%|███▏      | 1830/5800 [5:02:09<7:37:26,  6.91s/it]score1 tensor([[0.4375],
-        [0.4902],
-        [0.4238],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5078, 0.5508, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:46,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:48:46,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.74 | bwd_microstep: 4636.85 | bwd_inner_microstep: 4632.03 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.43
-[2025-01-25 13:48:46,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4636.87 | bwd_inner: 4632.03 | bwd_allreduce: 4.77 | step: 43.74
- 32%|███▏      | 1831/5800 [5:02:16<7:37:21,  6.91s/it]                                                       {'loss': 0.0557, 'grad_norm': 4.263772964477539, 'learning_rate': 3.203223685582177e-05, 'epoch': 15.78}
- 32%|███▏      | 1831/5800 [5:02:16<7:37:21,  6.91s/it]score1 tensor([[0.4062],
-        [0.4316],
-        [0.4727],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6016, 0.6250, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0923, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:53,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 13:48:53,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.08 | bwd_microstep: 4629.52 | bwd_inner_microstep: 4624.62 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.59
-[2025-01-25 13:48:53,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.97 | bwd: 4629.55 | bwd_inner: 4624.62 | bwd_allreduce: 4.84 | step: 43.59
- 32%|███▏      | 1832/5800 [5:02:23<7:37:41,  6.92s/it]                                                       {'loss': 0.0923, 'grad_norm': 5.027456760406494, 'learning_rate': 3.202331401226066e-05, 'epoch': 15.79}
- 32%|███▏      | 1832/5800 [5:02:23<7:37:41,  6.92s/it]score1 tensor([[0.4023],
-        [0.4844],
-        [0.4629],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3906, 0.5508, 0.4004, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:48:59,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 13:48:59,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.40 | bwd_microstep: 4633.98 | bwd_inner_microstep: 4628.08 | bwd_allreduce_microstep: 5.79 | step_microstep: 46.25
-[2025-01-25 13:48:59,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.36 | bwd: 4634.03 | bwd_inner: 4628.08 | bwd_allreduce: 5.84 | step: 46.22
- 32%|███▏      | 1833/5800 [5:02:29<7:37:52,  6.93s/it]                                                       {'loss': 0.0356, 'grad_norm': 4.201264381408691, 'learning_rate': 3.201438741962162e-05, 'epoch': 15.8}
- 32%|███▏      | 1833/5800 [5:02:29<7:37:52,  6.93s/it]score1 tensor([[0.5664],
-        [0.3848],
-        [0.3691],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4355, 0.4512, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:49:06,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.49 | optimizer_step: 4.37
-[2025-01-25 13:49:06,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.84 | bwd_microstep: 4578.90 | bwd_inner_microstep: 4573.05 | bwd_allreduce_microstep: 5.65 | step_microstep: 57.13
-[2025-01-25 13:49:06,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.80 | bwd: 4578.96 | bwd_inner: 4573.05 | bwd_allreduce: 5.74 | step: 57.14
- 32%|███▏      | 1834/5800 [5:02:36<7:37:15,  6.92s/it]                                                       {'loss': 0.0439, 'grad_norm': 2.551739454269409, 'learning_rate': 3.2005457080688114e-05, 'epoch': 15.81}
- 32%|███▏      | 1834/5800 [5:02:36<7:37:15,  6.92s/it]score1 tensor([[0.4160],
-        [0.4355],
-        [0.5078],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5195, 0.5703, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1177, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:49:13,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 13:49:13,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.41 | bwd_microstep: 4637.10 | bwd_inner_microstep: 4631.28 | bwd_allreduce_microstep: 5.72 | step_microstep: 52.62
-[2025-01-25 13:49:13,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.31 | bwd: 4637.12 | bwd_inner: 4631.28 | bwd_allreduce: 5.77 | step: 52.63
- 32%|███▏      | 1835/5800 [5:02:43<7:37:48,  6.93s/it]                                                       {'loss': 0.1177, 'grad_norm': 8.533618927001953, 'learning_rate': 3.199652299824478e-05, 'epoch': 15.82}
- 32%|███▏      | 1835/5800 [5:02:43<7:37:48,  6.93s/it]score1 tensor([[0.4688],
-        [0.5859],
-        [0.5078],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5273, 0.4453, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:49:20,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 13:49:20,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.58 | bwd_microstep: 4630.42 | bwd_inner_microstep: 4625.33 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.08
-[2025-01-25 13:49:20,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.55 | bwd: 4630.44 | bwd_inner: 4625.33 | bwd_allreduce: 5.04 | step: 43.10
- 32%|███▏      | 1836/5800 [5:02:50<7:37:19,  6.92s/it]                                                       {'loss': 0.0439, 'grad_norm': 4.962275981903076, 'learning_rate': 3.198758517507742e-05, 'epoch': 15.83}
- 32%|███▏      | 1836/5800 [5:02:50<7:37:19,  6.92s/it]score1 tensor([[0.4746],
-        [0.4375],
-        [0.4414],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4004, 0.5312, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:49:27,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 13:49:27,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.51 | bwd_microstep: 4635.52 | bwd_inner_microstep: 4631.00 | bwd_allreduce_microstep: 4.45 | step_microstep: 42.27
-[2025-01-25 13:49:27,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.46 | bwd: 4635.55 | bwd_inner: 4631.00 | bwd_allreduce: 4.49 | step: 42.28
- 32%|███▏      | 1837/5800 [5:02:57<7:37:19,  6.92s/it]                                                       {'loss': 0.0522, 'grad_norm': 0.7767015099525452, 'learning_rate': 3.197864361397299e-05, 'epoch': 15.84}
- 32%|███▏      | 1837/5800 [5:02:57<7:37:19,  6.92s/it]score1 tensor([[0.4844],
-        [0.5156],
-        [0.4961],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6211, 0.4395, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:49:34,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 13:49:34,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.72 | bwd_microstep: 4636.11 | bwd_inner_microstep: 4631.11 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.03
-[2025-01-25 13:49:34,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.69 | bwd: 4636.14 | bwd_inner: 4631.12 | bwd_allreduce: 4.95 | step: 43.04
- 32%|███▏      | 1838/5800 [5:03:04<7:37:03,  6.92s/it]                                                       {'loss': 0.0459, 'grad_norm': 4.8865251541137695, 'learning_rate': 3.196969831771964e-05, 'epoch': 15.84}
- 32%|███▏      | 1838/5800 [5:03:04<7:37:03,  6.92s/it]score1 tensor([[0.4785],
-        [0.4082],
-        [0.5664],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.3613, 0.4551, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:49:41,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 13:49:41,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4631.61 | bwd_inner_microstep: 4626.69 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.19
-[2025-01-25 13:49:41,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.86 | bwd: 4631.64 | bwd_inner: 4626.69 | bwd_allreduce: 4.87 | step: 42.19
- 32%|███▏      | 1839/5800 [5:03:11<7:36:48,  6.92s/it]                                                       {'loss': 0.0771, 'grad_norm': 9.159695625305176, 'learning_rate': 3.196074928910665e-05, 'epoch': 15.85}
- 32%|███▏      | 1839/5800 [5:03:11<7:36:48,  6.92s/it]score1 tensor([[0.5195],
-        [0.4922],
-        [0.4316],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.6172, 0.6445, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 13:49:48,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 13:49:48,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.90 | bwd_microstep: 4629.84 | bwd_inner_microstep: 4625.10 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.17
-[2025-01-25 13:49:48,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.87 | bwd: 4629.87 | bwd_inner: 4625.10 | bwd_allreduce: 4.70 | step: 42.18
- 32%|███▏      | 1840/5800 [5:03:18<7:36:33,  6.92s/it]                                                       {'loss': 0.1426, 'grad_norm': 4.560313701629639, 'learning_rate': 3.195179653092451e-05, 'epoch': 15.86}
- 32%|███▏      | 1840/5800 [5:03:18<7:36:33,  6.92s/it]evaluate!
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6211]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1934, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1797, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2285, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2285, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1699, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1582, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6523]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6445]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6133]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2070, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6562]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2227, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2051, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.2702660142276894
-PLCC_score: 0.2715170508326454
-KRCC_score: 0.1816527008186016
-SRCC_level: 0.2702660142276894
-PLCC_level: 0.2715170508326454
-KRCC_level: 0.1816527008186016
-score1 tensor([[0.4473],
-        [0.5195],
-        [0.6406],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.4453, 0.5625, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:00:16,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.36
-[2025-01-25 14:00:16,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.32 | bwd_microstep: 4593.97 | bwd_inner_microstep: 4588.83 | bwd_allreduce_microstep: 4.97 | step_microstep: 56.16
-[2025-01-25 14:00:16,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.25 | bwd: 4594.00 | bwd_inner: 4588.83 | bwd_allreduce: 5.05 | step: 56.16
- 32%|███▏      | 1841/5800 [5:13:46<212:23:43, 193.14s/it]                                                          {'loss': 0.0957, 'grad_norm': 9.069360733032227, 'learning_rate': 3.1942840045964824e-05, 'epoch': 15.87}
- 32%|███▏      | 1841/5800 [5:13:46<212:23:43, 193.14s/it]score1 tensor([[0.6250],
-        [0.5469],
-        [0.5820],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5586, 0.5703, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:00:22,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 14:00:22,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2129.03 | bwd_microstep: 4564.94 | bwd_inner_microstep: 4560.69 | bwd_allreduce_microstep: 4.18 | step_microstep: 44.56
-[2025-01-25 14:00:22,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2129.01 | bwd: 4564.97 | bwd_inner: 4560.69 | bwd_allreduce: 4.21 | step: 44.56
- 32%|███▏      | 1842/5800 [5:13:52<150:52:52, 137.23s/it]                                                          {'loss': 0.0283, 'grad_norm': 5.063046932220459, 'learning_rate': 3.1933879837020386e-05, 'epoch': 15.88}
- 32%|███▏      | 1842/5800 [5:13:52<150:52:52, 137.23s/it]score1 tensor([[0.5156],
-        [0.5117],
-        [0.4297],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5586, 0.3105, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0928, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:00:29,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.37
-[2025-01-25 14:00:29,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.62 | bwd_microstep: 4587.80 | bwd_inner_microstep: 4583.78 | bwd_allreduce_microstep: 3.95 | step_microstep: 47.48
-[2025-01-25 14:00:29,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.57 | bwd: 4587.81 | bwd_inner: 4583.78 | bwd_allreduce: 3.98 | step: 47.49
- 32%|███▏      | 1843/5800 [5:13:59<107:50:47, 98.12s/it]                                                          {'loss': 0.0928, 'grad_norm': 4.453879356384277, 'learning_rate': 3.192491590688515e-05, 'epoch': 15.89}
- 32%|███▏      | 1843/5800 [5:13:59<107:50:47, 98.12s/it]score1 tensor([[0.6016],
-        [0.5781],
-        [0.5781],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6211, 0.4980, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:00:36,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.34 | optimizer_step: 4.37
-[2025-01-25 14:00:36,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.81 | bwd_microstep: 4589.56 | bwd_inner_microstep: 4585.29 | bwd_allreduce_microstep: 4.20 | step_microstep: 35.31
-[2025-01-25 14:00:36,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.78 | bwd: 4589.58 | bwd_inner: 4585.29 | bwd_allreduce: 4.24 | step: 35.31
- 32%|███▏      | 1844/5800 [5:14:06<77:43:28, 70.73s/it]                                                         {'loss': 0.0454, 'grad_norm': 4.594106674194336, 'learning_rate': 3.191594825835421e-05, 'epoch': 15.9}
- 32%|███▏      | 1844/5800 [5:14:06<77:43:28, 70.73s/it]score1 tensor([[0.4316],
-        [0.5156],
-        [0.5273],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6094, 0.5508, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:00:43,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 14:00:43,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.54 | bwd_microstep: 4598.38 | bwd_inner_microstep: 4594.01 | bwd_allreduce_microstep: 4.28 | step_microstep: 38.97
-[2025-01-25 14:00:43,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.54 | bwd: 4598.40 | bwd_inner: 4594.01 | bwd_allreduce: 4.31 | step: 38.97
- 32%|███▏      | 1845/5800 [5:14:13<56:38:53, 51.56s/it]                                                        {'loss': 0.0757, 'grad_norm': 9.094940185546875, 'learning_rate': 3.190697689422384e-05, 'epoch': 15.91}
- 32%|███▏      | 1845/5800 [5:14:13<56:38:53, 51.56s/it]score1 tensor([[0.4434],
-        [0.5156],
-        [0.4355],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6367, 0.6055, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:00:50,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.42 | optimizer_step: 4.37
-[2025-01-25 14:00:50,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.34 | bwd_microstep: 4603.37 | bwd_inner_microstep: 4598.91 | bwd_allreduce_microstep: 4.38 | step_microstep: 43.08
-[2025-01-25 14:00:50,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.31 | bwd: 4603.40 | bwd_inner: 4598.91 | bwd_allreduce: 4.42 | step: 43.08
- 32%|███▏      | 1846/5800 [5:14:20<41:54:16, 38.15s/it]                                                        {'loss': 0.1328, 'grad_norm': 8.89882755279541, 'learning_rate': 3.189800181729149e-05, 'epoch': 15.91}
- 32%|███▏      | 1846/5800 [5:14:20<41:54:16, 38.15s/it]score1 tensor([[0.4453],
-        [0.5352],
-        [0.4785],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6484, 0.5508, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:00:57,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.16 | optimizer_step: 4.37
-[2025-01-25 14:00:57,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.88 | bwd_microstep: 4603.44 | bwd_inner_microstep: 4599.66 | bwd_allreduce_microstep: 3.72 | step_microstep: 31.74
-[2025-01-25 14:00:57,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.88 | bwd: 4603.46 | bwd_inner: 4599.66 | bwd_allreduce: 3.75 | step: 31.75
- 32%|███▏      | 1847/5800 [5:14:27<31:34:37, 28.76s/it]                                                        {'loss': 0.0737, 'grad_norm': 8.908123016357422, 'learning_rate': 3.1889023030355705e-05, 'epoch': 15.92}
- 32%|███▏      | 1847/5800 [5:14:27<31:34:37, 28.76s/it]score1 tensor([[0.4453],
-        [0.4004],
-        [0.4219],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4258, 0.3477, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:03,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:01:03,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.75 | bwd_microstep: 4598.58 | bwd_inner_microstep: 4592.76 | bwd_allreduce_microstep: 5.71 | step_microstep: 47.55
-[2025-01-25 14:01:03,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.75 | bwd: 4598.60 | bwd_inner: 4592.76 | bwd_allreduce: 5.77 | step: 47.55
- 32%|███▏      | 1848/5800 [5:14:33<24:21:17, 22.19s/it]                                                        {'loss': 0.0649, 'grad_norm': 0.7086767554283142, 'learning_rate': 3.1880040536216256e-05, 'epoch': 15.93}
- 32%|███▏      | 1848/5800 [5:14:33<24:21:17, 22.19s/it]score1 tensor([[0.4570],
-        [0.5156],
-        [0.3945],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4980, 0.4082, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:10,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 14:01:10,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.75 | bwd_microstep: 4612.44 | bwd_inner_microstep: 4605.48 | bwd_allreduce_microstep: 6.76 | step_microstep: 47.35
-[2025-01-25 14:01:10,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.71 | bwd: 4612.49 | bwd_inner: 4605.48 | bwd_allreduce: 6.88 | step: 47.33
- 32%|███▏      | 1849/5800 [5:14:40<19:18:41, 17.60s/it]                                                        {'loss': 0.0293, 'grad_norm': 3.9994869232177734, 'learning_rate': 3.187105433767402e-05, 'epoch': 15.94}
- 32%|███▏      | 1849/5800 [5:14:40<19:18:41, 17.60s/it]score1 tensor([[0.5039],
-        [0.5391],
-        [0.5742],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.5352, 0.5352, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:17,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 14:01:17,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.59 | bwd_microstep: 4607.68 | bwd_inner_microstep: 4603.03 | bwd_allreduce_microstep: 4.57 | step_microstep: 35.03
-[2025-01-25 14:01:17,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.56 | bwd: 4607.70 | bwd_inner: 4603.03 | bwd_allreduce: 4.61 | step: 35.03
- 32%|███▏      | 1850/5800 [5:14:47<15:46:47, 14.38s/it]                                                        {'loss': 0.042, 'grad_norm': 5.217141151428223, 'learning_rate': 3.186206443753108e-05, 'epoch': 15.95}
- 32%|███▏      | 1850/5800 [5:14:47<15:46:47, 14.38s/it]score1 tensor([[0.4727],
-        [0.4902],
-        [0.5469],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5664, 0.4355, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:24,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.18 | optimizer_step: 4.36
-[2025-01-25 14:01:24,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.03 | bwd_microstep: 4611.39 | bwd_inner_microstep: 4606.58 | bwd_allreduce_microstep: 4.74 | step_microstep: 36.18
-[2025-01-25 14:01:24,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.96 | bwd: 4611.41 | bwd_inner: 4606.58 | bwd_allreduce: 4.77 | step: 36.18
- 32%|███▏      | 1851/5800 [5:14:54<13:19:08, 12.14s/it]                                                        {'loss': 0.062, 'grad_norm': 4.56279993057251, 'learning_rate': 3.185307083859061e-05, 'epoch': 15.96}
- 32%|███▏      | 1851/5800 [5:14:54<13:19:08, 12.14s/it]score1 tensor([[0.5078],
-        [0.4141],
-        [0.4316],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4785, 0.4180, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:31,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.39 | optimizer_step: 4.36
-[2025-01-25 14:01:31,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.89 | bwd_microstep: 4624.27 | bwd_inner_microstep: 4620.10 | bwd_allreduce_microstep: 4.10 | step_microstep: 41.45
-[2025-01-25 14:01:31,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.86 | bwd: 4624.29 | bwd_inner: 4620.10 | bwd_allreduce: 4.13 | step: 38.99
- 32%|███▏      | 1852/5800 [5:15:01<11:35:14, 10.57s/it]                                                        {'loss': 0.0386, 'grad_norm': 4.300041198730469, 'learning_rate': 3.1844073543656986e-05, 'epoch': 15.97}
- 32%|███▏      | 1852/5800 [5:15:01<11:35:14, 10.57s/it]score1 tensor([[0.5156],
-        [0.5820],
-        [0.4746],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6094, 0.4766, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:38,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.45 | optimizer_step: 4.37
-[2025-01-25 14:01:38,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4633.85 | bwd_inner_microstep: 4628.91 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.22
-[2025-01-25 14:01:38,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.67 | bwd: 4633.88 | bwd_inner: 4628.91 | bwd_allreduce: 4.89 | step: 42.22
- 32%|███▏      | 1853/5800 [5:15:08<10:22:44,  9.47s/it]                                                        {'loss': 0.043, 'grad_norm': 4.886722087860107, 'learning_rate': 3.183507255553572e-05, 'epoch': 15.97}
- 32%|███▏      | 1853/5800 [5:15:08<10:22:44,  9.47s/it]score1 tensor([[0.4883],
-        [0.7188],
-        [0.5547],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6445, 0.6133, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:45,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 14:01:45,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.29 | bwd_microstep: 4644.57 | bwd_inner_microstep: 4639.75 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.21
-[2025-01-25 14:01:45,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.27 | bwd: 4644.59 | bwd_inner: 4639.75 | bwd_allreduce: 4.78 | step: 43.22
- 32%|███▏      | 1854/5800 [5:15:15<9:32:17,  8.70s/it]                                                        {'loss': 0.0581, 'grad_norm': 1.1309645175933838, 'learning_rate': 3.182606787703348e-05, 'epoch': 15.98}
- 32%|███▏      | 1854/5800 [5:15:15<9:32:17,  8.70s/it]score1 tensor([[0.5312],
-        [0.4570],
-        [0.5273],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.3516, 0.5000, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:52,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 14:01:52,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4648.37 | bwd_inner_microstep: 4643.13 | bwd_allreduce_microstep: 5.14 | step_microstep: 47.29
-[2025-01-25 14:01:52,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.70 | bwd: 4648.39 | bwd_inner: 4643.13 | bwd_allreduce: 5.19 | step: 47.29
- 32%|███▏      | 1855/5800 [5:15:22<8:57:09,  8.17s/it]                                                       {'loss': 0.0645, 'grad_norm': 8.65265941619873, 'learning_rate': 3.181705951095807e-05, 'epoch': 15.99}
- 32%|███▏      | 1855/5800 [5:15:22<8:57:09,  8.17s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:01:56,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 14:01:56,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 573.69 | bwd_microstep: 1223.46 | bwd_inner_microstep: 1218.44 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.32
-[2025-01-25 14:01:56,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 573.65 | bwd: 1223.48 | bwd_inner: 1218.44 | bwd_allreduce: 4.98 | step: 43.33
- 32%|███▏      | 1856/5800 [5:15:26<7:37:15,  6.96s/it]                                                       {'loss': 0.0312, 'grad_norm': 10.48116683959961, 'learning_rate': 3.1808047460118454e-05, 'epoch': 16.0}
- 32%|███▏      | 1856/5800 [5:15:26<7:37:15,  6.96s/it][2025-01-25 14:02:00,445] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:02:09,928] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:02:19,107] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:02:28,866] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5625],
-        [0.5469],
-        [0.4238],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6016, 0.4434, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:02:48,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 14:02:48,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.08 | bwd_microstep: 4566.82 | bwd_inner_microstep: 4561.90 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.30
-[2025-01-25 14:02:48,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.03 | bwd: 4566.84 | bwd_inner: 4561.90 | bwd_allreduce: 4.87 | step: 43.31
- 32%|███▏      | 1857/5800 [5:16:18<22:20:24, 20.40s/it]                                                        {'loss': 0.0347, 'grad_norm': 4.255687236785889, 'learning_rate': 3.179903172732476e-05, 'epoch': 16.01}
- 32%|███▏      | 1857/5800 [5:16:18<22:20:24, 20.40s/it]score1 tensor([[0.5977],
-        [0.5625],
-        [0.5859],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4824, 0.5469, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:02:54,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 14:02:54,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.99 | bwd_microstep: 4587.19 | bwd_inner_microstep: 4582.25 | bwd_allreduce_microstep: 4.85 | step_microstep: 45.84
-[2025-01-25 14:02:54,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.95 | bwd: 4587.21 | bwd_inner: 4582.25 | bwd_allreduce: 4.89 | step: 45.84
- 32%|███▏      | 1858/5800 [5:16:24<17:52:49, 16.33s/it]                                                        {'loss': 0.0693, 'grad_norm': 9.591263771057129, 'learning_rate': 3.1790012315388244e-05, 'epoch': 16.02}
- 32%|███▏      | 1858/5800 [5:16:24<17:52:49, 16.33s/it]score1 tensor([[0.5508],
-        [0.4531],
-        [0.5391],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4648, 0.5664, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:01,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 14:03:01,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.73 | bwd_microstep: 4590.25 | bwd_inner_microstep: 4585.28 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.62
-[2025-01-25 14:03:01,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.70 | bwd: 4590.27 | bwd_inner: 4585.28 | bwd_allreduce: 4.91 | step: 42.63
- 32%|███▏      | 1859/5800 [5:16:31<14:45:53, 13.49s/it]                                                        {'loss': 0.0371, 'grad_norm': 0.7982847690582275, 'learning_rate': 3.178098922712131e-05, 'epoch': 16.03}
- 32%|███▏      | 1859/5800 [5:16:31<14:45:53, 13.49s/it]score1 tensor([[0.4922],
-        [0.6172],
-        [0.6484],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5078, 0.6211, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:08,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 14:03:08,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.02 | bwd_microstep: 4595.33 | bwd_inner_microstep: 4590.47 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.85
-[2025-01-25 14:03:08,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.98 | bwd: 4595.35 | bwd_inner: 4590.47 | bwd_allreduce: 4.80 | step: 41.86
- 32%|███▏      | 1860/5800 [5:16:38<12:35:05, 11.50s/it]                                                        {'loss': 0.0522, 'grad_norm': 5.279278755187988, 'learning_rate': 3.177196246533752e-05, 'epoch': 16.03}
- 32%|███▏      | 1860/5800 [5:16:38<12:35:05, 11.50s/it]score1 tensor([[0.5430],
-        [0.5273],
-        [0.5312],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5508, 0.4941, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:15,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 14:03:15,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.01 | bwd_microstep: 4591.02 | bwd_inner_microstep: 4586.43 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.79
-[2025-01-25 14:03:15,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.98 | bwd: 4591.05 | bwd_inner: 4586.43 | bwd_allreduce: 4.55 | step: 41.80
- 32%|███▏      | 1861/5800 [5:16:45<11:03:19, 10.10s/it]                                                        {'loss': 0.0405, 'grad_norm': 4.760221004486084, 'learning_rate': 3.176293203285157e-05, 'epoch': 16.04}
- 32%|███▏      | 1861/5800 [5:16:45<11:03:19, 10.10s/it]score1 tensor([[0.4590],
-        [0.4902],
-        [0.6055],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5312, 0.5547, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0913, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:22,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 14:03:22,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.95 | bwd_microstep: 4598.34 | bwd_inner_microstep: 4593.02 | bwd_allreduce_microstep: 5.20 | step_microstep: 44.13
-[2025-01-25 14:03:22,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.92 | bwd: 4598.37 | bwd_inner: 4593.02 | bwd_allreduce: 5.26 | step: 44.14
- 32%|███▏      | 1862/5800 [5:16:52<9:59:14,  9.13s/it]                                                        {'loss': 0.0913, 'grad_norm': 1.3667954206466675, 'learning_rate': 3.1753897932479306e-05, 'epoch': 16.05}
- 32%|███▏      | 1862/5800 [5:16:52<9:59:14,  9.13s/it]score1 tensor([[0.4941],
-        [0.5234],
-        [0.4258],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5195, 0.4844, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:29,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:03:29,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.69 | bwd_microstep: 4597.54 | bwd_inner_microstep: 4592.97 | bwd_allreduce_microstep: 4.49 | step_microstep: 40.54
-[2025-01-25 14:03:29,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.66 | bwd: 4597.56 | bwd_inner: 4592.97 | bwd_allreduce: 4.53 | step: 40.54
- 32%|███▏      | 1863/5800 [5:16:59<9:14:26,  8.45s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.5428075194358826, 'learning_rate': 3.174486016703772e-05, 'epoch': 16.06}
- 32%|███▏      | 1863/5800 [5:16:59<9:14:26,  8.45s/it]score1 tensor([[0.4590],
-        [0.4434],
-        [0.4199],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4238, 0.4141, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:36,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:03:36,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.87 | bwd_microstep: 4597.47 | bwd_inner_microstep: 4592.92 | bwd_allreduce_microstep: 4.47 | step_microstep: 42.16
-[2025-01-25 14:03:36,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.85 | bwd: 4597.50 | bwd_inner: 4592.92 | bwd_allreduce: 4.51 | step: 42.18
- 32%|███▏      | 1864/5800 [5:17:06<8:43:22,  7.98s/it]                                                       {'loss': 0.0479, 'grad_norm': 3.823587656021118, 'learning_rate': 3.1735818739344944e-05, 'epoch': 16.07}
- 32%|███▏      | 1864/5800 [5:17:06<8:43:22,  7.98s/it]score1 tensor([[0.4805],
-        [0.3945],
-        [0.3906],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4199, 0.4336, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:43,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 14:03:43,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.30 | bwd_microstep: 4607.33 | bwd_inner_microstep: 4602.32 | bwd_allreduce_microstep: 4.93 | step_microstep: 40.71
-[2025-01-25 14:03:43,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.27 | bwd: 4607.35 | bwd_inner: 4602.32 | bwd_allreduce: 4.97 | step: 40.72
- 32%|███▏      | 1865/5800 [5:17:12<8:21:31,  7.65s/it]                                                       {'loss': 0.0376, 'grad_norm': 7.931296348571777, 'learning_rate': 3.172677365222025e-05, 'epoch': 16.08}
- 32%|███▏      | 1865/5800 [5:17:12<8:21:31,  7.65s/it]score1 tensor([[0.4355],
-        [0.4102],
-        [0.4961],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.3984, 0.6055, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1147, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:49,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:03:49,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.24 | bwd_microstep: 4615.23 | bwd_inner_microstep: 4610.60 | bwd_allreduce_microstep: 4.54 | step_microstep: 42.58
-[2025-01-25 14:03:49,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.20 | bwd: 4615.26 | bwd_inner: 4610.61 | bwd_allreduce: 4.58 | step: 42.59
- 32%|███▏      | 1866/5800 [5:17:19<8:06:22,  7.42s/it]                                                       {'loss': 0.1147, 'grad_norm': 4.407723903656006, 'learning_rate': 3.171772490848406e-05, 'epoch': 16.09}
- 32%|███▏      | 1866/5800 [5:17:19<8:06:22,  7.42s/it]score1 tensor([[0.4434],
-        [0.4297],
-        [0.3613],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5273, 0.3086, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:03:56,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 14:03:56,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.67 | bwd_microstep: 4629.55 | bwd_inner_microstep: 4624.80 | bwd_allreduce_microstep: 4.65 | step_microstep: 42.13
-[2025-01-25 14:03:56,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.63 | bwd: 4629.58 | bwd_inner: 4624.80 | bwd_allreduce: 4.70 | step: 42.14
- 32%|███▏      | 1867/5800 [5:17:26<7:56:06,  7.26s/it]                                                       {'loss': 0.1133, 'grad_norm': 4.138533592224121, 'learning_rate': 3.1708672510957914e-05, 'epoch': 16.09}
- 32%|███▏      | 1867/5800 [5:17:26<7:56:06,  7.26s/it]score1 tensor([[0.4492],
-        [0.4297],
-        [0.4336],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5391, 0.4648, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:03,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 14:04:03,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.59 | bwd_microstep: 4633.14 | bwd_inner_microstep: 4628.41 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.41
-[2025-01-25 14:04:03,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.56 | bwd: 4633.17 | bwd_inner: 4628.41 | bwd_allreduce: 4.68 | step: 41.42
- 32%|███▏      | 1868/5800 [5:17:33<7:48:59,  7.16s/it]                                                       {'loss': 0.0977, 'grad_norm': 8.140192031860352, 'learning_rate': 3.169961646246452e-05, 'epoch': 16.1}
- 32%|███▏      | 1868/5800 [5:17:33<7:48:59,  7.16s/it]score1 tensor([[0.4453],
-        [0.4082],
-        [0.4512],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4844, 0.4473, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:10,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 14:04:10,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.24 | bwd_microstep: 4627.88 | bwd_inner_microstep: 4623.52 | bwd_allreduce_microstep: 4.28 | step_microstep: 41.64
-[2025-01-25 14:04:10,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.21 | bwd: 4627.91 | bwd_inner: 4623.52 | bwd_allreduce: 4.32 | step: 41.64
- 32%|███▏      | 1869/5800 [5:17:40<7:43:56,  7.08s/it]                                                       {'loss': 0.0566, 'grad_norm': 1.4446579217910767, 'learning_rate': 3.1690556765827706e-05, 'epoch': 16.11}
- 32%|███▏      | 1869/5800 [5:17:40<7:43:56,  7.08s/it]score1 tensor([[0.4551],
-        [0.4883],
-        [0.5039],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.6367, 0.5156, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:17,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:04:17,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.46 | bwd_microstep: 4639.14 | bwd_inner_microstep: 4634.01 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.08
-[2025-01-25 14:04:17,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.43 | bwd: 4639.16 | bwd_inner: 4634.01 | bwd_allreduce: 5.08 | step: 43.08
- 32%|███▏      | 1870/5800 [5:17:47<7:40:36,  7.03s/it]                                                       {'loss': 0.0703, 'grad_norm': 0.7404163479804993, 'learning_rate': 3.168149342387245e-05, 'epoch': 16.12}
- 32%|███▏      | 1870/5800 [5:17:47<7:40:36,  7.03s/it]score1 tensor([[0.5195],
-        [0.4668],
-        [0.5195],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4609, 0.5508, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:24,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 14:04:24,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.24 | bwd_microstep: 4638.99 | bwd_inner_microstep: 4634.33 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.86
-[2025-01-25 14:04:24,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.20 | bwd: 4639.02 | bwd_inner: 4634.33 | bwd_allreduce: 4.61 | step: 41.88
- 32%|███▏      | 1871/5800 [5:17:54<7:38:17,  7.00s/it]                                                       {'loss': 0.0464, 'grad_norm': 4.691607475280762, 'learning_rate': 3.1672426439424846e-05, 'epoch': 16.13}
- 32%|███▏      | 1871/5800 [5:17:54<7:38:17,  7.00s/it]score1 tensor([[0.5352],
-        [0.5195],
-        [0.5508],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.6094, 0.6211, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:31,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.18 | optimizer_step: 4.36
-[2025-01-25 14:04:31,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.94 | bwd_microstep: 4633.36 | bwd_inner_microstep: 4627.42 | bwd_allreduce_microstep: 5.87 | step_microstep: 57.76
-[2025-01-25 14:04:31,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.91 | bwd: 4633.39 | bwd_inner: 4627.42 | bwd_allreduce: 5.91 | step: 57.80
- 32%|███▏      | 1872/5800 [5:18:01<7:36:54,  6.98s/it]                                                       {'loss': 0.0498, 'grad_norm': 8.993629455566406, 'learning_rate': 3.1663355815312135e-05, 'epoch': 16.14}
- 32%|███▏      | 1872/5800 [5:18:01<7:36:54,  6.98s/it]score1 tensor([[0.5938],
-        [0.5508],
-        [0.5781],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4629, 0.5625, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0947, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:38,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.49 | optimizer_step: 4.36
-[2025-01-25 14:04:38,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.49 | bwd_microstep: 4638.06 | bwd_inner_microstep: 4633.17 | bwd_allreduce_microstep: 4.80 | step_microstep: 63.40
-[2025-01-25 14:04:38,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.44 | bwd: 4638.08 | bwd_inner: 4633.17 | bwd_allreduce: 4.84 | step: 63.44
- 32%|███▏      | 1873/5800 [5:18:08<7:36:21,  6.97s/it]                                                       {'loss': 0.0947, 'grad_norm': 9.141059875488281, 'learning_rate': 3.165428155436272e-05, 'epoch': 16.15}
- 32%|███▏      | 1873/5800 [5:18:08<7:36:21,  6.97s/it]score1 tensor([[0.5273],
-        [0.5664],
-        [0.5664],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.3750, 0.4531, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1689, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:45,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 14:04:45,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.15 | bwd_microstep: 4637.85 | bwd_inner_microstep: 4630.94 | bwd_allreduce_microstep: 6.72 | step_microstep: 63.39
-[2025-01-25 14:04:45,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.12 | bwd: 4637.90 | bwd_inner: 4630.94 | bwd_allreduce: 6.81 | step: 63.37
- 32%|███▏      | 1874/5800 [5:18:15<7:35:53,  6.97s/it]                                                       {'loss': 0.1689, 'grad_norm': 8.800911903381348, 'learning_rate': 3.164520365940609e-05, 'epoch': 16.16}
- 32%|███▏      | 1874/5800 [5:18:15<7:35:53,  6.97s/it]score1 tensor([[0.5781],
-        [0.6172],
-        [0.5938],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5273, 0.4629, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1050, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:52,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 14:04:52,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.71 | bwd_microstep: 4633.98 | bwd_inner_microstep: 4629.08 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.34
-[2025-01-25 14:04:52,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.64 | bwd: 4634.01 | bwd_inner: 4629.08 | bwd_allreduce: 4.85 | step: 44.35
- 32%|███▏      | 1875/5800 [5:18:22<7:34:55,  6.95s/it]                                                       {'loss': 0.105, 'grad_norm': 9.379281997680664, 'learning_rate': 3.16361221332729e-05, 'epoch': 16.16}
- 32%|███▏      | 1875/5800 [5:18:22<7:34:55,  6.95s/it]score1 tensor([[0.5430],
-        [0.5586],
-        [0.5742],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.5234, 0.5117, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:04:59,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 14:04:59,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.46 | bwd_microstep: 4635.25 | bwd_inner_microstep: 4630.13 | bwd_allreduce_microstep: 5.04 | step_microstep: 49.13
-[2025-01-25 14:04:59,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.43 | bwd: 4635.27 | bwd_inner: 4630.13 | bwd_allreduce: 5.08 | step: 49.17
- 32%|███▏      | 1876/5800 [5:18:29<7:34:36,  6.95s/it]                                                       {'loss': 0.0698, 'grad_norm': 9.018441200256348, 'learning_rate': 3.1627036978794925e-05, 'epoch': 16.17}
- 32%|███▏      | 1876/5800 [5:18:29<7:34:36,  6.95s/it]score1 tensor([[0.5430],
-        [0.5391],
-        [0.6172],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4551, 0.5625, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:06,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 14:05:06,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.50 | bwd_microstep: 4642.11 | bwd_inner_microstep: 4637.47 | bwd_allreduce_microstep: 4.55 | step_microstep: 40.85
-[2025-01-25 14:05:06,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.47 | bwd: 4642.13 | bwd_inner: 4637.47 | bwd_allreduce: 4.60 | step: 40.85
- 32%|███▏      | 1877/5800 [5:18:36<7:34:00,  6.94s/it]                                                       {'loss': 0.0732, 'grad_norm': 9.307194709777832, 'learning_rate': 3.161794819880507e-05, 'epoch': 16.18}
- 32%|███▏      | 1877/5800 [5:18:36<7:34:00,  6.94s/it]score1 tensor([[0.5664],
-        [0.5820],
-        [0.5273],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5703, 0.4922, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:12,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 14:05:12,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.92 | bwd_microstep: 4638.62 | bwd_inner_microstep: 4633.90 | bwd_allreduce_microstep: 4.65 | step_microstep: 41.37
-[2025-01-25 14:05:12,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.88 | bwd: 4638.64 | bwd_inner: 4633.90 | bwd_allreduce: 4.68 | step: 41.38
- 32%|███▏      | 1878/5800 [5:18:42<7:33:21,  6.94s/it]                                                       {'loss': 0.0322, 'grad_norm': 8.962971687316895, 'learning_rate': 3.160885579613738e-05, 'epoch': 16.19}
- 32%|███▏      | 1878/5800 [5:18:42<7:33:21,  6.94s/it]score1 tensor([[0.4473],
-        [0.5430],
-        [0.5430],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5391, 0.6836, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:19,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.30 | optimizer_step: 4.37
-[2025-01-25 14:05:19,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.43 | bwd_microstep: 4638.62 | bwd_inner_microstep: 4634.07 | bwd_allreduce_microstep: 4.47 | step_microstep: 44.45
-[2025-01-25 14:05:19,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.39 | bwd: 4638.64 | bwd_inner: 4634.07 | bwd_allreduce: 4.51 | step: 44.46
- 32%|███▏      | 1879/5800 [5:18:49<7:33:08,  6.93s/it]                                                       {'loss': 0.062, 'grad_norm': 4.141922950744629, 'learning_rate': 3.159975977362702e-05, 'epoch': 16.2}
- 32%|███▏      | 1879/5800 [5:18:49<7:33:08,  6.93s/it]score1 tensor([[0.4902],
-        [0.5234],
-        [0.4805],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.6484, 0.4492, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:26,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 14:05:26,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.57 | bwd_microstep: 4638.08 | bwd_inner_microstep: 4633.33 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.70
-[2025-01-25 14:05:26,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.52 | bwd: 4638.11 | bwd_inner: 4633.33 | bwd_allreduce: 4.72 | step: 41.71
- 32%|███▏      | 1880/5800 [5:18:56<7:32:49,  6.93s/it]                                                       {'loss': 0.0737, 'grad_norm': 4.12275505065918, 'learning_rate': 3.159066013411029e-05, 'epoch': 16.21}
- 32%|███▏      | 1880/5800 [5:18:56<7:32:49,  6.93s/it]score1 tensor([[0.4355],
-        [0.4414],
-        [0.4844],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4746, 0.5312, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:33,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:05:33,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.32 | bwd_microstep: 4638.64 | bwd_inner_microstep: 4634.06 | bwd_allreduce_microstep: 4.50 | step_microstep: 43.32
-[2025-01-25 14:05:33,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.28 | bwd: 4638.66 | bwd_inner: 4634.06 | bwd_allreduce: 4.54 | step: 43.33
- 32%|███▏      | 1881/5800 [5:19:03<7:32:29,  6.93s/it]                                                       {'loss': 0.0737, 'grad_norm': 8.354333877563477, 'learning_rate': 3.1581556880424615e-05, 'epoch': 16.22}
- 32%|███▏      | 1881/5800 [5:19:03<7:32:29,  6.93s/it]score1 tensor([[0.4277],
-        [0.4414],
-        [0.3867],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.4766, 0.3789, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:40,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:05:40,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.27 | bwd_microstep: 4639.81 | bwd_inner_microstep: 4633.95 | bwd_allreduce_microstep: 5.72 | step_microstep: 49.44
-[2025-01-25 14:05:40,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.24 | bwd: 4639.84 | bwd_inner: 4633.95 | bwd_allreduce: 5.80 | step: 49.41
- 32%|███▏      | 1882/5800 [5:19:10<7:32:36,  6.93s/it]                                                       {'loss': 0.0371, 'grad_norm': 4.177657127380371, 'learning_rate': 3.1572450015408545e-05, 'epoch': 16.22}
- 32%|███▏      | 1882/5800 [5:19:10<7:32:36,  6.93s/it]score1 tensor([[0.4160],
-        [0.4121],
-        [0.4766],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4062, 0.6641, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0654, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:47,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:05:47,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.35 | bwd_microstep: 4642.32 | bwd_inner_microstep: 4634.18 | bwd_allreduce_microstep: 7.94 | step_microstep: 46.00
-[2025-01-25 14:05:47,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.31 | bwd: 4642.38 | bwd_inner: 4634.18 | bwd_allreduce: 8.04 | step: 45.98
- 32%|███▏      | 1883/5800 [5:19:17<7:32:34,  6.93s/it]                                                       {'loss': 0.0654, 'grad_norm': 4.311216831207275, 'learning_rate': 3.156333954190176e-05, 'epoch': 16.23}
- 32%|███▏      | 1883/5800 [5:19:17<7:32:34,  6.93s/it]score1 tensor([[0.3828],
-        [0.3965],
-        [0.4414],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.3789, 0.5469, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1201, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:05:54,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:05:54,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.73 | bwd_microstep: 4636.48 | bwd_inner_microstep: 4629.38 | bwd_allreduce_microstep: 6.93 | step_microstep: 44.12
-[2025-01-25 14:05:54,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.70 | bwd: 4636.53 | bwd_inner: 4629.38 | bwd_allreduce: 7.01 | step: 44.10
- 32%|███▏      | 1884/5800 [5:19:24<7:32:25,  6.93s/it]                                                       {'loss': 0.1201, 'grad_norm': 4.176969051361084, 'learning_rate': 3.155422546274506e-05, 'epoch': 16.24}
- 32%|███▏      | 1884/5800 [5:19:24<7:32:25,  6.93s/it]score1 tensor([[0.4102],
-        [0.3672],
-        [0.3887],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.3672, 0.3438, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:01,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 14:06:01,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.49 | bwd_microstep: 4588.65 | bwd_inner_microstep: 4583.96 | bwd_allreduce_microstep: 4.62 | step_microstep: 41.91
-[2025-01-25 14:06:01,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.37 | bwd: 4588.68 | bwd_inner: 4583.96 | bwd_allreduce: 4.65 | step: 41.92
- 32%|███▎      | 1885/5800 [5:19:31<7:31:16,  6.92s/it]                                                       {'loss': 0.0537, 'grad_norm': 2.3040850162506104, 'learning_rate': 3.154510778078039e-05, 'epoch': 16.25}
- 32%|███▎      | 1885/5800 [5:19:31<7:31:16,  6.92s/it]score1 tensor([[0.4648],
-        [0.4648],
-        [0.4062],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5703, 0.4219, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:08,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 14:06:08,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.70 | bwd_microstep: 4636.06 | bwd_inner_microstep: 4631.39 | bwd_allreduce_microstep: 4.59 | step_microstep: 46.96
-[2025-01-25 14:06:08,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.66 | bwd: 4636.09 | bwd_inner: 4631.39 | bwd_allreduce: 4.63 | step: 46.97
- 33%|███▎      | 1886/5800 [5:19:38<7:31:12,  6.92s/it]                                                       {'loss': 0.041, 'grad_norm': 8.389520645141602, 'learning_rate': 3.1535986498850784e-05, 'epoch': 16.26}
- 33%|███▎      | 1886/5800 [5:19:38<7:31:12,  6.92s/it]score1 tensor([[0.4883],
-        [0.4961],
-        [0.4453],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4355, 0.4473, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:15,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:06:15,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.00 | bwd_microstep: 4635.88 | bwd_inner_microstep: 4630.84 | bwd_allreduce_microstep: 4.94 | step_microstep: 41.62
-[2025-01-25 14:06:15,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.96 | bwd: 4635.90 | bwd_inner: 4630.84 | bwd_allreduce: 4.99 | step: 41.63
- 33%|███▎      | 1887/5800 [5:19:45<7:31:06,  6.92s/it]                                                       {'loss': 0.0703, 'grad_norm': 4.3712263107299805, 'learning_rate': 3.152686161980042e-05, 'epoch': 16.27}
- 33%|███▎      | 1887/5800 [5:19:45<7:31:06,  6.92s/it]score1 tensor([[0.5117],
-        [0.5352],
-        [0.4570],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.6953, 0.4121, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1001, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:22,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 14:06:22,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.25 | bwd_microstep: 4641.24 | bwd_inner_microstep: 4636.10 | bwd_allreduce_microstep: 5.02 | step_microstep: 40.88
-[2025-01-25 14:06:22,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.23 | bwd: 4641.27 | bwd_inner: 4636.10 | bwd_allreduce: 5.09 | step: 40.88
- 33%|███▎      | 1888/5800 [5:19:52<7:31:09,  6.92s/it]                                                       {'loss': 0.1001, 'grad_norm': 0.5401613712310791, 'learning_rate': 3.15177331464746e-05, 'epoch': 16.28}
- 33%|███▎      | 1888/5800 [5:19:52<7:31:09,  6.92s/it]score1 tensor([[0.5508],
-        [0.3926],
-        [0.5508],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.1787, 0.5625, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1104, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:29,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:06:29,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.70 | bwd_microstep: 4647.53 | bwd_inner_microstep: 4642.50 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.90
-[2025-01-25 14:06:29,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.67 | bwd: 4647.55 | bwd_inner: 4642.50 | bwd_allreduce: 4.98 | step: 42.91
- 33%|███▎      | 1889/5800 [5:19:59<7:31:12,  6.92s/it]                                                       {'loss': 0.1104, 'grad_norm': 5.16541862487793, 'learning_rate': 3.150860108171973e-05, 'epoch': 16.28}
- 33%|███▎      | 1889/5800 [5:19:59<7:31:12,  6.92s/it]score1 tensor([[0.5703],
-        [0.5352],
-        [0.5977],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5430, 0.4219, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:36,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:06:36,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.08 | bwd_microstep: 4637.87 | bwd_inner_microstep: 4633.03 | bwd_allreduce_microstep: 4.72 | step_microstep: 47.40
-[2025-01-25 14:06:36,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.05 | bwd: 4637.89 | bwd_inner: 4633.03 | bwd_allreduce: 4.79 | step: 47.41
- 33%|███▎      | 1890/5800 [5:20:06<7:31:06,  6.92s/it]                                                       {'loss': 0.0693, 'grad_norm': 0.8631377816200256, 'learning_rate': 3.1499465428383345e-05, 'epoch': 16.29}
- 33%|███▎      | 1890/5800 [5:20:06<7:31:06,  6.92s/it]score1 tensor([[0.5664],
-        [0.6523],
-        [0.6641],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.6094, 0.4355, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:42,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 14:06:42,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.83 | bwd_microstep: 4639.77 | bwd_inner_microstep: 4635.13 | bwd_allreduce_microstep: 4.55 | step_microstep: 42.34
-[2025-01-25 14:06:42,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.80 | bwd: 4639.80 | bwd_inner: 4635.13 | bwd_allreduce: 4.59 | step: 42.35
- 33%|███▎      | 1891/5800 [5:20:12<7:31:04,  6.92s/it]                                                       {'loss': 0.1465, 'grad_norm': 10.30612564086914, 'learning_rate': 3.149032618931411e-05, 'epoch': 16.3}
- 33%|███▎      | 1891/5800 [5:20:12<7:31:04,  6.92s/it]score1 tensor([[0.5820],
-        [0.6367],
-        [0.6602],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4453, 0.5078, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:49,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:06:49,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.94 | bwd_microstep: 4641.28 | bwd_inner_microstep: 4636.19 | bwd_allreduce_microstep: 4.98 | step_microstep: 46.34
-[2025-01-25 14:06:49,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.91 | bwd: 4641.30 | bwd_inner: 4636.19 | bwd_allreduce: 5.03 | step: 46.35
- 33%|███▎      | 1892/5800 [5:20:19<7:30:57,  6.92s/it]                                                       {'loss': 0.1602, 'grad_norm': 10.154860496520996, 'learning_rate': 3.14811833673618e-05, 'epoch': 16.31}
- 33%|███▎      | 1892/5800 [5:20:19<7:30:57,  6.92s/it]score1 tensor([[0.6719],
-        [0.6406],
-        [0.7344],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4785, 0.5664, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1904, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:06:56,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:06:56,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4639.28 | bwd_inner_microstep: 4634.52 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.46
-[2025-01-25 14:06:56,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.98 | bwd: 4639.31 | bwd_inner: 4634.52 | bwd_allreduce: 4.72 | step: 41.47
- 33%|███▎      | 1893/5800 [5:20:26<7:30:53,  6.92s/it]                                                       {'loss': 0.1904, 'grad_norm': 10.381840705871582, 'learning_rate': 3.147203696537729e-05, 'epoch': 16.32}
- 33%|███▎      | 1893/5800 [5:20:26<7:30:53,  6.92s/it]score1 tensor([[0.6953],
-        [0.7148],
-        [0.6680],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6016, 0.5430, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0850, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:03,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.51 | optimizer_step: 4.36
-[2025-01-25 14:07:03,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.50 | bwd_microstep: 4648.07 | bwd_inner_microstep: 4642.61 | bwd_allreduce_microstep: 5.37 | step_microstep: 72.24
-[2025-01-25 14:07:03,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.47 | bwd: 4648.10 | bwd_inner: 4642.61 | bwd_allreduce: 5.41 | step: 72.25
- 33%|███▎      | 1894/5800 [5:20:33<7:31:50,  6.94s/it]                                                       {'loss': 0.085, 'grad_norm': 10.398029327392578, 'learning_rate': 3.1462886986212605e-05, 'epoch': 16.33}
- 33%|███▎      | 1894/5800 [5:20:33<7:31:50,  6.94s/it]score1 tensor([[0.6211],
-        [0.6133],
-        [0.6055],
-        [0.6875]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.6484, 0.4844, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1079, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:10,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 14:07:10,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.08 | bwd_microstep: 4642.22 | bwd_inner_microstep: 4637.00 | bwd_allreduce_microstep: 5.12 | step_microstep: 50.40
-[2025-01-25 14:07:10,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.93 | bwd: 4642.26 | bwd_inner: 4637.00 | bwd_allreduce: 5.17 | step: 50.44
- 33%|███▎      | 1895/5800 [5:20:40<7:32:32,  6.95s/it]                                                       {'loss': 0.1079, 'grad_norm': 5.121610164642334, 'learning_rate': 3.145373343272086e-05, 'epoch': 16.34}
- 33%|███▎      | 1895/5800 [5:20:40<7:32:32,  6.95s/it]score1 tensor([[0.5039],
-        [0.5898],
-        [0.6367],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.5352, 0.6602, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:17,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:07:17,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.30 | bwd_microstep: 4639.16 | bwd_inner_microstep: 4634.25 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.20
-[2025-01-25 14:07:17,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.27 | bwd: 4639.19 | bwd_inner: 4634.26 | bwd_allreduce: 4.85 | step: 44.21
- 33%|███▎      | 1896/5800 [5:20:47<7:31:47,  6.94s/it]                                                       {'loss': 0.0547, 'grad_norm': 4.899987697601318, 'learning_rate': 3.144457630775629e-05, 'epoch': 16.34}
- 33%|███▎      | 1896/5800 [5:20:47<7:31:47,  6.94s/it]score1 tensor([[0.5547],
-        [0.5859],
-        [0.5508],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4941, 0.3906, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1128, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:24,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 14:07:24,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2178.21 | bwd_microstep: 4633.97 | bwd_inner_microstep: 4629.30 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.62
-[2025-01-25 14:07:24,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2178.18 | bwd: 4633.99 | bwd_inner: 4629.30 | bwd_allreduce: 4.63 | step: 41.62
- 33%|███▎      | 1897/5800 [5:20:54<7:31:42,  6.94s/it]                                                       {'loss': 0.1128, 'grad_norm': 9.694636344909668, 'learning_rate': 3.143541561417425e-05, 'epoch': 16.35}
- 33%|███▎      | 1897/5800 [5:20:54<7:31:42,  6.94s/it]score1 tensor([[0.5703],
-        [0.5469],
-        [0.6367],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4512, 0.6133, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:31,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.25 | optimizer_step: 4.36
-[2025-01-25 14:07:31,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.28 | bwd_microstep: 4635.75 | bwd_inner_microstep: 4630.88 | bwd_allreduce_microstep: 4.76 | step_microstep: 45.50
-[2025-01-25 14:07:31,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.25 | bwd: 4635.78 | bwd_inner: 4630.88 | bwd_allreduce: 4.82 | step: 45.51
- 33%|███▎      | 1898/5800 [5:21:01<7:31:08,  6.94s/it]                                                       {'loss': 0.0796, 'grad_norm': 9.494820594787598, 'learning_rate': 3.1426251354831196e-05, 'epoch': 16.36}
- 33%|███▎      | 1898/5800 [5:21:01<7:31:08,  6.94s/it]score1 tensor([[0.5312],
-        [0.4199],
-        [0.4727],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5195, 0.3711, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0708, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:38,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:07:38,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.99 | bwd_microstep: 4637.31 | bwd_inner_microstep: 4632.71 | bwd_allreduce_microstep: 4.52 | step_microstep: 41.03
-[2025-01-25 14:07:38,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.96 | bwd: 4637.33 | bwd_inner: 4632.71 | bwd_allreduce: 4.56 | step: 41.04
- 33%|███▎      | 1899/5800 [5:21:08<7:30:40,  6.93s/it]                                                       {'loss': 0.0708, 'grad_norm': 4.494131088256836, 'learning_rate': 3.14170835325847e-05, 'epoch': 16.37}
- 33%|███▎      | 1899/5800 [5:21:08<7:30:40,  6.93s/it]score1 tensor([[0.3789],
-        [0.3848],
-        [0.4082],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.4688, 0.4180, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:45,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:07:45,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.75 | bwd_microstep: 4633.64 | bwd_inner_microstep: 4629.01 | bwd_allreduce_microstep: 4.55 | step_microstep: 40.89
-[2025-01-25 14:07:45,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.71 | bwd: 4633.66 | bwd_inner: 4629.00 | bwd_allreduce: 4.59 | step: 40.90
- 33%|███▎      | 1900/5800 [5:21:15<7:30:10,  6.93s/it]                                                       {'loss': 0.062, 'grad_norm': 4.429121971130371, 'learning_rate': 3.140791215029347e-05, 'epoch': 16.38}
- 33%|███▎      | 1900/5800 [5:21:15<7:30:10,  6.93s/it]score1 tensor([[0.5391],
-        [0.4902],
-        [0.5508],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4141, 0.5508, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0815, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:52,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:07:52,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.12 | bwd_microstep: 4589.74 | bwd_inner_microstep: 4585.03 | bwd_allreduce_microstep: 4.60 | step_microstep: 40.89
-[2025-01-25 14:07:52,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.09 | bwd: 4589.76 | bwd_inner: 4585.03 | bwd_allreduce: 4.66 | step: 40.90
- 33%|███▎      | 1901/5800 [5:21:22<7:28:50,  6.91s/it]                                                       {'loss': 0.0815, 'grad_norm': 2.452800989151001, 'learning_rate': 3.1398737210817274e-05, 'epoch': 16.39}
- 33%|███▎      | 1901/5800 [5:21:22<7:28:50,  6.91s/it]score1 tensor([[0.4707],
-        [0.4805],
-        [0.4766],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4922, 0.4922, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:07:59,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.83 | optimizer_step: 4.36
-[2025-01-25 14:07:59,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.75 | bwd_microstep: 4644.61 | bwd_inner_microstep: 4639.72 | bwd_allreduce_microstep: 4.81 | step_microstep: 54.44
-[2025-01-25 14:07:59,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.71 | bwd: 4644.64 | bwd_inner: 4639.72 | bwd_allreduce: 4.84 | step: 54.45
- 33%|███▎      | 1902/5800 [5:21:29<7:29:18,  6.92s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.288153171539307, 'learning_rate': 3.1389558717017036e-05, 'epoch': 16.4}
- 33%|███▎      | 1902/5800 [5:21:29<7:29:18,  6.92s/it]score1 tensor([[0.4727],
-        [0.4043],
-        [0.4297],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4922, 0.3809, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:06,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.07 | optimizer_step: 4.37
-[2025-01-25 14:08:06,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.77 | bwd_microstep: 4639.84 | bwd_inner_microstep: 4632.23 | bwd_allreduce_microstep: 7.45 | step_microstep: 69.89
-[2025-01-25 14:08:06,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.72 | bwd: 4639.90 | bwd_inner: 4632.23 | bwd_allreduce: 7.53 | step: 69.91
- 33%|███▎      | 1903/5800 [5:21:36<7:30:06,  6.93s/it]                                                       {'loss': 0.0435, 'grad_norm': 4.548338413238525, 'learning_rate': 3.1380376671754756e-05, 'epoch': 16.41}
- 33%|███▎      | 1903/5800 [5:21:36<7:30:06,  6.93s/it]score1 tensor([[0.5117],
-        [0.5078],
-        [0.4863],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5273, 0.5117, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:13,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:08:13,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.68 | bwd_microstep: 4642.77 | bwd_inner_microstep: 4638.15 | bwd_allreduce_microstep: 4.53 | step_microstep: 52.32
-[2025-01-25 14:08:13,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.64 | bwd: 4642.79 | bwd_inner: 4638.16 | bwd_allreduce: 4.57 | step: 52.33
- 33%|███▎      | 1904/5800 [5:21:43<7:30:00,  6.93s/it]                                                       {'loss': 0.0176, 'grad_norm': 9.000245094299316, 'learning_rate': 3.1371191077893574e-05, 'epoch': 16.41}
- 33%|███▎      | 1904/5800 [5:21:43<7:30:00,  6.93s/it]score1 tensor([[0.3672],
-        [0.5039],
-        [0.5195],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4980, 0.6133, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:20,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 14:08:20,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.57 | bwd_microstep: 4642.31 | bwd_inner_microstep: 4637.75 | bwd_allreduce_microstep: 4.45 | step_microstep: 46.35
-[2025-01-25 14:08:20,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.54 | bwd: 4642.34 | bwd_inner: 4637.75 | bwd_allreduce: 4.51 | step: 46.36
- 33%|███▎      | 1905/5800 [5:21:50<7:30:11,  6.93s/it]                                                       {'loss': 0.043, 'grad_norm': 0.9458140730857849, 'learning_rate': 3.13620019382977e-05, 'epoch': 16.42}
- 33%|███▎      | 1905/5800 [5:21:50<7:30:11,  6.93s/it]score1 tensor([[0.3906],
-        [0.4141],
-        [0.4512],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4668, 0.5039, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0620, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:26,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 14:08:26,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.99 | bwd_microstep: 4641.61 | bwd_inner_microstep: 4634.50 | bwd_allreduce_microstep: 6.91 | step_microstep: 47.59
-[2025-01-25 14:08:26,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.96 | bwd: 4641.67 | bwd_inner: 4634.50 | bwd_allreduce: 7.01 | step: 47.56
- 33%|███▎      | 1906/5800 [5:21:56<7:29:55,  6.93s/it]                                                       {'loss': 0.062, 'grad_norm': 8.620661735534668, 'learning_rate': 3.135280925583248e-05, 'epoch': 16.43}
- 33%|███▎      | 1906/5800 [5:21:56<7:29:55,  6.93s/it]score1 tensor([[0.5781],
-        [0.5547],
-        [0.4941],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6016, 0.4648, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:33,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:08:33,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.40 | bwd_microstep: 4642.49 | bwd_inner_microstep: 4637.84 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.55
-[2025-01-25 14:08:33,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.37 | bwd: 4642.51 | bwd_inner: 4637.84 | bwd_allreduce: 4.60 | step: 42.55
- 33%|███▎      | 1907/5800 [5:22:03<7:29:43,  6.93s/it]                                                       {'loss': 0.0347, 'grad_norm': 4.950976848602295, 'learning_rate': 3.1343613033364335e-05, 'epoch': 16.44}
- 33%|███▎      | 1907/5800 [5:22:03<7:29:43,  6.93s/it]score1 tensor([[0.5625],
-        [0.5820],
-        [0.6172],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6328, 0.6172, 0.6133, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:40,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 14:08:40,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.48 | bwd_microstep: 4645.39 | bwd_inner_microstep: 4638.98 | bwd_allreduce_microstep: 6.22 | step_microstep: 50.52
-[2025-01-25 14:08:40,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.43 | bwd: 4645.44 | bwd_inner: 4638.98 | bwd_allreduce: 6.31 | step: 50.46
- 33%|███▎      | 1908/5800 [5:22:10<7:29:46,  6.93s/it]                                                       {'loss': 0.0366, 'grad_norm': 0.5811206698417664, 'learning_rate': 3.133441327376083e-05, 'epoch': 16.45}
- 33%|███▎      | 1908/5800 [5:22:10<7:29:46,  6.93s/it]score1 tensor([[0.4453],
-        [0.6055],
-        [0.5586],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.5742, 0.4922, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0669, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:47,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 14:08:47,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.79 | bwd_microstep: 4637.99 | bwd_inner_microstep: 4633.50 | bwd_allreduce_microstep: 4.40 | step_microstep: 52.15
-[2025-01-25 14:08:47,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.73 | bwd: 4638.01 | bwd_inner: 4633.50 | bwd_allreduce: 4.44 | step: 52.17
- 33%|███▎      | 1909/5800 [5:22:17<7:29:55,  6.94s/it]                                                       {'loss': 0.0669, 'grad_norm': 9.44660758972168, 'learning_rate': 3.1325209979890594e-05, 'epoch': 16.46}
- 33%|███▎      | 1909/5800 [5:22:17<7:29:55,  6.94s/it]score1 tensor([[0.5195],
-        [0.4805],
-        [0.5195],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3789, 0.4609, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:08:54,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 14:08:54,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.88 | bwd_microstep: 4640.56 | bwd_inner_microstep: 4635.78 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.20
-[2025-01-25 14:08:54,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.84 | bwd: 4640.59 | bwd_inner: 4635.78 | bwd_allreduce: 4.75 | step: 41.21
- 33%|███▎      | 1910/5800 [5:22:24<7:29:57,  6.94s/it]                                                       {'loss': 0.1133, 'grad_norm': 9.183616638183594, 'learning_rate': 3.131600315462337e-05, 'epoch': 16.47}
- 33%|███▎      | 1910/5800 [5:22:24<7:29:57,  6.94s/it]score1 tensor([[0.5195],
-        [0.5820],
-        [0.6367],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4805, 0.5547, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0811, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:01,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 14:09:01,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.58 | bwd_microstep: 4643.80 | bwd_inner_microstep: 4636.11 | bwd_allreduce_microstep: 7.61 | step_microstep: 47.52
-[2025-01-25 14:09:01,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.50 | bwd: 4643.83 | bwd_inner: 4636.10 | bwd_allreduce: 7.65 | step: 47.53
- 33%|███▎      | 1911/5800 [5:22:31<7:29:55,  6.94s/it]                                                       {'loss': 0.0811, 'grad_norm': 10.207103729248047, 'learning_rate': 3.130679280083002e-05, 'epoch': 16.47}
- 33%|███▎      | 1911/5800 [5:22:31<7:29:55,  6.94s/it]score1 tensor([[0.5664],
-        [0.5742],
-        [0.3594],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5391, 0.3652, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:08,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 14:09:08,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.34 | bwd_microstep: 4642.81 | bwd_inner_microstep: 4637.88 | bwd_allreduce_microstep: 4.81 | step_microstep: 41.25
-[2025-01-25 14:09:08,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.27 | bwd: 4642.84 | bwd_inner: 4637.87 | bwd_allreduce: 4.87 | step: 41.26
- 33%|███▎      | 1912/5800 [5:22:38<7:29:37,  6.94s/it]                                                       {'loss': 0.0601, 'grad_norm': 5.547024726867676, 'learning_rate': 3.1297578921382474e-05, 'epoch': 16.48}
- 33%|███▎      | 1912/5800 [5:22:38<7:29:37,  6.94s/it]score1 tensor([[0.5703],
-        [0.3984],
-        [0.5938],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.3750, 0.5391, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:15,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 14:09:15,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.48 | bwd_microstep: 4595.46 | bwd_inner_microstep: 4590.02 | bwd_allreduce_microstep: 5.37 | step_microstep: 48.87
-[2025-01-25 14:09:15,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.44 | bwd: 4595.49 | bwd_inner: 4590.02 | bwd_allreduce: 5.41 | step: 48.87
- 33%|███▎      | 1913/5800 [5:22:45<7:28:32,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 2.2020418643951416, 'learning_rate': 3.1288361519153795e-05, 'epoch': 16.49}
- 33%|███▎      | 1913/5800 [5:22:45<7:28:32,  6.92s/it]score1 tensor([[0.4375],
-        [0.5000],
-        [0.5117],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5508, 0.5469, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:22,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:09:22,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.11 | bwd_microstep: 4634.32 | bwd_inner_microstep: 4627.94 | bwd_allreduce_microstep: 6.30 | step_microstep: 42.52
-[2025-01-25 14:09:22,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.07 | bwd: 4634.34 | bwd_inner: 4627.94 | bwd_allreduce: 6.34 | step: 42.53
- 33%|███▎      | 1914/5800 [5:22:52<7:28:15,  6.92s/it]                                                       {'loss': 0.0537, 'grad_norm': 4.485086441040039, 'learning_rate': 3.1279140597018135e-05, 'epoch': 16.5}
- 33%|███▎      | 1914/5800 [5:22:52<7:28:15,  6.92s/it]score1 tensor([[0.5234],
-        [0.5742],
-        [0.6055],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5508, 0.5391, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:29,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:09:29,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.50 | bwd_microstep: 4633.79 | bwd_inner_microstep: 4628.81 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.01
-[2025-01-25 14:09:29,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.44 | bwd: 4633.82 | bwd_inner: 4628.81 | bwd_allreduce: 4.94 | step: 44.01
- 33%|███▎      | 1915/5800 [5:22:59<7:27:57,  6.92s/it]                                                       {'loss': 0.0386, 'grad_norm': 0.8627259135246277, 'learning_rate': 3.1269916157850716e-05, 'epoch': 16.51}
- 33%|███▎      | 1915/5800 [5:22:59<7:27:57,  6.92s/it]score1 tensor([[0.3848],
-        [0.3242],
-        [0.3926],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.3945, 0.5117, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0718, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:36,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 14:09:36,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.11 | bwd_microstep: 4633.93 | bwd_inner_microstep: 4629.20 | bwd_allreduce_microstep: 4.64 | step_microstep: 45.98
-[2025-01-25 14:09:36,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.08 | bwd: 4633.95 | bwd_inner: 4629.20 | bwd_allreduce: 4.68 | step: 45.98
- 33%|███▎      | 1916/5800 [5:23:06<7:27:47,  6.92s/it]                                                       {'loss': 0.0718, 'grad_norm': 4.392560958862305, 'learning_rate': 3.126068820452789e-05, 'epoch': 16.52}
- 33%|███▎      | 1916/5800 [5:23:06<7:27:47,  6.92s/it]score1 tensor([[0.4434],
-        [0.5156],
-        [0.4375],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5977, 0.5156, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:43,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:09:43,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.05 | bwd_microstep: 4639.11 | bwd_inner_microstep: 4634.05 | bwd_allreduce_microstep: 4.94 | step_microstep: 48.79
-[2025-01-25 14:09:43,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.02 | bwd: 4639.13 | bwd_inner: 4634.05 | bwd_allreduce: 5.01 | step: 48.80
- 33%|███▎      | 1917/5800 [5:23:13<7:27:50,  6.92s/it]                                                       {'loss': 0.0762, 'grad_norm': 8.681478500366211, 'learning_rate': 3.12514567399271e-05, 'epoch': 16.53}
- 33%|███▎      | 1917/5800 [5:23:13<7:27:50,  6.92s/it]score1 tensor([[0.4375],
-        [0.5078],
-        [0.4336],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5625, 0.4570, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:50,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 14:09:50,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.94 | bwd_microstep: 4640.40 | bwd_inner_microstep: 4635.09 | bwd_allreduce_microstep: 5.22 | step_microstep: 52.76
-[2025-01-25 14:09:50,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.92 | bwd: 4640.43 | bwd_inner: 4635.09 | bwd_allreduce: 5.27 | step: 52.76
- 33%|███▎      | 1918/5800 [5:23:20<7:27:57,  6.92s/it]                                                       {'loss': 0.0376, 'grad_norm': 4.39774227142334, 'learning_rate': 3.124222176692686e-05, 'epoch': 16.53}
- 33%|███▎      | 1918/5800 [5:23:20<7:27:57,  6.92s/it]score1 tensor([[0.6055],
-        [0.4766],
-        [0.5195],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4805, 0.5781, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:09:57,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 14:09:57,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.33 | bwd_microstep: 4643.88 | bwd_inner_microstep: 4639.33 | bwd_allreduce_microstep: 4.48 | step_microstep: 46.38
-[2025-01-25 14:09:57,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.30 | bwd: 4643.90 | bwd_inner: 4639.33 | bwd_allreduce: 4.51 | step: 46.39
- 33%|███▎      | 1919/5800 [5:23:27<7:28:00,  6.93s/it]                                                       {'loss': 0.0322, 'grad_norm': 9.304676055908203, 'learning_rate': 3.12329832884068e-05, 'epoch': 16.54}
- 33%|███▎      | 1919/5800 [5:23:27<7:28:00,  6.93s/it]score1 tensor([[0.4961],
-        [0.5703],
-        [0.5430],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4941, 0.4844, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:04,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.09 | optimizer_step: 4.64
-[2025-01-25 14:10:04,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.92 | bwd_microstep: 4644.79 | bwd_inner_microstep: 4637.48 | bwd_allreduce_microstep: 7.10 | step_microstep: 66.38
-[2025-01-25 14:10:04,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.89 | bwd: 4644.85 | bwd_inner: 4637.48 | bwd_allreduce: 7.23 | step: 66.40
- 33%|███▎      | 1920/5800 [5:23:33<7:28:45,  6.94s/it]                                                       {'loss': 0.0581, 'grad_norm': 4.741851806640625, 'learning_rate': 3.122374130724765e-05, 'epoch': 16.55}
- 33%|███▎      | 1920/5800 [5:23:34<7:28:45,  6.94s/it]score1 tensor([[0.4238],
-        [0.4395],
-        [0.6758],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.3418, 0.6445, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:10,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:10:10,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2174.77 | bwd_microstep: 4639.60 | bwd_inner_microstep: 4634.01 | bwd_allreduce_microstep: 5.46 | step_microstep: 45.90
-[2025-01-25 14:10:10,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2174.73 | bwd: 4639.64 | bwd_inner: 4634.01 | bwd_allreduce: 5.53 | step: 45.90
- 33%|███▎      | 1921/5800 [5:23:40<7:28:38,  6.94s/it]                                                       {'loss': 0.0781, 'grad_norm': 9.486093521118164, 'learning_rate': 3.121449582633121e-05, 'epoch': 16.56}
- 33%|███▎      | 1921/5800 [5:23:40<7:28:38,  6.94s/it]score1 tensor([[0.6250],
-        [0.5781],
-        [0.4824],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5625, 0.4512, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:17,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 14:10:17,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.89 | bwd_microstep: 4631.20 | bwd_inner_microstep: 4626.20 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.50
-[2025-01-25 14:10:17,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.85 | bwd: 4631.22 | bwd_inner: 4626.20 | bwd_allreduce: 4.96 | step: 44.51
- 33%|███▎      | 1922/5800 [5:23:47<7:27:59,  6.93s/it]                                                       {'loss': 0.0366, 'grad_norm': 4.3029632568359375, 'learning_rate': 3.120524684854038e-05, 'epoch': 16.57}
- 33%|███▎      | 1922/5800 [5:23:47<7:27:59,  6.93s/it]score1 tensor([[0.5195],
-        [0.5703],
-        [0.5117],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5195, 0.4473, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:24,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 14:10:24,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.19 | bwd_microstep: 4639.51 | bwd_inner_microstep: 4634.81 | bwd_allreduce_microstep: 4.61 | step_microstep: 41.55
-[2025-01-25 14:10:24,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.12 | bwd: 4639.54 | bwd_inner: 4634.81 | bwd_allreduce: 4.65 | step: 41.56
- 33%|███▎      | 1923/5800 [5:23:54<7:27:40,  6.93s/it]                                                       {'loss': 0.0435, 'grad_norm': 9.554580688476562, 'learning_rate': 3.119599437675916e-05, 'epoch': 16.58}
- 33%|███▎      | 1923/5800 [5:23:54<7:27:40,  6.93s/it]score1 tensor([[0.5703],
-        [0.3828],
-        [0.4980],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4004, 0.5156, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:31,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:10:31,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.12 | bwd_microstep: 4633.15 | bwd_inner_microstep: 4628.40 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.09
-[2025-01-25 14:10:31,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.08 | bwd: 4633.18 | bwd_inner: 4628.39 | bwd_allreduce: 4.72 | step: 42.10
- 33%|███▎      | 1924/5800 [5:24:01<7:27:08,  6.92s/it]                                                       {'loss': 0.042, 'grad_norm': 0.8840880990028381, 'learning_rate': 3.118673841387262e-05, 'epoch': 16.59}
- 33%|███▎      | 1924/5800 [5:24:01<7:27:08,  6.92s/it]score1 tensor([[0.5312],
-        [0.5664],
-        [0.5234],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5508, 0.4961, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:38,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:10:38,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.74 | bwd_microstep: 4626.92 | bwd_inner_microstep: 4622.37 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.44
-[2025-01-25 14:10:38,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4626.95 | bwd_inner: 4622.37 | bwd_allreduce: 4.50 | step: 41.45
- 33%|███▎      | 1925/5800 [5:24:08<7:26:37,  6.92s/it]                                                       {'loss': 0.043, 'grad_norm': 0.5364620685577393, 'learning_rate': 3.117747896276694e-05, 'epoch': 16.59}
- 33%|███▎      | 1925/5800 [5:24:08<7:26:37,  6.92s/it]score1 tensor([[0.5273],
-        [0.4727],
-        [0.4961],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5000, 0.5273, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:45,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:10:45,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.40 | bwd_microstep: 4627.93 | bwd_inner_microstep: 4623.02 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.14
-[2025-01-25 14:10:45,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.36 | bwd: 4627.95 | bwd_inner: 4623.02 | bwd_allreduce: 4.86 | step: 43.16
- 33%|███▎      | 1926/5800 [5:24:15<7:26:18,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 4.524678707122803, 'learning_rate': 3.116821602632936e-05, 'epoch': 16.6}
- 33%|███▎      | 1926/5800 [5:24:15<7:26:18,  6.91s/it]score1 tensor([[0.4453],
-        [0.5391],
-        [0.4355],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5039, 0.4023, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:52,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:10:52,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.60 | bwd_microstep: 4636.62 | bwd_inner_microstep: 4631.72 | bwd_allreduce_microstep: 4.80 | step_microstep: 41.57
-[2025-01-25 14:10:52,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.57 | bwd: 4636.65 | bwd_inner: 4631.72 | bwd_allreduce: 4.85 | step: 41.57
- 33%|███▎      | 1927/5800 [5:24:22<7:26:24,  6.92s/it]                                                       {'loss': 0.0298, 'grad_norm': 8.62944221496582, 'learning_rate': 3.115894960744826e-05, 'epoch': 16.61}
- 33%|███▎      | 1927/5800 [5:24:22<7:26:24,  6.92s/it]score1 tensor([[0.4062],
-        [0.4629],
-        [0.4727],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.5195, 0.4570, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:10:59,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 14:10:59,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.01 | bwd_microstep: 4640.02 | bwd_inner_microstep: 4633.90 | bwd_allreduce_microstep: 6.04 | step_microstep: 51.33
-[2025-01-25 14:10:59,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.87 | bwd: 4640.05 | bwd_inner: 4633.90 | bwd_allreduce: 6.08 | step: 51.31
- 33%|███▎      | 1928/5800 [5:24:29<7:26:46,  6.92s/it]                                                       {'loss': 0.0396, 'grad_norm': 4.487732887268066, 'learning_rate': 3.1149679709013035e-05, 'epoch': 16.62}
- 33%|███▎      | 1928/5800 [5:24:29<7:26:46,  6.92s/it]score1 tensor([[0.4434],
-        [0.3867],
-        [0.5000],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4336, 0.4844, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:06,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.10 | optimizer_step: 4.36
-[2025-01-25 14:11:06,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.25 | bwd_microstep: 4633.47 | bwd_inner_microstep: 4626.26 | bwd_allreduce_microstep: 7.04 | step_microstep: 63.80
-[2025-01-25 14:11:06,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4633.51 | bwd_inner: 4626.26 | bwd_allreduce: 7.12 | step: 63.82
- 33%|███▎      | 1929/5800 [5:24:36<7:26:57,  6.93s/it]                                                       {'loss': 0.0278, 'grad_norm': 4.1501078605651855, 'learning_rate': 3.114040633391421e-05, 'epoch': 16.63}
- 33%|███▎      | 1929/5800 [5:24:36<7:26:57,  6.93s/it]score1 tensor([[0.4258],
-        [0.3613],
-        [0.3867],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.3516, 0.3457, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:13,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 14:11:13,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.61 | bwd_microstep: 4637.92 | bwd_inner_microstep: 4630.41 | bwd_allreduce_microstep: 7.39 | step_microstep: 43.45
-[2025-01-25 14:11:13,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.58 | bwd: 4637.95 | bwd_inner: 4630.41 | bwd_allreduce: 7.46 | step: 43.46
- 33%|███▎      | 1930/5800 [5:24:43<7:27:00,  6.93s/it]                                                       {'loss': 0.0322, 'grad_norm': 3.9932639598846436, 'learning_rate': 3.113112948504339e-05, 'epoch': 16.64}
- 33%|███▎      | 1930/5800 [5:24:43<7:27:00,  6.93s/it]score1 tensor([[0.5078],
-        [0.4355],
-        [0.4180],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4453, 0.3887, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:20,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:11:20,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.23 | bwd_microstep: 4629.76 | bwd_inner_microstep: 4624.44 | bwd_allreduce_microstep: 5.22 | step_microstep: 44.70
-[2025-01-25 14:11:20,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.20 | bwd: 4629.79 | bwd_inner: 4624.44 | bwd_allreduce: 5.26 | step: 44.70
- 33%|███▎      | 1931/5800 [5:24:50<7:26:54,  6.93s/it]                                                       {'loss': 0.0264, 'grad_norm': 4.476679801940918, 'learning_rate': 3.112184916529326e-05, 'epoch': 16.65}
- 33%|███▎      | 1931/5800 [5:24:50<7:26:54,  6.93s/it]score1 tensor([[0.4102],
-        [0.4570],
-        [0.4551],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5391, 0.4883, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:27,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 14:11:27,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.14 | bwd_microstep: 4637.79 | bwd_inner_microstep: 4632.36 | bwd_allreduce_microstep: 5.35 | step_microstep: 46.10
-[2025-01-25 14:11:27,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.11 | bwd: 4637.81 | bwd_inner: 4632.36 | bwd_allreduce: 5.39 | step: 46.11
- 33%|███▎      | 1932/5800 [5:24:57<7:26:43,  6.93s/it]                                                       {'loss': 0.0566, 'grad_norm': 8.486061096191406, 'learning_rate': 3.111256537755757e-05, 'epoch': 16.66}
- 33%|███▎      | 1932/5800 [5:24:57<7:26:43,  6.93s/it]score1 tensor([[0.4961],
-        [0.5078],
-        [0.4082],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6172, 0.4512, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:34,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:11:34,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.78 | bwd_microstep: 4630.90 | bwd_inner_microstep: 4626.03 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.45
-[2025-01-25 14:11:34,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.74 | bwd: 4630.93 | bwd_inner: 4626.03 | bwd_allreduce: 4.83 | step: 43.45
- 33%|███▎      | 1933/5800 [5:25:03<7:26:08,  6.92s/it]                                                       {'loss': 0.0513, 'grad_norm': 8.593661308288574, 'learning_rate': 3.110327812473119e-05, 'epoch': 16.66}
- 33%|███▎      | 1933/5800 [5:25:03<7:26:08,  6.92s/it]score1 tensor([[0.4434],
-        [0.5664],
-        [0.5898],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4863, 0.6250, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:40,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 14:11:40,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.57 | bwd_microstep: 4641.78 | bwd_inner_microstep: 4636.42 | bwd_allreduce_microstep: 5.27 | step_microstep: 45.43
-[2025-01-25 14:11:40,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.53 | bwd: 4641.80 | bwd_inner: 4636.42 | bwd_allreduce: 5.32 | step: 45.44
- 33%|███▎      | 1934/5800 [5:25:10<7:26:06,  6.92s/it]                                                       {'loss': 0.0356, 'grad_norm': 4.357010364532471, 'learning_rate': 3.1093987409710015e-05, 'epoch': 16.67}
- 33%|███▎      | 1934/5800 [5:25:10<7:26:06,  6.92s/it]score1 tensor([[0.5508],
-        [0.4512],
-        [0.6055],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4004, 0.6445, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:47,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 14:11:47,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.37 | bwd_microstep: 4630.93 | bwd_inner_microstep: 4626.06 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.44
-[2025-01-25 14:11:47,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.33 | bwd: 4630.96 | bwd_inner: 4626.06 | bwd_allreduce: 4.82 | step: 43.45
- 33%|███▎      | 1935/5800 [5:25:17<7:25:56,  6.92s/it]                                                       {'loss': 0.0742, 'grad_norm': 4.232607364654541, 'learning_rate': 3.1084693235391084e-05, 'epoch': 16.68}
- 33%|███▎      | 1935/5800 [5:25:17<7:25:56,  6.92s/it]score1 tensor([[0.4785],
-        [0.4629],
-        [0.5508],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4980, 0.5195, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:11:54,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:11:54,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.52 | bwd_microstep: 4629.50 | bwd_inner_microstep: 4624.62 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.58
-[2025-01-25 14:11:54,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.48 | bwd: 4629.53 | bwd_inner: 4624.62 | bwd_allreduce: 4.84 | step: 43.59
- 33%|███▎      | 1936/5800 [5:25:24<7:25:40,  6.92s/it]                                                       {'loss': 0.0381, 'grad_norm': 4.614284515380859, 'learning_rate': 3.107539560467246e-05, 'epoch': 16.69}
- 33%|███▎      | 1936/5800 [5:25:24<7:25:40,  6.92s/it]score1 tensor([[0.6016],
-        [0.4863],
-        [0.5586],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.4707, 0.4453, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0752, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:01,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:12:01,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.03 | bwd_microstep: 4633.59 | bwd_inner_microstep: 4627.91 | bwd_allreduce_microstep: 5.59 | step_microstep: 44.69
-[2025-01-25 14:12:01,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.00 | bwd: 4633.61 | bwd_inner: 4627.92 | bwd_allreduce: 5.63 | step: 44.70
- 33%|███▎      | 1937/5800 [5:25:31<7:25:24,  6.92s/it]                                                       {'loss': 0.0752, 'grad_norm': 4.27338171005249, 'learning_rate': 3.106609452045331e-05, 'epoch': 16.7}
- 33%|███▎      | 1937/5800 [5:25:31<7:25:24,  6.92s/it]score1 tensor([[0.5508],
-        [0.5195],
-        [0.5000],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4727, 0.5156, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:08,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 14:12:08,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.30 | bwd_microstep: 4638.43 | bwd_inner_microstep: 4632.11 | bwd_allreduce_microstep: 6.23 | step_microstep: 43.06
-[2025-01-25 14:12:08,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.26 | bwd: 4638.45 | bwd_inner: 4632.11 | bwd_allreduce: 6.27 | step: 43.06
- 33%|███▎      | 1938/5800 [5:25:38<7:25:20,  6.92s/it]                                                       {'loss': 0.0352, 'grad_norm': 4.765144348144531, 'learning_rate': 3.105678998563387e-05, 'epoch': 16.71}
- 33%|███▎      | 1938/5800 [5:25:38<7:25:20,  6.92s/it]score1 tensor([[0.4023],
-        [0.5156],
-        [0.4395],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.4004, 0.4062, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:15,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 14:12:15,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.11 | bwd_microstep: 4638.29 | bwd_inner_microstep: 4630.93 | bwd_allreduce_microstep: 7.25 | step_microstep: 52.55
-[2025-01-25 14:12:15,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.07 | bwd: 4638.31 | bwd_inner: 4630.93 | bwd_allreduce: 7.31 | step: 52.56
- 33%|███▎      | 1939/5800 [5:25:45<7:26:17,  6.94s/it]                                                       {'loss': 0.0596, 'grad_norm': 4.158584117889404, 'learning_rate': 3.104748200311547e-05, 'epoch': 16.72}
- 33%|███▎      | 1939/5800 [5:25:45<7:26:17,  6.94s/it]score1 tensor([[0.4824],
-        [0.5312],
-        [0.4961],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.5508, 0.4688, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:22,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 14:12:22,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.92 | bwd_microstep: 4631.22 | bwd_inner_microstep: 4624.54 | bwd_allreduce_microstep: 6.60 | step_microstep: 58.57
-[2025-01-25 14:12:22,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.89 | bwd: 4631.25 | bwd_inner: 4624.54 | bwd_allreduce: 6.64 | step: 58.58
- 33%|███▎      | 1940/5800 [5:25:52<7:26:29,  6.94s/it]                                                       {'loss': 0.0244, 'grad_norm': 4.240230560302734, 'learning_rate': 3.1038170575800483e-05, 'epoch': 16.72}
- 33%|███▎      | 1940/5800 [5:25:52<7:26:29,  6.94s/it]score1 tensor([[0.5391],
-        [0.4668],
-        [0.4590],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6875, 0.4941, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1050, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:29,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:12:29,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.96 | bwd_microstep: 4636.86 | bwd_inner_microstep: 4631.33 | bwd_allreduce_microstep: 5.42 | step_microstep: 43.57
-[2025-01-25 14:12:29,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.92 | bwd: 4636.89 | bwd_inner: 4631.34 | bwd_allreduce: 5.48 | step: 43.57
- 33%|███▎      | 1941/5800 [5:25:59<7:26:02,  6.94s/it]                                                       {'loss': 0.105, 'grad_norm': 9.093562126159668, 'learning_rate': 3.102885570659239e-05, 'epoch': 16.73}
- 33%|███▎      | 1941/5800 [5:25:59<7:26:02,  6.94s/it]score1 tensor([[0.2949],
-        [0.4219],
-        [0.4434],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4863, 0.4961, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:36,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 14:12:36,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.38 | bwd_microstep: 4586.96 | bwd_inner_microstep: 4582.14 | bwd_allreduce_microstep: 4.73 | step_microstep: 43.13
-[2025-01-25 14:12:36,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.33 | bwd: 4586.98 | bwd_inner: 4582.14 | bwd_allreduce: 4.78 | step: 43.14
- 33%|███▎      | 1942/5800 [5:26:06<7:24:34,  6.91s/it]                                                       {'loss': 0.0903, 'grad_norm': 6.076432228088379, 'learning_rate': 3.101953739839572e-05, 'epoch': 16.74}
- 33%|███▎      | 1942/5800 [5:26:06<7:24:34,  6.91s/it]score1 tensor([[0.3574],
-        [0.4102],
-        [0.4629],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.4277, 0.4844, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:43,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 14:12:43,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.83 | bwd_microstep: 4637.33 | bwd_inner_microstep: 4631.90 | bwd_allreduce_microstep: 5.30 | step_microstep: 43.57
-[2025-01-25 14:12:43,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.79 | bwd: 4637.36 | bwd_inner: 4631.90 | bwd_allreduce: 5.37 | step: 43.59
- 34%|███▎      | 1943/5800 [5:26:13<7:24:33,  6.92s/it]                                                       {'loss': 0.0386, 'grad_norm': 4.406393527984619, 'learning_rate': 3.1010215654116075e-05, 'epoch': 16.75}
- 34%|███▎      | 1943/5800 [5:26:13<7:24:33,  6.92s/it]score1 tensor([[0.4336],
-        [0.4453],
-        [0.3496],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4707, 0.4043, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0942, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:50,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.60 | optimizer_step: 4.36
-[2025-01-25 14:12:50,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.79 | bwd_microstep: 4636.90 | bwd_inner_microstep: 4632.16 | bwd_allreduce_microstep: 4.67 | step_microstep: 45.96
-[2025-01-25 14:12:50,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.75 | bwd: 4636.93 | bwd_inner: 4632.16 | bwd_allreduce: 4.70 | step: 45.97
- 34%|███▎      | 1944/5800 [5:26:20<7:24:34,  6.92s/it]                                                       {'loss': 0.0942, 'grad_norm': 8.384655952453613, 'learning_rate': 3.100089047666015e-05, 'epoch': 16.76}
- 34%|███▎      | 1944/5800 [5:26:20<7:24:34,  6.92s/it]score1 tensor([[0.5078],
-        [0.3926],
-        [0.4316],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5039, 0.4492, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:12:57,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 14:12:57,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.04 | bwd_microstep: 4628.78 | bwd_inner_microstep: 4623.68 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.79
-[2025-01-25 14:12:57,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.00 | bwd: 4628.81 | bwd_inner: 4623.68 | bwd_allreduce: 5.06 | step: 44.80
- 34%|███▎      | 1945/5800 [5:26:27<7:24:14,  6.91s/it]                                                       {'loss': 0.0615, 'grad_norm': 8.755136489868164, 'learning_rate': 3.099156186893569e-05, 'epoch': 16.77}
- 34%|███▎      | 1945/5800 [5:26:27<7:24:14,  6.91s/it]score1 tensor([[0.4961],
-        [0.4609],
-        [0.4102],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4668, 0.3340, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:03,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:13:03,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.89 | bwd_microstep: 4635.55 | bwd_inner_microstep: 4630.31 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.92
-[2025-01-25 14:13:03,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.86 | bwd: 4635.57 | bwd_inner: 4630.31 | bwd_allreduce: 5.18 | step: 43.93
- 34%|███▎      | 1946/5800 [5:26:33<7:24:09,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.5235724449157715, 'learning_rate': 3.098222983385152e-05, 'epoch': 16.78}
- 34%|███▎      | 1946/5800 [5:26:33<7:24:09,  6.91s/it]score1 tensor([[0.5508],
-        [0.4258],
-        [0.5430],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.3613, 0.5547, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:10,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 14:13:10,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4637.37 | bwd_inner_microstep: 4632.51 | bwd_allreduce_microstep: 4.78 | step_microstep: 42.31
-[2025-01-25 14:13:10,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4637.40 | bwd_inner: 4632.52 | bwd_allreduce: 4.82 | step: 42.32
- 34%|███▎      | 1947/5800 [5:26:40<7:24:08,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 5.032413005828857, 'learning_rate': 3.0972894374317534e-05, 'epoch': 16.78}
- 34%|███▎      | 1947/5800 [5:26:40<7:24:08,  6.92s/it]score1 tensor([[0.5508],
-        [0.5000],
-        [0.5938],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4023, 0.4473, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0874, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:17,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:13:17,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.74 | bwd_microstep: 4636.31 | bwd_inner_microstep: 4631.21 | bwd_allreduce_microstep: 5.01 | step_microstep: 41.09
-[2025-01-25 14:13:17,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.70 | bwd: 4636.33 | bwd_inner: 4631.21 | bwd_allreduce: 5.05 | step: 41.09
- 34%|███▎      | 1948/5800 [5:26:47<7:23:57,  6.92s/it]                                                       {'loss': 0.0874, 'grad_norm': 9.582024574279785, 'learning_rate': 3.096355549324468e-05, 'epoch': 16.79}
- 34%|███▎      | 1948/5800 [5:26:47<7:23:57,  6.92s/it]score1 tensor([[0.5938],
-        [0.5938],
-        [0.5352],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4512, 0.4551, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:24,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.97 | optimizer_step: 4.36
-[2025-01-25 14:13:24,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.96 | bwd_microstep: 4637.78 | bwd_inner_microstep: 4630.61 | bwd_allreduce_microstep: 6.96 | step_microstep: 81.66
-[2025-01-25 14:13:24,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.92 | bwd: 4637.84 | bwd_inner: 4630.61 | bwd_allreduce: 7.09 | step: 81.70
- 34%|███▎      | 1949/5800 [5:26:54<7:24:39,  6.93s/it]                                                       {'loss': 0.1289, 'grad_norm': 9.76904010772705, 'learning_rate': 3.0954213193544994e-05, 'epoch': 16.8}
- 34%|███▎      | 1949/5800 [5:26:54<7:24:39,  6.93s/it]score1 tensor([[0.7656],
-        [0.5820],
-        [0.5820],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4453, 0.4785, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1177, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:31,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:13:31,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.59 | bwd_microstep: 4636.34 | bwd_inner_microstep: 4630.92 | bwd_allreduce_microstep: 5.31 | step_microstep: 42.75
-[2025-01-25 14:13:31,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.55 | bwd: 4636.37 | bwd_inner: 4630.92 | bwd_allreduce: 5.37 | step: 42.76
- 34%|███▎      | 1950/5800 [5:27:01<7:24:39,  6.93s/it]                                                       {'loss': 0.1177, 'grad_norm': 10.174951553344727, 'learning_rate': 3.094486747813156e-05, 'epoch': 16.81}
- 34%|███▎      | 1950/5800 [5:27:01<7:24:39,  6.93s/it]score1 tensor([[0.6133],
-        [0.5664],
-        [0.6406],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.4316, 0.5117, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:38,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.01 | optimizer_step: 4.37
-[2025-01-25 14:13:38,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.96 | bwd_microstep: 4624.72 | bwd_inner_microstep: 4619.97 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.94
-[2025-01-25 14:13:38,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.93 | bwd: 4624.75 | bwd_inner: 4619.97 | bwd_allreduce: 4.71 | step: 42.95
- 34%|███▎      | 1951/5800 [5:27:08<7:24:09,  6.92s/it]                                                       {'loss': 0.1289, 'grad_norm': 5.043484210968018, 'learning_rate': 3.093551834991853e-05, 'epoch': 16.82}
- 34%|███▎      | 1951/5800 [5:27:08<7:24:09,  6.92s/it]score1 tensor([[0.6914],
-        [0.6602],
-        [0.6602],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.4492, 0.6289, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:45,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:13:45,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.12 | bwd_microstep: 4633.48 | bwd_inner_microstep: 4628.10 | bwd_allreduce_microstep: 5.24 | step_microstep: 44.13
-[2025-01-25 14:13:45,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.09 | bwd: 4633.51 | bwd_inner: 4628.10 | bwd_allreduce: 5.31 | step: 44.14
- 34%|███▎      | 1952/5800 [5:27:15<7:23:50,  6.92s/it]                                                       {'loss': 0.0977, 'grad_norm': 10.417657852172852, 'learning_rate': 3.092616581182114e-05, 'epoch': 16.83}
- 34%|███▎      | 1952/5800 [5:27:15<7:23:50,  6.92s/it]score1 tensor([[0.7148],
-        [0.5820],
-        [0.6562],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4863, 0.5547, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1670, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:52,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 14:13:52,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.60 | bwd_microstep: 4626.57 | bwd_inner_microstep: 4621.30 | bwd_allreduce_microstep: 5.15 | step_microstep: 49.86
-[2025-01-25 14:13:52,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.57 | bwd: 4626.59 | bwd_inner: 4621.30 | bwd_allreduce: 5.21 | step: 49.88
- 34%|███▎      | 1953/5800 [5:27:22<7:23:28,  6.92s/it]                                                       {'loss': 0.167, 'grad_norm': 10.13373851776123, 'learning_rate': 3.0916809866755655e-05, 'epoch': 16.84}
- 34%|███▎      | 1953/5800 [5:27:22<7:23:28,  6.92s/it]score1 tensor([[0.5742],
-        [0.6133],
-        [0.6094],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4668, 0.6094, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:13:59,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 14:13:59,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.69 | bwd_microstep: 4578.13 | bwd_inner_microstep: 4572.88 | bwd_allreduce_microstep: 5.15 | step_microstep: 43.39
-[2025-01-25 14:13:59,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.65 | bwd: 4578.15 | bwd_inner: 4572.88 | bwd_allreduce: 5.20 | step: 43.39
- 34%|███▎      | 1954/5800 [5:27:29<7:22:27,  6.90s/it]                                                       {'loss': 0.0664, 'grad_norm': 7.399521827697754, 'learning_rate': 3.090745051763944e-05, 'epoch': 16.84}
- 34%|███▎      | 1954/5800 [5:27:29<7:22:27,  6.90s/it]score1 tensor([[0.4941],
-        [0.5352],
-        [0.5352],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5469, 0.5391, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:06,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 14:14:06,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.92 | bwd_microstep: 4636.77 | bwd_inner_microstep: 4631.98 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.09
-[2025-01-25 14:14:06,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.88 | bwd: 4636.79 | bwd_inner: 4631.98 | bwd_allreduce: 4.75 | step: 43.10
- 34%|███▎      | 1955/5800 [5:27:36<7:22:41,  6.91s/it]                                                       {'loss': 0.0415, 'grad_norm': 0.5120006799697876, 'learning_rate': 3.089808776739089e-05, 'epoch': 16.85}
- 34%|███▎      | 1955/5800 [5:27:36<7:22:41,  6.91s/it]score1 tensor([[0.4980],
-        [0.5469],
-        [0.4648],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.4473, 0.4375, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:13,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 14:14:13,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4629.12 | bwd_inner_microstep: 4622.97 | bwd_allreduce_microstep: 6.07 | step_microstep: 43.07
-[2025-01-25 14:14:13,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.01 | bwd: 4629.14 | bwd_inner: 4622.97 | bwd_allreduce: 6.11 | step: 43.08
- 34%|███▎      | 1956/5800 [5:27:43<7:22:33,  6.91s/it]                                                       {'loss': 0.0776, 'grad_norm': 4.454219818115234, 'learning_rate': 3.0888721618929474e-05, 'epoch': 16.86}
- 34%|███▎      | 1956/5800 [5:27:43<7:22:33,  6.91s/it]score1 tensor([[0.5156],
-        [0.5469],
-        [0.5273],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5625, 0.4961, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:20,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:14:20,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.52 | bwd_microstep: 4635.75 | bwd_inner_microstep: 4628.56 | bwd_allreduce_microstep: 7.06 | step_microstep: 48.36
-[2025-01-25 14:14:20,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.49 | bwd: 4635.78 | bwd_inner: 4628.56 | bwd_allreduce: 7.12 | step: 48.37
- 34%|███▎      | 1957/5800 [5:27:50<7:22:45,  6.91s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.4702533483505249, 'learning_rate': 3.087935207517572e-05, 'epoch': 16.87}
- 34%|███▎      | 1957/5800 [5:27:50<7:22:45,  6.91s/it]score1 tensor([[0.4844],
-        [0.5820],
-        [0.4668],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6562, 0.5352, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:26,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 14:14:26,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.06 | bwd_microstep: 4635.77 | bwd_inner_microstep: 4630.73 | bwd_allreduce_microstep: 4.93 | step_microstep: 44.50
-[2025-01-25 14:14:26,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.02 | bwd: 4635.80 | bwd_inner: 4630.73 | bwd_allreduce: 4.99 | step: 44.51
- 34%|███▍      | 1958/5800 [5:27:56<7:23:17,  6.92s/it]                                                       {'loss': 0.0601, 'grad_norm': 9.142373085021973, 'learning_rate': 3.0869979139051216e-05, 'epoch': 16.88}
- 34%|███▍      | 1958/5800 [5:27:56<7:23:17,  6.92s/it]score1 tensor([[0.4219],
-        [0.4824],
-        [0.4395],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5898, 0.4258, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:33,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 10.14 | optimizer_step: 4.46
-[2025-01-25 14:14:33,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.06 | bwd_microstep: 4630.15 | bwd_inner_microstep: 4622.19 | bwd_allreduce_microstep: 7.88 | step_microstep: 53.01
-[2025-01-25 14:14:33,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.01 | bwd: 4630.18 | bwd_inner: 4622.19 | bwd_allreduce: 7.92 | step: 53.02
- 34%|███▍      | 1959/5800 [5:28:03<7:23:35,  6.93s/it]                                                       {'loss': 0.0977, 'grad_norm': 4.42121696472168, 'learning_rate': 3.086060281347861e-05, 'epoch': 16.89}
- 34%|███▍      | 1959/5800 [5:28:03<7:23:35,  6.93s/it]score1 tensor([[0.4180],
-        [0.4746],
-        [0.5156],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6406, 0.5625, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0874, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:40,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 14:14:40,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.55 | bwd_microstep: 4631.35 | bwd_inner_microstep: 4625.87 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.56
-[2025-01-25 14:14:40,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.48 | bwd: 4631.37 | bwd_inner: 4625.87 | bwd_allreduce: 5.42 | step: 44.57
- 34%|███▍      | 1960/5800 [5:28:10<7:23:44,  6.93s/it]                                                       {'loss': 0.0874, 'grad_norm': 8.825788497924805, 'learning_rate': 3.08512231013816e-05, 'epoch': 16.9}
- 34%|███▍      | 1960/5800 [5:28:10<7:23:44,  6.93s/it]score1 tensor([[0.4297],
-        [0.5195],
-        [0.3770],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5352, 0.3652, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:47,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:14:47,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.94 | bwd_microstep: 4624.16 | bwd_inner_microstep: 4619.01 | bwd_allreduce_microstep: 5.07 | step_microstep: 47.85
-[2025-01-25 14:14:47,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.90 | bwd: 4624.18 | bwd_inner: 4619.01 | bwd_allreduce: 5.11 | step: 47.85
- 34%|███▍      | 1961/5800 [5:28:17<7:23:14,  6.93s/it]                                                       {'loss': 0.0093, 'grad_norm': 0.6862221956253052, 'learning_rate': 3.0841840005684936e-05, 'epoch': 16.91}
- 34%|███▍      | 1961/5800 [5:28:17<7:23:14,  6.93s/it]score1 tensor([[0.4570],
-        [0.4160],
-        [0.4707],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4727, 0.5508, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:14:54,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.59 | optimizer_step: 4.36
-[2025-01-25 14:14:54,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.68 | bwd_microstep: 4630.81 | bwd_inner_microstep: 4624.63 | bwd_allreduce_microstep: 6.03 | step_microstep: 59.90
-[2025-01-25 14:14:54,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.63 | bwd: 4630.86 | bwd_inner: 4624.63 | bwd_allreduce: 6.08 | step: 59.88
- 34%|███▍      | 1962/5800 [5:28:24<7:23:40,  6.94s/it]                                                       {'loss': 0.0801, 'grad_norm': 4.355982303619385, 'learning_rate': 3.0832453529314444e-05, 'epoch': 16.91}
- 34%|███▍      | 1962/5800 [5:28:24<7:23:40,  6.94s/it]score1 tensor([[0.4355],
-        [0.4648],
-        [0.3594],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6055, 0.4648, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:01,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.96 | optimizer_step: 4.37
-[2025-01-25 14:15:01,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.09 | bwd_microstep: 4640.35 | bwd_inner_microstep: 4632.71 | bwd_allreduce_microstep: 7.48 | step_microstep: 59.04
-[2025-01-25 14:15:01,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.06 | bwd: 4640.40 | bwd_inner: 4632.71 | bwd_allreduce: 7.56 | step: 59.02
- 34%|███▍      | 1963/5800 [5:28:31<7:24:27,  6.95s/it]                                                       {'loss': 0.1289, 'grad_norm': 8.24394416809082, 'learning_rate': 3.082306367519698e-05, 'epoch': 16.92}
- 34%|███▍      | 1963/5800 [5:28:31<7:24:27,  6.95s/it]score1 tensor([[0.5195],
-        [0.4297],
-        [0.4141],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5820, 0.4473, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0845, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:08,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 14:15:08,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.80 | bwd_microstep: 4634.46 | bwd_inner_microstep: 4629.19 | bwd_allreduce_microstep: 5.15 | step_microstep: 44.40
-[2025-01-25 14:15:08,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.76 | bwd: 4634.49 | bwd_inner: 4629.19 | bwd_allreduce: 5.20 | step: 44.41
- 34%|███▍      | 1964/5800 [5:28:38<7:23:53,  6.94s/it]                                                       {'loss': 0.0845, 'grad_norm': 8.570627212524414, 'learning_rate': 3.081367044626046e-05, 'epoch': 16.93}
- 34%|███▍      | 1964/5800 [5:28:38<7:23:53,  6.94s/it]score1 tensor([[0.4805],
-        [0.4961],
-        [0.4648],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5352, 0.4883, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:15,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 14:15:15,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.80 | bwd_microstep: 4640.83 | bwd_inner_microstep: 4634.98 | bwd_allreduce_microstep: 5.70 | step_microstep: 52.95
-[2025-01-25 14:15:15,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.76 | bwd: 4640.86 | bwd_inner: 4634.98 | bwd_allreduce: 5.80 | step: 52.96
- 34%|███▍      | 1965/5800 [5:28:45<7:23:31,  6.94s/it]                                                       {'loss': 0.0444, 'grad_norm': 8.930054664611816, 'learning_rate': 3.0804273845433856e-05, 'epoch': 16.94}
- 34%|███▍      | 1965/5800 [5:28:45<7:23:31,  6.94s/it]score1 tensor([[0.5938],
-        [0.5820],
-        [0.5273],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6133, 0.5312, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:22,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 14:15:22,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.88 | bwd_microstep: 4630.13 | bwd_inner_microstep: 4624.67 | bwd_allreduce_microstep: 5.37 | step_microstep: 48.00
-[2025-01-25 14:15:22,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.84 | bwd: 4630.16 | bwd_inner: 4624.67 | bwd_allreduce: 5.42 | step: 48.02
- 34%|███▍      | 1966/5800 [5:28:52<7:22:56,  6.93s/it]                                                       {'loss': 0.0298, 'grad_norm': 4.437926769256592, 'learning_rate': 3.079487387564721e-05, 'epoch': 16.95}
- 34%|███▍      | 1966/5800 [5:28:52<7:22:56,  6.93s/it]score1 tensor([[0.5234],
-        [0.6055],
-        [0.5820],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6875, 0.4941, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0669, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:29,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:15:29,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.55 | bwd_microstep: 4636.73 | bwd_inner_microstep: 4631.70 | bwd_allreduce_microstep: 4.93 | step_microstep: 44.25
-[2025-01-25 14:15:29,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.51 | bwd: 4636.75 | bwd_inner: 4631.70 | bwd_allreduce: 4.98 | step: 44.26
- 34%|███▍      | 1967/5800 [5:28:59<7:22:34,  6.93s/it]                                                       {'loss': 0.0669, 'grad_norm': 0.7678787708282471, 'learning_rate': 3.078547053983157e-05, 'epoch': 16.96}
- 34%|███▍      | 1967/5800 [5:28:59<7:22:34,  6.93s/it]score1 tensor([[0.6562],
-        [0.6172],
-        [0.5039],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4395, 0.4160, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:36,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 14:15:36,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.72 | bwd_microstep: 4636.09 | bwd_inner_microstep: 4630.72 | bwd_allreduce_microstep: 5.26 | step_microstep: 42.18
-[2025-01-25 14:15:36,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.68 | bwd: 4636.11 | bwd_inner: 4630.72 | bwd_allreduce: 5.31 | step: 42.17
- 34%|███▍      | 1968/5800 [5:29:06<7:22:21,  6.93s/it]                                                       {'loss': 0.1094, 'grad_norm': 9.94503402709961, 'learning_rate': 3.077606384091908e-05, 'epoch': 16.97}
- 34%|███▍      | 1968/5800 [5:29:06<7:22:21,  6.93s/it]score1 tensor([[0.5117],
-        [0.6328],
-        [0.6875],
-        [0.7109]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.5586, 0.5430, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:43,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.45 | optimizer_step: 4.37
-[2025-01-25 14:15:43,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.44 | bwd_microstep: 4634.21 | bwd_inner_microstep: 4627.44 | bwd_allreduce_microstep: 6.62 | step_microstep: 51.93
-[2025-01-25 14:15:43,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.40 | bwd: 4634.25 | bwd_inner: 4627.44 | bwd_allreduce: 6.70 | step: 51.94
- 34%|███▍      | 1969/5800 [5:29:13<7:22:27,  6.93s/it]                                                       {'loss': 0.1367, 'grad_norm': 10.261914253234863, 'learning_rate': 3.07666537818429e-05, 'epoch': 16.97}
- 34%|███▍      | 1969/5800 [5:29:13<7:22:27,  6.93s/it]score1 tensor([[0.6523],
-        [0.6523],
-        [0.5781],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4980, 0.4492, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:50,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:15:50,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.83 | bwd_microstep: 4633.26 | bwd_inner_microstep: 4625.98 | bwd_allreduce_microstep: 7.20 | step_microstep: 45.09
-[2025-01-25 14:15:50,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.79 | bwd: 4633.29 | bwd_inner: 4625.98 | bwd_allreduce: 7.24 | step: 45.13
- 34%|███▍      | 1970/5800 [5:29:20<7:22:20,  6.93s/it]                                                       {'loss': 0.1621, 'grad_norm': 9.985322952270508, 'learning_rate': 3.075724036553726e-05, 'epoch': 16.98}
- 34%|███▍      | 1970/5800 [5:29:20<7:22:20,  6.93s/it]score1 tensor([[0.6211],
-        [0.6836],
-        [0.6875],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4297, 0.5234, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1699, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:15:57,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 14:15:57,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.90 | bwd_microstep: 4640.04 | bwd_inner_microstep: 4634.88 | bwd_allreduce_microstep: 5.04 | step_microstep: 50.53
-[2025-01-25 14:15:57,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.87 | bwd: 4640.08 | bwd_inner: 4634.88 | bwd_allreduce: 5.11 | step: 50.53
- 34%|███▍      | 1971/5800 [5:29:27<7:22:26,  6.93s/it]                                                       {'loss': 0.1699, 'grad_norm': 10.397711753845215, 'learning_rate': 3.074782359493741e-05, 'epoch': 16.99}
- 34%|███▍      | 1971/5800 [5:29:27<7:22:26,  6.93s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.7461]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:16:03,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.53 | optimizer_step: 4.36
-[2025-01-25 14:16:03,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 588.54 | bwd_microstep: 1232.06 | bwd_inner_microstep: 1224.63 | bwd_allreduce_microstep: 7.18 | step_microstep: 67.80
-[2025-01-25 14:16:03,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 588.43 | bwd: 1232.12 | bwd_inner: 1224.63 | bwd_allreduce: 7.32 | step: 67.77
- 34%|███▍      | 1972/5800 [5:29:33<7:02:54,  6.63s/it]                                                       {'loss': 0.1328, 'grad_norm': 11.480481147766113, 'learning_rate': 3.073840347297968e-05, 'epoch': 17.0}
- 34%|███▍      | 1972/5800 [5:29:33<7:02:54,  6.63s/it][2025-01-25 14:16:08,101] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:16:20,805] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:16:31,092] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:16:42,369] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4414],
-        [0.6445],
-        [0.6094],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5781, 0.5195, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0903, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:02,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 14:17:02,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.01 | bwd_microstep: 4571.07 | bwd_inner_microstep: 4564.28 | bwd_allreduce_microstep: 6.68 | step_microstep: 49.02
-[2025-01-25 14:17:02,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.84 | bwd: 4571.10 | bwd_inner: 4564.28 | bwd_allreduce: 6.75 | step: 49.03
- 34%|███▍      | 1973/5800 [5:30:32<23:44:41, 22.34s/it]                                                        {'loss': 0.0903, 'grad_norm': 10.012941360473633, 'learning_rate': 3.0728980002601416e-05, 'epoch': 17.01}
- 34%|███▍      | 1973/5800 [5:30:32<23:44:41, 22.34s/it]score1 tensor([[0.5547],
-        [0.5352],
-        [0.6562],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3887, 0.5312, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1021, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:08,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 14:17:08,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.04 | bwd_microstep: 4584.22 | bwd_inner_microstep: 4578.68 | bwd_allreduce_microstep: 5.39 | step_microstep: 45.78
-[2025-01-25 14:17:08,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.00 | bwd: 4584.24 | bwd_inner: 4578.68 | bwd_allreduce: 5.49 | step: 45.79
- 34%|███▍      | 1974/5800 [5:30:38<18:47:59, 17.69s/it]                                                        {'loss': 0.1021, 'grad_norm': 9.774712562561035, 'learning_rate': 3.0719553186741025e-05, 'epoch': 17.02}
- 34%|███▍      | 1974/5800 [5:30:38<18:47:59, 17.69s/it]score1 tensor([[0.5781],
-        [0.5469],
-        [0.6172],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4062, 0.6016, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0864, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:15,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 14:17:15,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.21 | bwd_microstep: 4580.78 | bwd_inner_microstep: 4575.51 | bwd_allreduce_microstep: 5.19 | step_microstep: 44.11
-[2025-01-25 14:17:15,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.18 | bwd: 4580.80 | bwd_inner: 4575.51 | bwd_allreduce: 5.23 | step: 44.12
- 34%|███▍      | 1975/5800 [5:30:45<15:20:09, 14.43s/it]                                                        {'loss': 0.0864, 'grad_norm': 9.739381790161133, 'learning_rate': 3.071012302833795e-05, 'epoch': 17.03}
- 34%|███▍      | 1975/5800 [5:30:45<15:20:09, 14.43s/it]score1 tensor([[0.5742],
-        [0.4316],
-        [0.6836],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.3945, 0.6250, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:22,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 14:17:22,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.49 | bwd_microstep: 4597.92 | bwd_inner_microstep: 4592.67 | bwd_allreduce_microstep: 5.17 | step_microstep: 43.31
-[2025-01-25 14:17:22,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.43 | bwd: 4597.94 | bwd_inner: 4592.67 | bwd_allreduce: 5.21 | step: 43.31
- 34%|███▍      | 1976/5800 [5:30:52<12:55:08, 12.16s/it]                                                        {'loss': 0.0386, 'grad_norm': 9.54697322845459, 'learning_rate': 3.0700689530332674e-05, 'epoch': 17.03}
- 34%|███▍      | 1976/5800 [5:30:52<12:55:08, 12.16s/it]score1 tensor([[0.6250],
-        [0.4844],
-        [0.3926],
-        [0.7070]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.4629, 0.4785, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:29,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 14:17:29,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.38 | bwd_microstep: 4588.96 | bwd_inner_microstep: 4583.57 | bwd_allreduce_microstep: 5.29 | step_microstep: 43.90
-[2025-01-25 14:17:29,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.33 | bwd: 4588.99 | bwd_inner: 4583.57 | bwd_allreduce: 5.34 | step: 43.90
- 34%|███▍      | 1977/5800 [5:30:59<11:13:35, 10.57s/it]                                                        {'loss': 0.0513, 'grad_norm': 0.9709947109222412, 'learning_rate': 3.069125269566673e-05, 'epoch': 17.04}
- 34%|███▍      | 1977/5800 [5:30:59<11:13:35, 10.57s/it]score1 tensor([[0.6289],
-        [0.3594],
-        [0.6016],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4727, 0.5781, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:36,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:17:36,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.27 | bwd_microstep: 4582.40 | bwd_inner_microstep: 4576.99 | bwd_allreduce_microstep: 5.31 | step_microstep: 46.18
-[2025-01-25 14:17:36,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.23 | bwd: 4582.44 | bwd_inner: 4576.99 | bwd_allreduce: 5.37 | step: 46.17
- 34%|███▍      | 1978/5800 [5:31:06<10:02:09,  9.45s/it]                                                        {'loss': 0.0703, 'grad_norm': 5.553532600402832, 'learning_rate': 3.0681812527282686e-05, 'epoch': 17.05}
- 34%|███▍      | 1978/5800 [5:31:06<10:02:09,  9.45s/it]score1 tensor([[0.4727],
-        [0.3164],
-        [0.3535],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.3789, 0.4512, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:43,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 14:17:43,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.08 | bwd_microstep: 4582.45 | bwd_inner_microstep: 4577.53 | bwd_allreduce_microstep: 4.82 | step_microstep: 53.28
-[2025-01-25 14:17:43,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.05 | bwd: 4582.47 | bwd_inner: 4577.53 | bwd_allreduce: 4.88 | step: 53.29
- 34%|███▍      | 1979/5800 [5:31:13<9:12:39,  8.68s/it]                                                        {'loss': 0.0664, 'grad_norm': 3.688051223754883, 'learning_rate': 3.067236902812415e-05, 'epoch': 17.06}
- 34%|███▍      | 1979/5800 [5:31:13<9:12:39,  8.68s/it]score1 tensor([[0.4004],
-        [0.5000],
-        [0.5117],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5508, 0.6016, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:50,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 14:17:50,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.58 | bwd_microstep: 4585.70 | bwd_inner_microstep: 4579.61 | bwd_allreduce_microstep: 6.00 | step_microstep: 64.96
-[2025-01-25 14:17:50,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.52 | bwd: 4585.73 | bwd_inner: 4579.61 | bwd_allreduce: 6.05 | step: 65.00
- 34%|███▍      | 1980/5800 [5:31:20<8:38:42,  8.15s/it]                                                       {'loss': 0.0723, 'grad_norm': 4.412917137145996, 'learning_rate': 3.0662922201135774e-05, 'epoch': 17.07}
- 34%|███▍      | 1980/5800 [5:31:20<8:38:42,  8.15s/it]score1 tensor([[0.4062],
-        [0.2930],
-        [0.4141],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4668, 0.4648, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1240, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:17:56,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.22 | optimizer_step: 4.37
-[2025-01-25 14:17:56,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.72 | bwd_microstep: 4599.61 | bwd_inner_microstep: 4588.06 | bwd_allreduce_microstep: 11.47 | step_microstep: 35.91
-[2025-01-25 14:17:56,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.64 | bwd: 4599.63 | bwd_inner: 4588.06 | bwd_allreduce: 11.51 | step: 35.91
- 34%|███▍      | 1981/5800 [5:31:26<8:14:33,  7.77s/it]                                                       {'loss': 0.124, 'grad_norm': 7.934030532836914, 'learning_rate': 3.065347204926323e-05, 'epoch': 17.08}
- 34%|███▍      | 1981/5800 [5:31:26<8:14:33,  7.77s/it]score1 tensor([[0.5195],
-        [0.4863],
-        [0.4258],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5391, 0.3750, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:03,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 14:18:03,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.91 | bwd_microstep: 4600.47 | bwd_inner_microstep: 4595.02 | bwd_allreduce_microstep: 5.35 | step_microstep: 43.12
-[2025-01-25 14:18:03,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.87 | bwd: 4600.50 | bwd_inner: 4595.01 | bwd_allreduce: 5.40 | step: 43.12
- 34%|███▍      | 1982/5800 [5:31:33<7:57:22,  7.50s/it]                                                       {'loss': 0.0552, 'grad_norm': 4.729247093200684, 'learning_rate': 3.0644018575453255e-05, 'epoch': 17.09}
- 34%|███▍      | 1982/5800 [5:31:33<7:57:22,  7.50s/it]score1 tensor([[0.4238],
-        [0.4590],
-        [0.4473],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4453, 0.6289, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0991, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:10,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 14:18:10,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.50 | bwd_microstep: 4595.63 | bwd_inner_microstep: 4590.13 | bwd_allreduce_microstep: 5.39 | step_microstep: 40.87
-[2025-01-25 14:18:10,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.46 | bwd: 4595.66 | bwd_inner: 4590.13 | bwd_allreduce: 5.45 | step: 40.88
- 34%|███▍      | 1983/5800 [5:31:40<7:45:27,  7.32s/it]                                                       {'loss': 0.0991, 'grad_norm': 4.136173725128174, 'learning_rate': 3.06345617826536e-05, 'epoch': 17.09}
- 34%|███▍      | 1983/5800 [5:31:40<7:45:27,  7.32s/it]score1 tensor([[0.4512],
-        [0.3887],
-        [0.5312],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.3945, 0.5469, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:17,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:18:17,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.77 | bwd_microstep: 4598.26 | bwd_inner_microstep: 4593.02 | bwd_allreduce_microstep: 5.13 | step_microstep: 48.62
-[2025-01-25 14:18:17,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.72 | bwd: 4598.28 | bwd_inner: 4593.02 | bwd_allreduce: 5.18 | step: 48.63
- 34%|███▍      | 1984/5800 [5:31:47<7:36:50,  7.18s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.831923484802246, 'learning_rate': 3.0625101673813045e-05, 'epoch': 17.1}
- 34%|███▍      | 1984/5800 [5:31:47<7:36:50,  7.18s/it]score1 tensor([[0.3477],
-        [0.5273],
-        [0.5430],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.6133, 0.5312, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:24,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 14:18:24,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.56 | bwd_microstep: 4603.37 | bwd_inner_microstep: 4598.33 | bwd_allreduce_microstep: 4.95 | step_microstep: 46.73
-[2025-01-25 14:18:24,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.53 | bwd: 4603.39 | bwd_inner: 4598.33 | bwd_allreduce: 4.99 | step: 46.74
- 34%|███▍      | 1985/5800 [5:31:54<7:30:58,  7.09s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.163885116577148, 'learning_rate': 3.061563825188144e-05, 'epoch': 17.11}
- 34%|███▍      | 1985/5800 [5:31:54<7:30:58,  7.09s/it]score1 tensor([[0.4199],
-        [0.5664],
-        [0.4531],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5078, 0.4922, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:31,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 14:18:31,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.02 | bwd_microstep: 4618.46 | bwd_inner_microstep: 4613.04 | bwd_allreduce_microstep: 5.33 | step_microstep: 44.85
-[2025-01-25 14:18:31,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.97 | bwd: 4618.48 | bwd_inner: 4613.04 | bwd_allreduce: 5.38 | step: 44.86
- 34%|███▍      | 1986/5800 [5:32:01<7:27:08,  7.03s/it]                                                       {'loss': 0.0352, 'grad_norm': 0.6321960091590881, 'learning_rate': 3.060617151980962e-05, 'epoch': 17.12}
- 34%|███▍      | 1986/5800 [5:32:01<7:27:08,  7.03s/it]score1 tensor([[0.4570],
-        [0.5195],
-        [0.5859],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5039, 0.5547, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:38,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 14:18:38,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.57 | bwd_microstep: 4624.89 | bwd_inner_microstep: 4619.75 | bwd_allreduce_microstep: 5.05 | step_microstep: 45.85
-[2025-01-25 14:18:38,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4624.91 | bwd_inner: 4619.75 | bwd_allreduce: 5.09 | step: 45.86
- 34%|███▍      | 1987/5800 [5:32:08<7:24:30,  6.99s/it]                                                       {'loss': 0.0317, 'grad_norm': 4.712921619415283, 'learning_rate': 3.059670148054949e-05, 'epoch': 17.13}
- 34%|███▍      | 1987/5800 [5:32:08<7:24:30,  6.99s/it]score1 tensor([[0.4824],
-        [0.5391],
-        [0.5469],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6055, 0.5703, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:45,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 14:18:45,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.59 | bwd_microstep: 4637.98 | bwd_inner_microstep: 4633.07 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.44
-[2025-01-25 14:18:45,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.56 | bwd: 4638.01 | bwd_inner: 4633.07 | bwd_allreduce: 4.87 | step: 45.46
- 34%|███▍      | 1988/5800 [5:32:15<7:22:48,  6.97s/it]                                                       {'loss': 0.0327, 'grad_norm': 4.401895046234131, 'learning_rate': 3.058722813705397e-05, 'epoch': 17.14}
- 34%|███▍      | 1988/5800 [5:32:15<7:22:48,  6.97s/it]score1 tensor([[0.6094],
-        [0.5781],
-        [0.4199],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6641, 0.4648, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:52,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 14:18:52,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.00 | bwd_microstep: 4630.41 | bwd_inner_microstep: 4624.17 | bwd_allreduce_microstep: 6.15 | step_microstep: 49.97
-[2025-01-25 14:18:52,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.97 | bwd: 4630.43 | bwd_inner: 4624.17 | bwd_allreduce: 6.20 | step: 49.98
- 34%|███▍      | 1989/5800 [5:32:22<7:22:00,  6.96s/it]                                                       {'loss': 0.0381, 'grad_norm': 4.996274948120117, 'learning_rate': 3.057775149227701e-05, 'epoch': 17.15}
- 34%|███▍      | 1989/5800 [5:32:22<7:22:00,  6.96s/it]score1 tensor([[0.5586],
-        [0.5234],
-        [0.6641],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5000, 0.6133, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:18:59,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 14:18:59,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.79 | bwd_microstep: 4633.64 | bwd_inner_microstep: 4624.11 | bwd_allreduce_microstep: 9.31 | step_microstep: 53.93
-[2025-01-25 14:18:59,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.74 | bwd: 4633.71 | bwd_inner: 4624.11 | bwd_allreduce: 9.43 | step: 53.91
- 34%|███▍      | 1990/5800 [5:32:29<7:21:54,  6.96s/it]                                                       {'loss': 0.0244, 'grad_norm': 9.579557418823242, 'learning_rate': 3.0568271549173605e-05, 'epoch': 17.16}
- 34%|███▍      | 1990/5800 [5:32:29<7:21:54,  6.96s/it]score1 tensor([[0.6094],
-        [0.4688],
-        [0.5039],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4707, 0.4004, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0635, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:06,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 14:19:06,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.79 | bwd_microstep: 4634.11 | bwd_inner_microstep: 4628.96 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.15
-[2025-01-25 14:19:06,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.71 | bwd: 4634.13 | bwd_inner: 4628.96 | bwd_allreduce: 5.10 | step: 44.15
- 34%|███▍      | 1991/5800 [5:32:36<7:21:40,  6.96s/it]                                                       {'loss': 0.0635, 'grad_norm': 4.785432815551758, 'learning_rate': 3.0558788310699745e-05, 'epoch': 17.16}
- 34%|███▍      | 1991/5800 [5:32:36<7:21:40,  6.96s/it]score1 tensor([[0.5508],
-        [0.5469],
-        [0.5234],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5625, 0.5039, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:12,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 14:19:12,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.15 | bwd_microstep: 4620.25 | bwd_inner_microstep: 4615.41 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.21
-[2025-01-25 14:19:12,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.11 | bwd: 4620.27 | bwd_inner: 4615.42 | bwd_allreduce: 4.79 | step: 45.22
- 34%|███▍      | 1992/5800 [5:32:42<7:20:28,  6.94s/it]                                                       {'loss': 0.0479, 'grad_norm': 0.4464320242404938, 'learning_rate': 3.0549301779812486e-05, 'epoch': 17.17}
- 34%|███▍      | 1992/5800 [5:32:42<7:20:28,  6.94s/it]score1 tensor([[0.5977],
-        [0.5469],
-        [0.5078],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4980, 0.4141, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:19,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.37
-[2025-01-25 14:19:19,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.21 | bwd_microstep: 4631.48 | bwd_inner_microstep: 4626.14 | bwd_allreduce_microstep: 5.19 | step_microstep: 49.80
-[2025-01-25 14:19:19,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.17 | bwd: 4631.51 | bwd_inner: 4626.14 | bwd_allreduce: 5.29 | step: 49.80
- 34%|███▍      | 1993/5800 [5:32:49<7:19:59,  6.93s/it]                                                       {'loss': 0.0679, 'grad_norm': 9.192214012145996, 'learning_rate': 3.053981195946988e-05, 'epoch': 17.18}
- 34%|███▍      | 1993/5800 [5:32:49<7:19:59,  6.93s/it]score1 tensor([[0.4980],
-        [0.4980],
-        [0.4766],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4746, 0.3730, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:26,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 14:19:26,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.79 | bwd_microstep: 4621.15 | bwd_inner_microstep: 4616.34 | bwd_allreduce_microstep: 4.72 | step_microstep: 47.64
-[2025-01-25 14:19:26,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.76 | bwd: 4621.17 | bwd_inner: 4616.34 | bwd_allreduce: 4.76 | step: 47.65
- 34%|███▍      | 1994/5800 [5:32:56<7:19:15,  6.92s/it]                                                       {'loss': 0.0386, 'grad_norm': 0.5785087943077087, 'learning_rate': 3.053031885263102e-05, 'epoch': 17.19}
- 34%|███▍      | 1994/5800 [5:32:56<7:19:15,  6.92s/it]score1 tensor([[0.5078],
-        [0.5703],
-        [0.4004],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6484, 0.4219, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:33,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:19:33,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.70 | bwd_microstep: 4624.16 | bwd_inner_microstep: 4619.24 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.34
-[2025-01-25 14:19:33,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.66 | bwd: 4624.18 | bwd_inner: 4619.24 | bwd_allreduce: 4.88 | step: 44.34
- 34%|███▍      | 1995/5800 [5:33:03<7:19:01,  6.92s/it]                                                       {'loss': 0.0415, 'grad_norm': 4.387633800506592, 'learning_rate': 3.052082246225603e-05, 'epoch': 17.2}
- 34%|███▍      | 1995/5800 [5:33:03<7:19:01,  6.92s/it]score1 tensor([[0.5977],
-        [0.5352],
-        [0.4512],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4824, 0.4629, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:40,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 14:19:40,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4584.94 | bwd_inner_microstep: 4580.05 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.45
-[2025-01-25 14:19:40,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.68 | bwd: 4584.97 | bwd_inner: 4580.05 | bwd_allreduce: 4.86 | step: 43.45
- 34%|███▍      | 1996/5800 [5:33:10<7:17:57,  6.91s/it]                                                       {'loss': 0.0288, 'grad_norm': 2.641781806945801, 'learning_rate': 3.051132279130604e-05, 'epoch': 17.21}
- 34%|███▍      | 1996/5800 [5:33:10<7:17:57,  6.91s/it]score1 tensor([[0.5430],
-        [0.4316],
-        [0.4277],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4844, 0.3750, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0928, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:47,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:19:47,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.52 | bwd_microstep: 4638.50 | bwd_inner_microstep: 4632.87 | bwd_allreduce_microstep: 5.52 | step_microstep: 44.07
-[2025-01-25 14:19:47,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.49 | bwd: 4638.53 | bwd_inner: 4632.87 | bwd_allreduce: 5.58 | step: 44.08
- 34%|███▍      | 1997/5800 [5:33:17<7:17:56,  6.91s/it]                                                       {'loss': 0.0928, 'grad_norm': 0.8198146820068359, 'learning_rate': 3.0501819842743225e-05, 'epoch': 17.22}
- 34%|███▍      | 1997/5800 [5:33:17<7:17:56,  6.91s/it]score1 tensor([[0.5273],
-        [0.4785],
-        [0.4727],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4512, 0.4551, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:19:54,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.75 | optimizer_step: 4.37
-[2025-01-25 14:19:54,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.67 | bwd_microstep: 4629.84 | bwd_inner_microstep: 4623.50 | bwd_allreduce_microstep: 6.17 | step_microstep: 52.14
-[2025-01-25 14:19:54,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.64 | bwd: 4629.86 | bwd_inner: 4623.50 | bwd_allreduce: 6.29 | step: 52.14
- 34%|███▍      | 1998/5800 [5:33:24<7:18:21,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 8.724920272827148, 'learning_rate': 3.0492313619530755e-05, 'epoch': 17.22}
- 34%|███▍      | 1998/5800 [5:33:24<7:18:21,  6.92s/it]score1 tensor([[0.4883],
-        [0.5742],
-        [0.4883],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.7031, 0.4570, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0791, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:01,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 14:20:01,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.77 | bwd_microstep: 4632.34 | bwd_inner_microstep: 4625.37 | bwd_allreduce_microstep: 6.79 | step_microstep: 74.93
-[2025-01-25 14:20:01,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.73 | bwd: 4632.40 | bwd_inner: 4625.37 | bwd_allreduce: 6.88 | step: 74.91
- 34%|███▍      | 1999/5800 [5:33:31<7:21:46,  6.97s/it]                                                       {'loss': 0.0791, 'grad_norm': 4.381897449493408, 'learning_rate': 3.0482804124632844e-05, 'epoch': 17.23}
- 34%|███▍      | 1999/5800 [5:33:31<7:21:46,  6.97s/it]score1 tensor([[0.5273],
-        [0.4941],
-        [0.5742],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4961, 0.5625, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:08,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.00 | optimizer_step: 4.36
-[2025-01-25 14:20:08,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.65 | bwd_microstep: 4627.80 | bwd_inner_microstep: 4622.81 | bwd_allreduce_microstep: 4.84 | step_microstep: 58.44
-[2025-01-25 14:20:08,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.60 | bwd: 4627.83 | bwd_inner: 4622.82 | bwd_allreduce: 4.92 | step: 58.45
- 34%|███▍      | 2000/5800 [5:33:38<7:21:23,  6.97s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.383726596832275, 'learning_rate': 3.0473291361014713e-05, 'epoch': 17.24}
- 34%|███▍      | 2000/5800 [5:33:38<7:21:23,  6.97s/it]score1 tensor([[0.5703],
-        [0.5547],
-        [0.4648],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5391, 0.4043, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:15,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 14:20:15,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.12 | bwd_microstep: 4625.29 | bwd_inner_microstep: 4620.73 | bwd_allreduce_microstep: 4.49 | step_microstep: 43.63
-[2025-01-25 14:20:15,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.08 | bwd: 4625.32 | bwd_inner: 4620.73 | bwd_allreduce: 4.53 | step: 43.63
- 34%|███▍      | 2001/5800 [5:33:45<7:19:56,  6.95s/it]                                                       {'loss': 0.0288, 'grad_norm': 0.5747715830802917, 'learning_rate': 3.04637753316426e-05, 'epoch': 17.25}
- 34%|███▍      | 2001/5800 [5:33:45<7:19:56,  6.95s/it]score1 tensor([[0.5391],
-        [0.3887],
-        [0.5078],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3418, 0.4648, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:22,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 14:20:22,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.50 | bwd_microstep: 4633.68 | bwd_inner_microstep: 4628.62 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.07
-[2025-01-25 14:20:22,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.47 | bwd: 4633.73 | bwd_inner: 4628.62 | bwd_allreduce: 5.01 | step: 44.08
- 35%|███▍      | 2002/5800 [5:33:52<7:19:00,  6.94s/it]                                                       {'loss': 0.0391, 'grad_norm': 4.166464805603027, 'learning_rate': 3.0454256039483784e-05, 'epoch': 17.26}
- 35%|███▍      | 2002/5800 [5:33:52<7:19:00,  6.94s/it]score1 tensor([[0.5469],
-        [0.5312],
-        [0.5430],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5664, 0.6055, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0713, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:29,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:20:29,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.46 | bwd_microstep: 4628.60 | bwd_inner_microstep: 4623.93 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.12
-[2025-01-25 14:20:29,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.43 | bwd: 4628.62 | bwd_inner: 4623.93 | bwd_allreduce: 4.62 | step: 43.13
- 35%|███▍      | 2003/5800 [5:33:59<7:18:22,  6.93s/it]                                                       {'loss': 0.0713, 'grad_norm': 9.159538269042969, 'learning_rate': 3.0444733487506544e-05, 'epoch': 17.27}
- 35%|███▍      | 2003/5800 [5:33:59<7:18:22,  6.93s/it]score1 tensor([[0.4531],
-        [0.4355],
-        [0.6055],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4043, 0.6797, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:36,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.45 | optimizer_step: 4.36
-[2025-01-25 14:20:36,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.44 | bwd_microstep: 4625.22 | bwd_inner_microstep: 4620.57 | bwd_allreduce_microstep: 4.54 | step_microstep: 43.39
-[2025-01-25 14:20:36,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.40 | bwd: 4625.24 | bwd_inner: 4620.57 | bwd_allreduce: 4.58 | step: 43.40
- 35%|███▍      | 2004/5800 [5:34:06<7:17:43,  6.92s/it]                                                       {'loss': 0.0493, 'grad_norm': 5.001017093658447, 'learning_rate': 3.0435207678680164e-05, 'epoch': 17.28}
- 35%|███▍      | 2004/5800 [5:34:06<7:17:43,  6.92s/it]score1 tensor([[0.5391],
-        [0.4414],
-        [0.4863],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4336, 0.4121, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:43,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 14:20:43,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.05 | bwd_microstep: 4625.62 | bwd_inner_microstep: 4620.68 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.46
-[2025-01-25 14:20:43,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.02 | bwd: 4625.64 | bwd_inner: 4620.68 | bwd_allreduce: 4.89 | step: 43.46
- 35%|███▍      | 2005/5800 [5:34:12<7:17:13,  6.91s/it]                                                       {'loss': 0.0376, 'grad_norm': 8.539640426635742, 'learning_rate': 3.0425678615974962e-05, 'epoch': 17.28}
- 35%|███▍      | 2005/5800 [5:34:12<7:17:13,  6.91s/it]score1 tensor([[0.5820],
-        [0.5391],
-        [0.5625],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5898, 0.5625, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:49,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 14:20:49,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.21 | bwd_microstep: 4581.72 | bwd_inner_microstep: 4576.79 | bwd_allreduce_microstep: 4.85 | step_microstep: 39.82
-[2025-01-25 14:20:49,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.17 | bwd: 4581.75 | bwd_inner: 4576.79 | bwd_allreduce: 4.90 | step: 39.82
- 35%|███▍      | 2006/5800 [5:34:19<7:16:03,  6.90s/it]                                                       {'loss': 0.0269, 'grad_norm': 6.5791802406311035, 'learning_rate': 3.041614630236227e-05, 'epoch': 17.29}
- 35%|███▍      | 2006/5800 [5:34:19<7:16:03,  6.90s/it]score1 tensor([[0.4980],
-        [0.5742],
-        [0.5156],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5508, 0.4941, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:20:56,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 14:20:56,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4643.03 | bwd_inner_microstep: 4637.10 | bwd_allreduce_microstep: 5.79 | step_microstep: 44.82
-[2025-01-25 14:20:56,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.92 | bwd: 4643.05 | bwd_inner: 4637.10 | bwd_allreduce: 5.87 | step: 44.84
- 35%|███▍      | 2007/5800 [5:34:26<7:16:25,  6.90s/it]                                                       {'loss': 0.0254, 'grad_norm': 4.563439846038818, 'learning_rate': 3.0406610740814418e-05, 'epoch': 17.3}
- 35%|███▍      | 2007/5800 [5:34:26<7:16:25,  6.90s/it]score1 tensor([[0.4980],
-        [0.5469],
-        [0.4824],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4297, 0.4980, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:03,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 14:21:03,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.71 | bwd_microstep: 4624.65 | bwd_inner_microstep: 4619.79 | bwd_allreduce_microstep: 4.78 | step_microstep: 44.65
-[2025-01-25 14:21:03,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.68 | bwd: 4624.68 | bwd_inner: 4619.79 | bwd_allreduce: 4.82 | step: 44.66
- 35%|███▍      | 2008/5800 [5:34:33<7:16:17,  6.90s/it]                                                       {'loss': 0.0425, 'grad_norm': 4.2960734367370605, 'learning_rate': 3.0397071934304773e-05, 'epoch': 17.31}
- 35%|███▍      | 2008/5800 [5:34:33<7:16:17,  6.90s/it]score1 tensor([[0.5742],
-        [0.6055],
-        [0.5391],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5039, 0.5156, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:10,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:21:10,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.23 | bwd_microstep: 4624.40 | bwd_inner_microstep: 4619.47 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.54
-[2025-01-25 14:21:10,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.19 | bwd: 4624.42 | bwd_inner: 4619.47 | bwd_allreduce: 4.88 | step: 45.54
- 35%|███▍      | 2009/5800 [5:34:40<7:16:06,  6.90s/it]                                                       {'loss': 0.04, 'grad_norm': 4.588974475860596, 'learning_rate': 3.0387529885807692e-05, 'epoch': 17.32}
- 35%|███▍      | 2009/5800 [5:34:40<7:16:06,  6.90s/it]score1 tensor([[0.5156],
-        [0.4336],
-        [0.5664],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3672, 0.4863, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:17,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 14:21:17,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.59 | bwd_microstep: 4625.04 | bwd_inner_microstep: 4620.40 | bwd_allreduce_microstep: 4.49 | step_microstep: 44.69
-[2025-01-25 14:21:17,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.56 | bwd: 4625.07 | bwd_inner: 4620.40 | bwd_allreduce: 4.60 | step: 44.71
- 35%|███▍      | 2010/5800 [5:34:47<7:15:59,  6.90s/it]                                                       {'loss': 0.0386, 'grad_norm': 0.5344634652137756, 'learning_rate': 3.0377984598298553e-05, 'epoch': 17.33}
- 35%|███▍      | 2010/5800 [5:34:47<7:15:59,  6.90s/it]score1 tensor([[0.5000],
-        [0.5508],
-        [0.5508],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5781, 0.5039, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:24,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 14:21:24,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.21 | bwd_microstep: 4624.79 | bwd_inner_microstep: 4620.07 | bwd_allreduce_microstep: 4.64 | step_microstep: 44.06
-[2025-01-25 14:21:24,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4624.82 | bwd_inner: 4620.07 | bwd_allreduce: 4.68 | step: 44.07
- 35%|███▍      | 2011/5800 [5:34:54<7:15:54,  6.90s/it]                                                       {'loss': 0.0547, 'grad_norm': 0.49113035202026367, 'learning_rate': 3.0368436074753745e-05, 'epoch': 17.34}
- 35%|███▍      | 2011/5800 [5:34:54<7:15:54,  6.90s/it]score1 tensor([[0.5352],
-        [0.4238],
-        [0.4961],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4414, 0.4473, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:31,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 14:21:31,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.95 | bwd_microstep: 4634.49 | bwd_inner_microstep: 4625.67 | bwd_allreduce_microstep: 8.52 | step_microstep: 69.75
-[2025-01-25 14:21:31,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.90 | bwd: 4634.56 | bwd_inner: 4625.67 | bwd_allreduce: 8.69 | step: 69.77
- 35%|███▍      | 2012/5800 [5:35:01<7:16:55,  6.92s/it]                                                       {'loss': 0.0605, 'grad_norm': 4.436122894287109, 'learning_rate': 3.0358884318150655e-05, 'epoch': 17.34}
- 35%|███▍      | 2012/5800 [5:35:01<7:16:55,  6.92s/it]score1 tensor([[0.5469],
-        [0.5469],
-        [0.5586],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4961, 0.6055, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:38,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 14:21:38,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.64 | bwd_microstep: 4630.65 | bwd_inner_microstep: 4625.99 | bwd_allreduce_microstep: 4.58 | step_microstep: 53.50
-[2025-01-25 14:21:38,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.61 | bwd: 4630.68 | bwd_inner: 4625.99 | bwd_allreduce: 4.62 | step: 53.50
- 35%|███▍      | 2013/5800 [5:35:08<7:17:17,  6.93s/it]                                                       {'loss': 0.0537, 'grad_norm': 0.4590035080909729, 'learning_rate': 3.0349329331467693e-05, 'epoch': 17.35}
- 35%|███▍      | 2013/5800 [5:35:08<7:17:17,  6.93s/it]score1 tensor([[0.4941],
-        [0.5859],
-        [0.5898],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5586, 0.5547, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:45,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:21:45,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.75 | bwd_microstep: 4634.86 | bwd_inner_microstep: 4630.26 | bwd_allreduce_microstep: 4.52 | step_microstep: 43.75
-[2025-01-25 14:21:45,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.72 | bwd: 4634.89 | bwd_inner: 4630.26 | bwd_allreduce: 4.56 | step: 43.75
- 35%|███▍      | 2014/5800 [5:35:15<7:16:58,  6.93s/it]                                                       {'loss': 0.0361, 'grad_norm': 9.198619842529297, 'learning_rate': 3.033977111768428e-05, 'epoch': 17.36}
- 35%|███▍      | 2014/5800 [5:35:15<7:16:58,  6.93s/it]score1 tensor([[0.5117],
-        [0.5469],
-        [0.5586],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4941, 0.5078, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:52,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 14:21:52,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.52 | bwd_microstep: 4627.66 | bwd_inner_microstep: 4622.67 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.26
-[2025-01-25 14:21:52,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.47 | bwd: 4627.68 | bwd_inner: 4622.67 | bwd_allreduce: 4.95 | step: 44.27
- 35%|███▍      | 2015/5800 [5:35:22<7:16:29,  6.92s/it]                                                       {'loss': 0.0425, 'grad_norm': 4.377598285675049, 'learning_rate': 3.0330209679780807e-05, 'epoch': 17.37}
- 35%|███▍      | 2015/5800 [5:35:22<7:16:29,  6.92s/it]score1 tensor([[0.5703],
-        [0.6094],
-        [0.5898],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6094, 0.5508, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:21:58,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 14:21:58,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.59 | bwd_microstep: 4588.24 | bwd_inner_microstep: 4583.16 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.93
-[2025-01-25 14:21:58,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.55 | bwd: 4588.26 | bwd_inner: 4583.16 | bwd_allreduce: 5.04 | step: 42.94
- 35%|███▍      | 2016/5800 [5:35:28<7:15:22,  6.90s/it]                                                       {'loss': 0.0317, 'grad_norm': 6.911086559295654, 'learning_rate': 3.0320645020738726e-05, 'epoch': 17.38}
- 35%|███▍      | 2016/5800 [5:35:28<7:15:22,  6.90s/it]score1 tensor([[0.4844],
-        [0.4707],
-        [0.4824],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4297, 0.4648, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:05,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 14:22:05,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.54 | bwd_microstep: 4625.17 | bwd_inner_microstep: 4619.57 | bwd_allreduce_microstep: 5.49 | step_microstep: 42.52
-[2025-01-25 14:22:05,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.50 | bwd: 4625.20 | bwd_inner: 4619.57 | bwd_allreduce: 5.54 | step: 42.54
- 35%|███▍      | 2017/5800 [5:35:35<7:15:17,  6.90s/it]                                                       {'loss': 0.0254, 'grad_norm': 8.543173789978027, 'learning_rate': 3.031107714354044e-05, 'epoch': 17.39}
- 35%|███▍      | 2017/5800 [5:35:35<7:15:17,  6.90s/it]score1 tensor([[0.5234],
-        [0.4980],
-        [0.5664],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5156, 0.6406, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:12,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:22:12,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.75 | bwd_microstep: 4623.64 | bwd_inner_microstep: 4618.50 | bwd_allreduce_microstep: 5.04 | step_microstep: 40.78
-[2025-01-25 14:22:12,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.70 | bwd: 4623.67 | bwd_inner: 4618.50 | bwd_allreduce: 5.09 | step: 40.79
- 35%|███▍      | 2018/5800 [5:35:42<7:15:04,  6.90s/it]                                                       {'loss': 0.0405, 'grad_norm': 8.9366455078125, 'learning_rate': 3.030150605116939e-05, 'epoch': 17.4}
- 35%|███▍      | 2018/5800 [5:35:42<7:15:04,  6.90s/it]score1 tensor([[0.5391],
-        [0.5195],
-        [0.5273],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4902, 0.6016, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:19,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:22:19,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4633.43 | bwd_inner_microstep: 4627.92 | bwd_allreduce_microstep: 5.41 | step_microstep: 45.29
-[2025-01-25 14:22:19,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.69 | bwd: 4633.46 | bwd_inner: 4627.92 | bwd_allreduce: 5.46 | step: 45.31
- 35%|███▍      | 2019/5800 [5:35:49<7:15:18,  6.91s/it]                                                       {'loss': 0.0591, 'grad_norm': 0.45768922567367554, 'learning_rate': 3.029193174661e-05, 'epoch': 17.41}
- 35%|███▍      | 2019/5800 [5:35:49<7:15:18,  6.91s/it]score1 tensor([[0.4219],
-        [0.5039],
-        [0.4844],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.6094, 0.4238, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0688, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:26,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.37
-[2025-01-25 14:22:26,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4635.72 | bwd_inner_microstep: 4630.34 | bwd_allreduce_microstep: 5.24 | step_microstep: 45.82
-[2025-01-25 14:22:26,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.44 | bwd: 4635.76 | bwd_inner: 4630.34 | bwd_allreduce: 5.28 | step: 45.86
- 35%|███▍      | 2020/5800 [5:35:56<7:15:24,  6.91s/it]                                                       {'loss': 0.0688, 'grad_norm': 4.237921714782715, 'learning_rate': 3.0282354232847715e-05, 'epoch': 17.41}
- 35%|███▍      | 2020/5800 [5:35:56<7:15:24,  6.91s/it]score1 tensor([[0.5117],
-        [0.5156],
-        [0.5312],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4844, 0.6055, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:33,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 14:22:33,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.45 | bwd_microstep: 4627.97 | bwd_inner_microstep: 4622.49 | bwd_allreduce_microstep: 5.37 | step_microstep: 50.48
-[2025-01-25 14:22:33,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.40 | bwd: 4627.99 | bwd_inner: 4622.49 | bwd_allreduce: 5.43 | step: 50.49
- 35%|███▍      | 2021/5800 [5:36:03<7:15:27,  6.91s/it]                                                       {'loss': 0.0601, 'grad_norm': 0.5673553347587585, 'learning_rate': 3.0272773512868968e-05, 'epoch': 17.42}
- 35%|███▍      | 2021/5800 [5:36:03<7:15:27,  6.91s/it]score1 tensor([[0.4922],
-        [0.5898],
-        [0.4922],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6094, 0.5195, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:40,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:22:40,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4580.50 | bwd_inner_microstep: 4573.15 | bwd_allreduce_microstep: 7.27 | step_microstep: 68.58
-[2025-01-25 14:22:40,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4580.52 | bwd_inner: 4573.15 | bwd_allreduce: 7.30 | step: 68.59
- 35%|███▍      | 2022/5800 [5:36:10<7:15:24,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 2.517062187194824, 'learning_rate': 3.0263189589661195e-05, 'epoch': 17.43}
- 35%|███▍      | 2022/5800 [5:36:10<7:15:24,  6.91s/it]score1 tensor([[0.4590],
-        [0.5117],
-        [0.3672],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5000, 0.3086, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:47,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:22:47,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.27 | bwd_microstep: 4638.30 | bwd_inner_microstep: 4632.94 | bwd_allreduce_microstep: 5.28 | step_microstep: 45.62
-[2025-01-25 14:22:47,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.24 | bwd: 4638.33 | bwd_inner: 4632.94 | bwd_allreduce: 5.32 | step: 45.63
- 35%|███▍      | 2023/5800 [5:36:17<7:16:17,  6.93s/it]                                                       {'loss': 0.0327, 'grad_norm': 4.158685684204102, 'learning_rate': 3.025360246621282e-05, 'epoch': 17.44}
- 35%|███▍      | 2023/5800 [5:36:17<7:16:17,  6.93s/it]score1 tensor([[0.5703],
-        [0.4199],
-        [0.5117],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4668, 0.5117, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:22:54,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 14:22:54,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.62 | bwd_microstep: 4580.79 | bwd_inner_microstep: 4575.73 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.19
-[2025-01-25 14:22:54,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.57 | bwd: 4580.81 | bwd_inner: 4575.73 | bwd_allreduce: 5.00 | step: 44.20
- 35%|███▍      | 2024/5800 [5:36:24<7:15:10,  6.91s/it]                                                       {'loss': 0.0454, 'grad_norm': 2.249478340148926, 'learning_rate': 3.024401214551328e-05, 'epoch': 17.45}
- 35%|███▍      | 2024/5800 [5:36:24<7:15:10,  6.91s/it]score1 tensor([[0.4805],
-        [0.3867],
-        [0.5156],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4688, 0.4980, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:01,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 14:23:01,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.95 | bwd_microstep: 4630.88 | bwd_inner_microstep: 4625.07 | bwd_allreduce_microstep: 5.72 | step_microstep: 46.68
-[2025-01-25 14:23:01,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.91 | bwd: 4630.91 | bwd_inner: 4625.07 | bwd_allreduce: 5.77 | step: 46.70
- 35%|███▍      | 2025/5800 [5:36:31<7:15:09,  6.92s/it]                                                       {'loss': 0.0439, 'grad_norm': 0.6977921724319458, 'learning_rate': 3.023441863055301e-05, 'epoch': 17.46}
- 35%|███▍      | 2025/5800 [5:36:31<7:15:09,  6.92s/it]score1 tensor([[0.5703],
-        [0.5625],
-        [0.4766],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5898, 0.4590, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:08,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:23:08,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4636.05 | bwd_inner_microstep: 4630.76 | bwd_allreduce_microstep: 5.22 | step_microstep: 45.70
-[2025-01-25 14:23:08,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.87 | bwd: 4636.08 | bwd_inner: 4630.76 | bwd_allreduce: 5.26 | step: 45.70
- 35%|███▍      | 2026/5800 [5:36:38<7:15:07,  6.92s/it]                                                       {'loss': 0.0591, 'grad_norm': 0.9524239301681519, 'learning_rate': 3.0224821924323423e-05, 'epoch': 17.47}
- 35%|███▍      | 2026/5800 [5:36:38<7:15:07,  6.92s/it]score1 tensor([[0.5312],
-        [0.5234],
-        [0.5859],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5391, 0.7070, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:15,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:23:15,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.43 | bwd_microstep: 4631.10 | bwd_inner_microstep: 4625.88 | bwd_allreduce_microstep: 5.15 | step_microstep: 43.39
-[2025-01-25 14:23:15,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.39 | bwd: 4631.13 | bwd_inner: 4625.88 | bwd_allreduce: 5.19 | step: 43.40
- 35%|███▍      | 2027/5800 [5:36:45<7:14:50,  6.92s/it]                                                       {'loss': 0.0591, 'grad_norm': 0.802555501461029, 'learning_rate': 3.0215222029816947e-05, 'epoch': 17.47}
- 35%|███▍      | 2027/5800 [5:36:45<7:14:50,  6.92s/it]score1 tensor([[0.5078],
-        [0.5000],
-        [0.4082],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6055, 0.4004, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:21,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 14:23:21,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.00 | bwd_microstep: 4631.27 | bwd_inner_microstep: 4626.16 | bwd_allreduce_microstep: 5.03 | step_microstep: 43.43
-[2025-01-25 14:23:21,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4631.29 | bwd_inner: 4626.16 | bwd_allreduce: 5.07 | step: 43.44
- 35%|███▍      | 2028/5800 [5:36:51<7:14:36,  6.91s/it]                                                       {'loss': 0.0449, 'grad_norm': 3.94882869720459, 'learning_rate': 3.0205618950026987e-05, 'epoch': 17.48}
- 35%|███▍      | 2028/5800 [5:36:51<7:14:36,  6.91s/it]score1 tensor([[0.4941],
-        [0.4688],
-        [0.4785],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4590, 0.4512, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:28,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 14:23:28,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.68 | bwd_microstep: 4633.76 | bwd_inner_microstep: 4628.22 | bwd_allreduce_microstep: 5.45 | step_microstep: 45.53
-[2025-01-25 14:23:28,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.65 | bwd: 4633.79 | bwd_inner: 4628.22 | bwd_allreduce: 5.50 | step: 45.54
- 35%|███▍      | 2029/5800 [5:36:58<7:14:35,  6.91s/it]                                                       {'loss': 0.0356, 'grad_norm': 8.570904731750488, 'learning_rate': 3.0196012687947962e-05, 'epoch': 17.49}
- 35%|███▍      | 2029/5800 [5:36:58<7:14:35,  6.91s/it]score1 tensor([[0.3711],
-        [0.5117],
-        [0.5352],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.6406, 0.4883, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0894, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:35,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 14:23:35,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.81 | bwd_microstep: 4634.19 | bwd_inner_microstep: 4628.91 | bwd_allreduce_microstep: 5.17 | step_microstep: 45.99
-[2025-01-25 14:23:35,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.78 | bwd: 4634.21 | bwd_inner: 4628.91 | bwd_allreduce: 5.21 | step: 46.01
- 35%|███▌      | 2030/5800 [5:37:05<7:14:48,  6.92s/it]                                                       {'loss': 0.0894, 'grad_norm': 4.044411659240723, 'learning_rate': 3.0186403246575263e-05, 'epoch': 17.5}
- 35%|███▌      | 2030/5800 [5:37:05<7:14:48,  6.92s/it]score1 tensor([[0.5078],
-        [0.6250],
-        [0.5781],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6250, 0.5781, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:42,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 14:23:42,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.93 | bwd_microstep: 4543.38 | bwd_inner_microstep: 4538.51 | bwd_allreduce_microstep: 4.76 | step_microstep: 41.64
-[2025-01-25 14:23:42,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.90 | bwd: 4543.40 | bwd_inner: 4538.51 | bwd_allreduce: 4.82 | step: 41.65
- 35%|███▌      | 2031/5800 [5:37:12<7:13:16,  6.90s/it]                                                       {'loss': 0.0132, 'grad_norm': 4.426170349121094, 'learning_rate': 3.0176790628905278e-05, 'epoch': 17.51}
- 35%|███▌      | 2031/5800 [5:37:12<7:13:16,  6.90s/it]score1 tensor([[0.4082],
-        [0.5273],
-        [0.5039],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4492, 0.5625, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:49,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.02 | optimizer_step: 4.36
-[2025-01-25 14:23:49,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.25 | bwd_microstep: 4641.03 | bwd_inner_microstep: 4635.96 | bwd_allreduce_microstep: 4.99 | step_microstep: 68.65
-[2025-01-25 14:23:49,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.17 | bwd: 4641.06 | bwd_inner: 4635.97 | bwd_allreduce: 5.03 | step: 68.66
- 35%|███▌      | 2032/5800 [5:37:19<7:14:46,  6.92s/it]                                                       {'loss': 0.0674, 'grad_norm': 4.233480453491211, 'learning_rate': 3.0167174837935397e-05, 'epoch': 17.52}
- 35%|███▌      | 2032/5800 [5:37:19<7:14:46,  6.92s/it]score1 tensor([[0.4590],
-        [0.6367],
-        [0.4219],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.6562, 0.4082, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:23:56,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 14:23:56,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.88 | bwd_microstep: 4635.49 | bwd_inner_microstep: 4629.74 | bwd_allreduce_microstep: 5.66 | step_microstep: 45.54
-[2025-01-25 14:23:56,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.81 | bwd: 4635.51 | bwd_inner: 4629.74 | bwd_allreduce: 5.71 | step: 45.55
- 35%|███▌      | 2033/5800 [5:37:26<7:14:42,  6.92s/it]                                                       {'loss': 0.0361, 'grad_norm': 1.2702573537826538, 'learning_rate': 3.0157555876663984e-05, 'epoch': 17.53}
- 35%|███▌      | 2033/5800 [5:37:26<7:14:42,  6.92s/it]score1 tensor([[0.4648],
-        [0.4414],
-        [0.4512],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4004, 0.4688, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:03,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 14:24:03,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.20 | bwd_microstep: 4634.80 | bwd_inner_microstep: 4628.81 | bwd_allreduce_microstep: 5.85 | step_microstep: 52.05
-[2025-01-25 14:24:03,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4634.83 | bwd_inner: 4628.81 | bwd_allreduce: 5.92 | step: 52.07
- 35%|███▌      | 2034/5800 [5:37:33<7:14:43,  6.93s/it]                                                       {'loss': 0.0283, 'grad_norm': 0.9596006274223328, 'learning_rate': 3.0147933748090402e-05, 'epoch': 17.53}
- 35%|███▌      | 2034/5800 [5:37:33<7:14:43,  6.93s/it]score1 tensor([[0.4902],
-        [0.3398],
-        [0.4863],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4258, 0.5234, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:10,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:24:10,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.90 | bwd_microstep: 4642.02 | bwd_inner_microstep: 4632.90 | bwd_allreduce_microstep: 9.03 | step_microstep: 43.06
-[2025-01-25 14:24:10,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.84 | bwd: 4642.05 | bwd_inner: 4632.90 | bwd_allreduce: 9.08 | step: 43.07
- 35%|███▌      | 2035/5800 [5:37:40<7:14:37,  6.93s/it]                                                       {'loss': 0.0464, 'grad_norm': 8.666484832763672, 'learning_rate': 3.0138308455215e-05, 'epoch': 17.54}
- 35%|███▌      | 2035/5800 [5:37:40<7:14:37,  6.93s/it]score1 tensor([[0.3320],
-        [0.3066],
-        [0.5547],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.3516, 0.5625, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:17,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:24:17,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.22 | bwd_microstep: 4625.30 | bwd_inner_microstep: 4620.53 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.58
-[2025-01-25 14:24:17,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.18 | bwd: 4625.32 | bwd_inner: 4620.52 | bwd_allreduce: 4.73 | step: 43.59
- 35%|███▌      | 2036/5800 [5:37:47<7:14:00,  6.92s/it]                                                       {'loss': 0.0376, 'grad_norm': 7.930599689483643, 'learning_rate': 3.0128680001039093e-05, 'epoch': 17.55}
- 35%|███▌      | 2036/5800 [5:37:47<7:14:00,  6.92s/it]score1 tensor([[0.6602],
-        [0.5000],
-        [0.4492],
-        [0.3281]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4863, 0.4766, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:24,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 14:24:24,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.93 | bwd_microstep: 4633.17 | bwd_inner_microstep: 4625.66 | bwd_allreduce_microstep: 5.18 | step_microstep: 46.79
-[2025-01-25 14:24:24,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4633.20 | bwd_inner: 4625.67 | bwd_allreduce: 7.44 | step: 46.80
- 35%|███▌      | 2037/5800 [5:37:54<7:13:46,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 1.4146108627319336, 'learning_rate': 3.0119048388565023e-05, 'epoch': 17.56}
- 35%|███▌      | 2037/5800 [5:37:54<7:13:46,  6.92s/it]score1 tensor([[0.4180],
-        [0.5000],
-        [0.4727],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4785, 0.5352, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:31,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 14:24:31,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.08 | bwd_microstep: 4637.36 | bwd_inner_microstep: 4632.25 | bwd_allreduce_microstep: 5.03 | step_microstep: 43.54
-[2025-01-25 14:24:31,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.05 | bwd: 4637.39 | bwd_inner: 4632.25 | bwd_allreduce: 5.06 | step: 43.55
- 35%|███▌      | 2038/5800 [5:38:01<7:13:44,  6.92s/it]                                                       {'loss': 0.0527, 'grad_norm': 0.8534650206565857, 'learning_rate': 3.010941362079608e-05, 'epoch': 17.57}
- 35%|███▌      | 2038/5800 [5:38:01<7:13:44,  6.92s/it]score1 tensor([[0.4746],
-        [0.6289],
-        [0.4941],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.6875, 0.4844, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:38,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 14:24:38,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.02 | bwd_microstep: 4629.56 | bwd_inner_microstep: 4624.22 | bwd_allreduce_microstep: 5.22 | step_microstep: 49.47
-[2025-01-25 14:24:38,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4629.58 | bwd_inner: 4624.22 | bwd_allreduce: 5.28 | step: 49.52
- 35%|███▌      | 2039/5800 [5:38:08<7:13:41,  6.92s/it]                                                       {'loss': 0.0454, 'grad_norm': 0.6947988271713257, 'learning_rate': 3.0099775700736562e-05, 'epoch': 17.58}
- 35%|███▌      | 2039/5800 [5:38:08<7:13:41,  6.92s/it]score1 tensor([[0.3711],
-        [0.4102],
-        [0.5781],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4492, 0.5547, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:45,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.58 | optimizer_step: 4.62
-[2025-01-25 14:24:45,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.03 | bwd_microstep: 4641.63 | bwd_inner_microstep: 4629.06 | bwd_allreduce_microstep: 12.00 | step_microstep: 67.24
-[2025-01-25 14:24:45,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.99 | bwd: 4641.71 | bwd_inner: 4629.06 | bwd_allreduce: 12.35 | step: 67.19
- 35%|███▌      | 2040/5800 [5:38:15<7:14:10,  6.93s/it]                                                       {'loss': 0.0181, 'grad_norm': 3.6621882915496826, 'learning_rate': 3.009013463139173e-05, 'epoch': 17.59}
- 35%|███▌      | 2040/5800 [5:38:15<7:14:10,  6.93s/it]score1 tensor([[0.5273],
-        [0.5312],
-        [0.4355],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5625, 0.4863, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:51,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 14:24:51,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.25 | bwd_microstep: 4637.94 | bwd_inner_microstep: 4632.88 | bwd_allreduce_microstep: 4.96 | step_microstep: 68.41
-[2025-01-25 14:24:51,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.21 | bwd: 4637.97 | bwd_inner: 4632.88 | bwd_allreduce: 5.01 | step: 68.46
- 35%|███▌      | 2041/5800 [5:38:21<7:14:43,  6.94s/it]                                                       {'loss': 0.043, 'grad_norm': 8.782501220703125, 'learning_rate': 3.0080490415767847e-05, 'epoch': 17.59}
- 35%|███▌      | 2041/5800 [5:38:21<7:14:43,  6.94s/it]score1 tensor([[0.5859],
-        [0.6055],
-        [0.6094],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6172, 0.5508, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:24:58,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 14:24:58,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.50 | bwd_microstep: 4628.00 | bwd_inner_microstep: 4622.38 | bwd_allreduce_microstep: 5.54 | step_microstep: 44.32
-[2025-01-25 14:24:58,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.47 | bwd: 4628.03 | bwd_inner: 4622.38 | bwd_allreduce: 5.58 | step: 44.33
- 35%|███▌      | 2042/5800 [5:38:28<7:14:05,  6.93s/it]                                                       {'loss': 0.0269, 'grad_norm': 0.8062969446182251, 'learning_rate': 3.0070843056872134e-05, 'epoch': 17.6}
- 35%|███▌      | 2042/5800 [5:38:28<7:14:05,  6.93s/it]score1 tensor([[0.6641],
-        [0.6758],
-        [0.5938],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5664, 0.5977, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:05,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:25:05,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.54 | bwd_microstep: 4630.51 | bwd_inner_microstep: 4625.54 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.77
-[2025-01-25 14:25:05,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.47 | bwd: 4630.53 | bwd_inner: 4625.54 | bwd_allreduce: 4.92 | step: 43.78
- 35%|███▌      | 2043/5800 [5:38:35<7:13:42,  6.93s/it]                                                       {'loss': 0.0781, 'grad_norm': 4.974067687988281, 'learning_rate': 3.0061192557712812e-05, 'epoch': 17.61}
- 35%|███▌      | 2043/5800 [5:38:35<7:13:42,  6.93s/it]score1 tensor([[0.6602],
-        [0.5859],
-        [0.6719],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4941, 0.5742, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1201, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:12,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 14:25:12,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.91 | bwd_microstep: 4637.16 | bwd_inner_microstep: 4632.04 | bwd_allreduce_microstep: 5.03 | step_microstep: 45.31
-[2025-01-25 14:25:12,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.88 | bwd: 4637.19 | bwd_inner: 4632.03 | bwd_allreduce: 5.08 | step: 45.32
- 35%|███▌      | 2044/5800 [5:38:42<7:13:42,  6.93s/it]                                                       {'loss': 0.1201, 'grad_norm': 10.17710018157959, 'learning_rate': 3.0051538921299074e-05, 'epoch': 17.62}
- 35%|███▌      | 2044/5800 [5:38:42<7:13:42,  6.93s/it]score1 tensor([[0.5117],
-        [0.7031],
-        [0.5273],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6133, 0.4805, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:19,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:25:19,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.55 | bwd_microstep: 4641.55 | bwd_inner_microstep: 4635.94 | bwd_allreduce_microstep: 5.49 | step_microstep: 49.11
-[2025-01-25 14:25:19,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.51 | bwd: 4641.57 | bwd_inner: 4635.94 | bwd_allreduce: 5.55 | step: 49.12
- 35%|███▌      | 2045/5800 [5:38:49<7:13:41,  6.93s/it]                                                       {'loss': 0.0737, 'grad_norm': 9.281943321228027, 'learning_rate': 3.004188215064109e-05, 'epoch': 17.63}
- 35%|███▌      | 2045/5800 [5:38:49<7:13:41,  6.93s/it]score1 tensor([[0.7031],
-        [0.5508],
-        [0.4746],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5039, 0.4160, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0635, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:26,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 14:25:26,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4639.14 | bwd_inner_microstep: 4633.69 | bwd_allreduce_microstep: 5.38 | step_microstep: 42.99
-[2025-01-25 14:25:26,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4639.17 | bwd_inner: 4633.69 | bwd_allreduce: 5.41 | step: 43.00
- 35%|███▌      | 2046/5800 [5:38:56<7:13:26,  6.93s/it]                                                       {'loss': 0.0635, 'grad_norm': 9.822221755981445, 'learning_rate': 3.0032222248750016e-05, 'epoch': 17.64}
- 35%|███▌      | 2046/5800 [5:38:56<7:13:26,  6.93s/it]score1 tensor([[0.6367],
-        [0.6016],
-        [0.5781],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5547, 0.5000, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:33,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 14:25:33,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4626.85 | bwd_inner_microstep: 4621.30 | bwd_allreduce_microstep: 5.46 | step_microstep: 44.81
-[2025-01-25 14:25:33,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.22 | bwd: 4626.87 | bwd_inner: 4621.30 | bwd_allreduce: 5.51 | step: 44.82
- 35%|███▌      | 2047/5800 [5:39:03<7:13:03,  6.92s/it]                                                       {'loss': 0.0625, 'grad_norm': 9.84626293182373, 'learning_rate': 3.002255921863796e-05, 'epoch': 17.65}
- 35%|███▌      | 2047/5800 [5:39:03<7:13:03,  6.92s/it]score1 tensor([[0.6992],
-        [0.7070],
-        [0.4980],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5742, 0.5195, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:40,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 14:25:40,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.09 | bwd_microstep: 4637.06 | bwd_inner_microstep: 4631.69 | bwd_allreduce_microstep: 5.29 | step_microstep: 41.11
-[2025-01-25 14:25:40,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.05 | bwd: 4637.08 | bwd_inner: 4631.69 | bwd_allreduce: 5.33 | step: 41.11
- 35%|███▌      | 2048/5800 [5:39:10<7:12:45,  6.92s/it]                                                       {'loss': 0.0596, 'grad_norm': 5.235495090484619, 'learning_rate': 3.0012893063318043e-05, 'epoch': 17.66}
- 35%|███▌      | 2048/5800 [5:39:10<7:12:45,  6.92s/it]score1 tensor([[0.6602],
-        [0.3867],
-        [0.4219],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4160, 0.4023, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:47,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.07 | optimizer_step: 4.37
-[2025-01-25 14:25:47,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.38 | bwd_microstep: 4643.61 | bwd_inner_microstep: 4636.62 | bwd_allreduce_microstep: 6.82 | step_microstep: 49.77
-[2025-01-25 14:25:47,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.36 | bwd: 4643.68 | bwd_inner: 4636.61 | bwd_allreduce: 6.89 | step: 49.76
- 35%|███▌      | 2049/5800 [5:39:17<7:12:52,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 0.9056896567344666, 'learning_rate': 3.0003223785804328e-05, 'epoch': 17.66}
- 35%|███▌      | 2049/5800 [5:39:17<7:12:52,  6.92s/it]score1 tensor([[0.3379],
-        [0.4570],
-        [0.4668],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4473, 0.3809, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:25:54,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.90 | optimizer_step: 4.48
-[2025-01-25 14:25:54,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.05 | bwd_microstep: 4639.87 | bwd_inner_microstep: 4632.21 | bwd_allreduce_microstep: 7.49 | step_microstep: 93.72
-[2025-01-25 14:25:54,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.02 | bwd: 4639.92 | bwd_inner: 4632.21 | bwd_allreduce: 7.58 | step: 93.72
- 35%|███▌      | 2050/5800 [5:39:24<7:14:12,  6.95s/it]                                                       {'loss': 0.0454, 'grad_norm': 0.8658824563026428, 'learning_rate': 2.9993551389111865e-05, 'epoch': 17.67}
- 35%|███▌      | 2050/5800 [5:39:24<7:14:12,  6.95s/it]score1 tensor([[0.5703],
-        [0.4102],
-        [0.5391],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5820, 0.5508, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:01,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.34 | optimizer_step: 4.37
-[2025-01-25 14:26:01,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.47 | bwd_microstep: 4636.03 | bwd_inner_microstep: 4630.76 | bwd_allreduce_microstep: 5.15 | step_microstep: 41.89
-[2025-01-25 14:26:01,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.32 | bwd: 4636.08 | bwd_inner: 4630.76 | bwd_allreduce: 5.21 | step: 41.90
- 35%|███▌      | 2051/5800 [5:39:31<7:13:44,  6.94s/it]                                                       {'loss': 0.0723, 'grad_norm': 1.0636018514633179, 'learning_rate': 2.9983875876256673e-05, 'epoch': 17.68}
- 35%|███▌      | 2051/5800 [5:39:31<7:13:44,  6.94s/it]score1 tensor([[0.4219],
-        [0.4766],
-        [0.3633],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.5391, 0.3711, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:08,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:26:08,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.82 | bwd_microstep: 4630.33 | bwd_inner_microstep: 4624.99 | bwd_allreduce_microstep: 5.22 | step_microstep: 44.90
-[2025-01-25 14:26:08,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.78 | bwd: 4630.44 | bwd_inner: 4624.99 | bwd_allreduce: 5.29 | step: 44.90
- 35%|███▌      | 2052/5800 [5:39:38<7:13:19,  6.94s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.006751537322998, 'learning_rate': 2.997419725025575e-05, 'epoch': 17.69}
- 35%|███▌      | 2052/5800 [5:39:38<7:13:19,  6.94s/it]score1 tensor([[0.5547],
-        [0.4375],
-        [0.3887],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4688, 0.4551, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:15,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 14:26:15,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.66 | bwd_microstep: 4627.84 | bwd_inner_microstep: 4622.77 | bwd_allreduce_microstep: 4.97 | step_microstep: 46.41
-[2025-01-25 14:26:15,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4627.86 | bwd_inner: 4622.77 | bwd_allreduce: 5.02 | step: 46.42
- 35%|███▌      | 2053/5800 [5:39:45<7:12:40,  6.93s/it]                                                       {'loss': 0.063, 'grad_norm': 8.442479133605957, 'learning_rate': 2.9964515514127056e-05, 'epoch': 17.7}
- 35%|███▌      | 2053/5800 [5:39:45<7:12:40,  6.93s/it]score1 tensor([[0.4707],
-        [0.4883],
-        [0.5547],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5625, 0.6211, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:22,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 14:26:22,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.90 | bwd_microstep: 4631.62 | bwd_inner_microstep: 4626.19 | bwd_allreduce_microstep: 5.35 | step_microstep: 45.95
-[2025-01-25 14:26:22,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.86 | bwd: 4631.65 | bwd_inner: 4626.19 | bwd_allreduce: 5.39 | step: 45.96
- 35%|███▌      | 2054/5800 [5:39:52<7:12:16,  6.92s/it]                                                       {'loss': 0.042, 'grad_norm': 0.8777186274528503, 'learning_rate': 2.9954830670889524e-05, 'epoch': 17.71}
- 35%|███▌      | 2054/5800 [5:39:52<7:12:16,  6.92s/it]score1 tensor([[0.5938],
-        [0.5508],
-        [0.5078],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6289, 0.5664, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:28,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 14:26:28,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.66 | bwd_microstep: 4637.46 | bwd_inner_microstep: 4632.01 | bwd_allreduce_microstep: 5.35 | step_microstep: 45.00
-[2025-01-25 14:26:28,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.63 | bwd: 4637.48 | bwd_inner: 4632.01 | bwd_allreduce: 5.40 | step: 45.01
- 35%|███▌      | 2055/5800 [5:39:58<7:12:09,  6.92s/it]                                                       {'loss': 0.0488, 'grad_norm': 9.226137161254883, 'learning_rate': 2.994514272356306e-05, 'epoch': 17.72}
- 35%|███▌      | 2055/5800 [5:39:58<7:12:09,  6.92s/it]score1 tensor([[0.4141],
-        [0.4766],
-        [0.5742],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4453, 0.5391, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:35,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 14:26:35,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.20 | bwd_microstep: 4626.15 | bwd_inner_microstep: 4620.90 | bwd_allreduce_microstep: 5.13 | step_microstep: 43.52
-[2025-01-25 14:26:35,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.17 | bwd: 4626.17 | bwd_inner: 4620.90 | bwd_allreduce: 5.20 | step: 43.53
- 35%|███▌      | 2056/5800 [5:40:05<7:11:49,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 1.022670030593872, 'learning_rate': 2.9935451675168533e-05, 'epoch': 17.72}
- 35%|███▌      | 2056/5800 [5:40:05<7:11:49,  6.92s/it]score1 tensor([[0.5039],
-        [0.4043],
-        [0.4160],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4141, 0.3730, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:42,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 14:26:42,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.31 | bwd_microstep: 4627.68 | bwd_inner_microstep: 4621.23 | bwd_allreduce_microstep: 6.33 | step_microstep: 47.39
-[2025-01-25 14:26:42,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.27 | bwd: 4627.71 | bwd_inner: 4621.23 | bwd_allreduce: 6.40 | step: 47.41
- 35%|███▌      | 2057/5800 [5:40:12<7:11:44,  6.92s/it]                                                       {'loss': 0.0449, 'grad_norm': 4.211069583892822, 'learning_rate': 2.9925757528727774e-05, 'epoch': 17.73}
- 35%|███▌      | 2057/5800 [5:40:12<7:11:44,  6.92s/it]score1 tensor([[0.4297],
-        [0.4121],
-        [0.4316],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4062, 0.5117, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:49,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.40 | optimizer_step: 4.37
-[2025-01-25 14:26:49,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.80 | bwd_microstep: 4631.22 | bwd_inner_microstep: 4623.34 | bwd_allreduce_microstep: 7.74 | step_microstep: 56.04
-[2025-01-25 14:26:49,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.77 | bwd: 4631.24 | bwd_inner: 4623.34 | bwd_allreduce: 7.82 | step: 56.05
- 35%|███▌      | 2058/5800 [5:40:19<7:11:58,  6.93s/it]                                                       {'loss': 0.0317, 'grad_norm': 0.5408427715301514, 'learning_rate': 2.9916060287263594e-05, 'epoch': 17.74}
- 35%|███▌      | 2058/5800 [5:40:19<7:11:58,  6.93s/it]score1 tensor([[0.5742],
-        [0.5586],
-        [0.5625],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5664, 0.5273, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:26:56,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 14:26:56,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.75 | bwd_microstep: 4628.92 | bwd_inner_microstep: 4619.51 | bwd_allreduce_microstep: 9.16 | step_microstep: 59.24
-[2025-01-25 14:26:56,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.69 | bwd: 4628.99 | bwd_inner: 4619.51 | bwd_allreduce: 9.32 | step: 59.22
- 36%|███▌      | 2059/5800 [5:40:26<7:12:14,  6.93s/it]                                                       {'loss': 0.0215, 'grad_norm': 4.549279689788818, 'learning_rate': 2.9906359953799756e-05, 'epoch': 17.75}
- 36%|███▌      | 2059/5800 [5:40:26<7:12:14,  6.93s/it]score1 tensor([[0.4590],
-        [0.5391],
-        [0.4785],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.5469, 0.4375, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:03,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:27:03,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.44 | bwd_microstep: 4630.77 | bwd_inner_microstep: 4621.99 | bwd_allreduce_microstep: 8.56 | step_microstep: 75.35
-[2025-01-25 14:27:03,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.41 | bwd: 4630.83 | bwd_inner: 4621.99 | bwd_allreduce: 8.69 | step: 75.33
- 36%|███▌      | 2060/5800 [5:40:33<7:12:56,  6.95s/it]                                                       {'loss': 0.021, 'grad_norm': 4.425289154052734, 'learning_rate': 2.9896656531360997e-05, 'epoch': 17.76}
- 36%|███▌      | 2060/5800 [5:40:33<7:12:56,  6.95s/it]score1 tensor([[0.6016],
-        [0.6289],
-        [0.6250],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6445, 0.5469, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:10,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:27:10,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.03 | bwd_microstep: 4625.61 | bwd_inner_microstep: 4620.13 | bwd_allreduce_microstep: 5.39 | step_microstep: 44.25
-[2025-01-25 14:27:10,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4625.63 | bwd_inner: 4620.13 | bwd_allreduce: 5.43 | step: 44.26
- 36%|███▌      | 2061/5800 [5:40:40<7:12:37,  6.94s/it]                                                       {'loss': 0.0449, 'grad_norm': 0.8368126749992371, 'learning_rate': 2.9886950022973004e-05, 'epoch': 17.77}
- 36%|███▌      | 2061/5800 [5:40:40<7:12:37,  6.94s/it]score1 tensor([[0.4473],
-        [0.4395],
-        [0.6328],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.3984, 0.6094, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:17,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.78 | optimizer_step: 4.53
-[2025-01-25 14:27:17,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.49 | bwd_microstep: 4624.68 | bwd_inner_microstep: 4619.28 | bwd_allreduce_microstep: 5.30 | step_microstep: 57.12
-[2025-01-25 14:27:17,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.45 | bwd: 4624.71 | bwd_inner: 4619.28 | bwd_allreduce: 5.35 | step: 57.13
- 36%|███▌      | 2062/5800 [5:40:47<7:12:08,  6.94s/it]                                                       {'loss': 0.0396, 'grad_norm': 4.107481956481934, 'learning_rate': 2.9877240431662442e-05, 'epoch': 17.78}
- 36%|███▌      | 2062/5800 [5:40:47<7:12:08,  6.94s/it]score1 tensor([[0.5469],
-        [0.4844],
-        [0.4980],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.3398, 0.4473, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0693, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:24,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:27:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4622.16 | bwd_inner_microstep: 4617.78 | bwd_allreduce_microstep: 4.30 | step_microstep: 47.51
-[2025-01-25 14:27:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.19 | bwd: 4622.18 | bwd_inner: 4617.78 | bwd_allreduce: 4.34 | step: 47.52
- 36%|███▌      | 2063/5800 [5:40:54<7:11:48,  6.93s/it]                                                       {'loss': 0.0693, 'grad_norm': 8.75247859954834, 'learning_rate': 2.986752776045693e-05, 'epoch': 17.78}
- 36%|███▌      | 2063/5800 [5:40:54<7:11:48,  6.93s/it]score1 tensor([[0.5273],
-        [0.4785],
-        [0.6250],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4609, 0.6797, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:31,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:27:31,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.50 | bwd_microstep: 4626.54 | bwd_inner_microstep: 4621.61 | bwd_allreduce_microstep: 4.85 | step_microstep: 47.69
-[2025-01-25 14:27:31,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.47 | bwd: 4626.56 | bwd_inner: 4621.61 | bwd_allreduce: 4.89 | step: 47.70
- 36%|███▌      | 2064/5800 [5:41:01<7:11:15,  6.93s/it]                                                       {'loss': 0.0337, 'grad_norm': 4.058903694152832, 'learning_rate': 2.9857812012385045e-05, 'epoch': 17.79}
- 36%|███▌      | 2064/5800 [5:41:01<7:11:15,  6.93s/it]score1 tensor([[0.4688],
-        [0.4531],
-        [0.6016],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4375, 0.6172, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:38,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 14:27:38,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.77 | bwd_microstep: 4632.81 | bwd_inner_microstep: 4627.97 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.23
-[2025-01-25 14:27:38,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.73 | bwd: 4632.83 | bwd_inner: 4627.97 | bwd_allreduce: 4.80 | step: 42.24
- 36%|███▌      | 2065/5800 [5:41:08<7:10:45,  6.92s/it]                                                       {'loss': 0.0337, 'grad_norm': 4.27838134765625, 'learning_rate': 2.9848093190476328e-05, 'epoch': 17.8}
- 36%|███▌      | 2065/5800 [5:41:08<7:10:45,  6.92s/it]score1 tensor([[0.4375],
-        [0.4082],
-        [0.4629],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.3789, 0.3340, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0581, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:45,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:27:45,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4630.08 | bwd_inner_microstep: 4620.59 | bwd_allreduce_microstep: 9.39 | step_microstep: 42.21
-[2025-01-25 14:27:45,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4630.10 | bwd_inner: 4620.60 | bwd_allreduce: 9.44 | step: 42.19
- 36%|███▌      | 2066/5800 [5:41:15<7:10:21,  6.92s/it]                                                       {'loss': 0.0581, 'grad_norm': 8.151716232299805, 'learning_rate': 2.9838371297761273e-05, 'epoch': 17.81}
- 36%|███▌      | 2066/5800 [5:41:15<7:10:21,  6.92s/it]score1 tensor([[0.4492],
-        [0.4082],
-        [0.4102],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.4590, 0.4844, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:52,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 14:27:52,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.04 | bwd_microstep: 4629.16 | bwd_inner_microstep: 4624.04 | bwd_allreduce_microstep: 5.03 | step_microstep: 43.43
-[2025-01-25 14:27:52,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.00 | bwd: 4629.18 | bwd_inner: 4624.05 | bwd_allreduce: 5.07 | step: 43.44
- 36%|███▌      | 2067/5800 [5:41:22<7:10:06,  6.91s/it]                                                       {'loss': 0.0552, 'grad_norm': 3.7063405513763428, 'learning_rate': 2.982864633727134e-05, 'epoch': 17.82}
- 36%|███▌      | 2067/5800 [5:41:22<7:10:06,  6.91s/it]score1 tensor([[0.3848],
-        [0.4316],
-        [0.4180],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.4121, 0.4629, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:27:58,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 14:27:58,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.56 | bwd_microstep: 4627.18 | bwd_inner_microstep: 4621.26 | bwd_allreduce_microstep: 5.83 | step_microstep: 44.21
-[2025-01-25 14:27:58,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.52 | bwd: 4627.20 | bwd_inner: 4621.27 | bwd_allreduce: 5.87 | step: 44.22
- 36%|███▌      | 2068/5800 [5:41:28<7:09:54,  6.91s/it]                                                       {'loss': 0.0459, 'grad_norm': 4.125699043273926, 'learning_rate': 2.9818918312038934e-05, 'epoch': 17.83}
- 36%|███▌      | 2068/5800 [5:41:28<7:09:54,  6.91s/it]score1 tensor([[0.4961],
-        [0.4980],
-        [0.4219],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6289, 0.4355, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0908, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:28:05,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.26 | optimizer_step: 4.37
-[2025-01-25 14:28:05,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.64 | bwd_microstep: 4631.89 | bwd_inner_microstep: 4624.65 | bwd_allreduce_microstep: 7.14 | step_microstep: 52.96
-[2025-01-25 14:28:05,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.55 | bwd: 4631.92 | bwd_inner: 4624.65 | bwd_allreduce: 7.19 | step: 52.98
- 36%|███▌      | 2069/5800 [5:41:35<7:10:10,  6.92s/it]                                                       {'loss': 0.0908, 'grad_norm': 4.176988124847412, 'learning_rate': 2.9809187225097424e-05, 'epoch': 17.84}
- 36%|███▌      | 2069/5800 [5:41:35<7:10:10,  6.92s/it]score1 tensor([[0.4707],
-        [0.3828],
-        [0.3867],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4277, 0.4727, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0576, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:28:12,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 14:28:12,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.28 | bwd_microstep: 4625.00 | bwd_inner_microstep: 4619.57 | bwd_allreduce_microstep: 5.34 | step_microstep: 50.71
-[2025-01-25 14:28:12,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.25 | bwd: 4625.02 | bwd_inner: 4619.57 | bwd_allreduce: 5.39 | step: 50.75
- 36%|███▌      | 2070/5800 [5:41:42<7:10:50,  6.93s/it]                                                       {'loss': 0.0576, 'grad_norm': 3.7810397148132324, 'learning_rate': 2.9799453079481136e-05, 'epoch': 17.84}
- 36%|███▌      | 2070/5800 [5:41:42<7:10:50,  6.93s/it]evaluate!
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2148, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3848]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2090, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1953, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2266, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3867]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1582, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3926]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1875, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.2090, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1543, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3965]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1504, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1777, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3945]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.5335661905618233
-PLCC_score: 0.5076952623757025
-KRCC_score: 0.3701899900264391
-SRCC_level: 0.5335661905618233
-PLCC_level: 0.5076952623757025
-KRCC_level: 0.3701899900264391
-score1 tensor([[0.5195],
-        [0.5391],
-        [0.3730],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6094, 0.3613, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:38:50,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:38:50,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.76 | bwd_microstep: 4595.20 | bwd_inner_microstep: 4590.54 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.15
-[2025-01-25 14:38:50,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.72 | bwd: 4595.22 | bwd_inner: 4590.54 | bwd_allreduce: 4.62 | step: 41.15
- 36%|███▌      | 2071/5800 [5:52:20<203:03:23, 196.03s/it]                                                          {'loss': 0.0547, 'grad_norm': 5.104669570922852, 'learning_rate': 2.9789715878225344e-05, 'epoch': 17.85}
- 36%|███▌      | 2071/5800 [5:52:20<203:03:23, 196.03s/it]score1 tensor([[0.5234],
-        [0.6172],
-        [0.4922],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.6641, 0.5703, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0830, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:38:56,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 14:38:56,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.32 | bwd_microstep: 4566.45 | bwd_inner_microstep: 4561.49 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.65
-[2025-01-25 14:38:56,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.29 | bwd: 4566.48 | bwd_inner: 4561.49 | bwd_allreduce: 4.93 | step: 42.66
- 36%|███▌      | 2072/5800 [5:52:26<144:13:21, 139.27s/it]                                                          {'loss': 0.083, 'grad_norm': 8.812199592590332, 'learning_rate': 2.9779975624366276e-05, 'epoch': 17.86}
- 36%|███▌      | 2072/5800 [5:52:26<144:13:21, 139.27s/it]score1 tensor([[0.4805],
-        [0.4297],
-        [0.5273],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4570, 0.5469, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:03,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 14:39:03,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2128.86 | bwd_microstep: 4572.39 | bwd_inner_microstep: 4567.31 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.75
-[2025-01-25 14:39:03,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2128.84 | bwd: 4572.42 | bwd_inner: 4567.31 | bwd_allreduce: 5.04 | step: 42.76
- 36%|███▌      | 2073/5800 [5:52:33<103:02:46, 99.53s/it]                                                          {'loss': 0.0146, 'grad_norm': 8.39771842956543, 'learning_rate': 2.9770232320941105e-05, 'epoch': 17.87}
- 36%|███▌      | 2073/5800 [5:52:33<103:02:46, 99.53s/it]score1 tensor([[0.4844],
-        [0.4863],
-        [0.5156],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4316, 0.4844, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:10,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:39:10,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.34 | bwd_microstep: 4588.39 | bwd_inner_microstep: 4582.88 | bwd_allreduce_microstep: 5.41 | step_microstep: 42.82
-[2025-01-25 14:39:10,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.30 | bwd: 4588.44 | bwd_inner: 4582.88 | bwd_allreduce: 5.46 | step: 42.83
- 36%|███▌      | 2074/5800 [5:52:40<74:14:35, 71.73s/it]                                                         {'loss': 0.042, 'grad_norm': 4.648991584777832, 'learning_rate': 2.976048597098797e-05, 'epoch': 17.88}
- 36%|███▌      | 2074/5800 [5:52:40<74:14:35, 71.73s/it]score1 tensor([[0.5312],
-        [0.5156],
-        [0.4434],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5352, 0.3223, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:17,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 14:39:17,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.01 | bwd_microstep: 4590.16 | bwd_inner_microstep: 4585.54 | bwd_allreduce_microstep: 4.54 | step_microstep: 42.33
-[2025-01-25 14:39:17,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.91 | bwd: 4590.18 | bwd_inner: 4585.54 | bwd_allreduce: 4.58 | step: 42.34
- 36%|███▌      | 2075/5800 [5:52:47<54:05:08, 52.27s/it]                                                        {'loss': 0.043, 'grad_norm': 0.5157396793365479, 'learning_rate': 2.975073657754595e-05, 'epoch': 17.89}
- 36%|███▌      | 2075/5800 [5:52:47<54:05:08, 52.27s/it]score1 tensor([[0.5273],
-        [0.6133],
-        [0.6602],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5625, 0.6367, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:24,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.18 | optimizer_step: 4.37
-[2025-01-25 14:39:24,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.56 | bwd_microstep: 4609.88 | bwd_inner_microstep: 4605.10 | bwd_allreduce_microstep: 4.69 | step_microstep: 52.62
-[2025-01-25 14:39:24,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.53 | bwd: 4609.91 | bwd_inner: 4605.10 | bwd_allreduce: 4.74 | step: 52.63
- 36%|███▌      | 2076/5800 [5:52:54<39:59:24, 38.66s/it]                                                        {'loss': 0.0376, 'grad_norm': 9.635863304138184, 'learning_rate': 2.9740984143655075e-05, 'epoch': 17.9}
- 36%|███▌      | 2076/5800 [5:52:54<39:59:24, 38.66s/it]score1 tensor([[0.5469],
-        [0.5039],
-        [0.5938],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4414, 0.5430, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0640, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:31,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:39:31,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.47 | bwd_microstep: 4611.27 | bwd_inner_microstep: 4606.55 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.67
-[2025-01-25 14:39:31,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.44 | bwd: 4611.30 | bwd_inner: 4606.55 | bwd_allreduce: 4.68 | step: 42.67
- 36%|███▌      | 2077/5800 [5:53:01<30:07:08, 29.12s/it]                                                        {'loss': 0.064, 'grad_norm': 9.013300895690918, 'learning_rate': 2.9731228672356326e-05, 'epoch': 17.91}
- 36%|███▌      | 2077/5800 [5:53:01<30:07:08, 29.12s/it]score1 tensor([[0.5352],
-        [0.5117],
-        [0.5859],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4395, 0.4805, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1099, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:38,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:39:38,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.82 | bwd_microstep: 4611.03 | bwd_inner_microstep: 4606.31 | bwd_allreduce_microstep: 4.64 | step_microstep: 46.83
-[2025-01-25 14:39:38,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.79 | bwd: 4611.05 | bwd_inner: 4606.31 | bwd_allreduce: 4.68 | step: 46.84
- 36%|███▌      | 2078/5800 [5:53:08<23:12:37, 22.45s/it]                                                        {'loss': 0.1099, 'grad_norm': 8.709654808044434, 'learning_rate': 2.972147016669163e-05, 'epoch': 17.91}
- 36%|███▌      | 2078/5800 [5:53:08<23:12:37, 22.45s/it]score1 tensor([[0.5703],
-        [0.6289],
-        [0.5312],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6562, 0.4961, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:45,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:39:45,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.99 | bwd_microstep: 4612.12 | bwd_inner_microstep: 4607.63 | bwd_allreduce_microstep: 4.42 | step_microstep: 43.52
-[2025-01-25 14:39:45,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.95 | bwd: 4612.14 | bwd_inner: 4607.63 | bwd_allreduce: 4.45 | step: 43.52
- 36%|███▌      | 2079/5800 [5:53:15<18:22:43, 17.78s/it]                                                        {'loss': 0.063, 'grad_norm': 4.158846378326416, 'learning_rate': 2.9711708629703847e-05, 'epoch': 17.92}
- 36%|███▌      | 2079/5800 [5:53:15<18:22:43, 17.78s/it]score1 tensor([[0.5117],
-        [0.5820],
-        [0.4902],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5938, 0.4648, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:51,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 14:39:51,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.08 | bwd_microstep: 4606.99 | bwd_inner_microstep: 4601.77 | bwd_allreduce_microstep: 5.11 | step_microstep: 41.56
-[2025-01-25 14:39:51,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.05 | bwd: 4607.02 | bwd_inner: 4601.77 | bwd_allreduce: 5.17 | step: 41.57
- 36%|███▌      | 2080/5800 [5:53:21<14:59:45, 14.51s/it]                                                        {'loss': 0.0527, 'grad_norm': 0.6515860557556152, 'learning_rate': 2.97019440644368e-05, 'epoch': 17.93}
- 36%|███▌      | 2080/5800 [5:53:21<14:59:45, 14.51s/it]score1 tensor([[0.4570],
-        [0.5156],
-        [0.4980],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3438, 0.5430, 0.5508, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0674, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:39:58,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:39:58,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.11 | bwd_microstep: 4599.33 | bwd_inner_microstep: 4593.92 | bwd_allreduce_microstep: 5.33 | step_microstep: 43.31
-[2025-01-25 14:39:58,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.08 | bwd: 4599.36 | bwd_inner: 4593.92 | bwd_allreduce: 5.37 | step: 43.32
- 36%|███▌      | 2081/5800 [5:53:28<12:37:23, 12.22s/it]                                                        {'loss': 0.0674, 'grad_norm': 0.5679455995559692, 'learning_rate': 2.9692176473935247e-05, 'epoch': 17.94}
- 36%|███▌      | 2081/5800 [5:53:28<12:37:23, 12.22s/it]score1 tensor([[0.5352],
-        [0.5352],
-        [0.4473],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5469, 0.3906, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0635, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:40:05,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 14:40:05,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.17 | bwd_microstep: 4611.07 | bwd_inner_microstep: 4605.76 | bwd_allreduce_microstep: 5.20 | step_microstep: 45.29
-[2025-01-25 14:40:05,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.14 | bwd: 4611.10 | bwd_inner: 4605.76 | bwd_allreduce: 5.25 | step: 45.30
- 36%|███▌      | 2082/5800 [5:53:35<10:57:55, 10.62s/it]                                                        {'loss': 0.0635, 'grad_norm': 0.773417055606842, 'learning_rate': 2.9682405861244902e-05, 'epoch': 17.95}
- 36%|███▌      | 2082/5800 [5:53:35<10:57:55, 10.62s/it]score1 tensor([[0.4316],
-        [0.5000],
-        [0.4688],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4453, 0.4551, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:40:12,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:40:12,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.77 | bwd_microstep: 4613.65 | bwd_inner_microstep: 4609.07 | bwd_allreduce_microstep: 4.51 | step_microstep: 41.56
-[2025-01-25 14:40:12,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.73 | bwd: 4613.68 | bwd_inner: 4609.07 | bwd_allreduce: 4.55 | step: 41.57
- 36%|███▌      | 2083/5800 [5:53:42<9:48:15,  9.50s/it]                                                        {'loss': 0.0317, 'grad_norm': 8.330718994140625, 'learning_rate': 2.9672632229412398e-05, 'epoch': 17.96}
- 36%|███▌      | 2083/5800 [5:53:42<9:48:15,  9.50s/it]score1 tensor([[0.5469],
-        [0.4551],
-        [0.4766],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4863, 0.4941, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:40:19,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:40:19,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.66 | bwd_microstep: 4605.61 | bwd_inner_microstep: 4600.57 | bwd_allreduce_microstep: 4.95 | step_microstep: 50.00
-[2025-01-25 14:40:19,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.61 | bwd: 4605.64 | bwd_inner: 4600.57 | bwd_allreduce: 4.99 | step: 50.01
- 36%|███▌      | 2084/5800 [5:53:49<8:59:39,  8.71s/it]                                                       {'loss': 0.0254, 'grad_norm': 4.222324848175049, 'learning_rate': 2.966285558148534e-05, 'epoch': 17.97}
- 36%|███▌      | 2084/5800 [5:53:49<8:59:39,  8.71s/it]score1 tensor([[0.3867],
-        [0.4375],
-        [0.4395],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4023, 0.4766, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:40:26,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:40:26,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.85 | bwd_microstep: 4611.35 | bwd_inner_microstep: 4602.91 | bwd_allreduce_microstep: 8.27 | step_microstep: 60.13
-[2025-01-25 14:40:26,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.73 | bwd: 4611.40 | bwd_inner: 4602.91 | bwd_allreduce: 8.36 | step: 60.10
- 36%|███▌      | 2085/5800 [5:53:56<8:26:08,  8.17s/it]                                                       {'loss': 0.0454, 'grad_norm': 0.774806022644043, 'learning_rate': 2.965307592051224e-05, 'epoch': 17.97}
- 36%|███▌      | 2085/5800 [5:53:56<8:26:08,  8.17s/it]score1 tensor([[0.4375],
-        [0.4492],
-        [0.5430],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5391, 0.5469, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:40:33,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 14:40:33,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.61 | bwd_microstep: 4615.13 | bwd_inner_microstep: 4610.22 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.14
-[2025-01-25 14:40:33,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.53 | bwd: 4615.16 | bwd_inner: 4610.22 | bwd_allreduce: 4.87 | step: 43.15
- 36%|███▌      | 2086/5800 [5:54:03<8:02:33,  7.80s/it]                                                       {'loss': 0.0312, 'grad_norm': 8.390710830688477, 'learning_rate': 2.9643293249542573e-05, 'epoch': 17.98}
- 36%|███▌      | 2086/5800 [5:54:03<8:02:33,  7.80s/it]score1 tensor([[0.4473],
-        [0.3965],
-        [0.5117],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4180, 0.4141, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:40:40,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:40:40,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.37 | bwd_microstep: 4616.14 | bwd_inner_microstep: 4611.06 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.82
-[2025-01-25 14:40:40,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.34 | bwd: 4616.17 | bwd_inner: 4611.06 | bwd_allreduce: 5.04 | step: 42.83
- 36%|███▌      | 2087/5800 [5:54:10<7:45:34,  7.52s/it]                                                       {'loss': 0.0444, 'grad_norm': 4.52031135559082, 'learning_rate': 2.9633507571626747e-05, 'epoch': 17.99}
- 36%|███▌      | 2087/5800 [5:54:10<7:45:34,  7.52s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:40:44,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:40:44,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 570.13 | bwd_microstep: 1220.78 | bwd_inner_microstep: 1215.76 | bwd_allreduce_microstep: 4.92 | step_microstep: 57.02
-[2025-01-25 14:40:44,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 570.10 | bwd: 1220.81 | bwd_inner: 1215.75 | bwd_allreduce: 4.98 | step: 57.02
- 36%|███▌      | 2088/5800 [5:54:14<6:53:10,  6.68s/it]                                                       {'loss': 0.0234, 'grad_norm': 9.198143005371094, 'learning_rate': 2.9623718889816105e-05, 'epoch': 18.0}
- 36%|███▌      | 2088/5800 [5:54:14<6:53:10,  6.68s/it][2025-01-25 14:40:49,432] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:40:59,702] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:41:09,945] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:41:19,512] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4570],
-        [0.4453],
-        [0.4238],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4727, 0.4629, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:41:33,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 14:41:33,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.04 | bwd_microstep: 4600.65 | bwd_inner_microstep: 4595.65 | bwd_allreduce_microstep: 4.92 | step_microstep: 51.45
-[2025-01-25 14:41:33,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.00 | bwd: 4600.68 | bwd_inner: 4595.65 | bwd_allreduce: 4.96 | step: 51.47
- 36%|███▌      | 2089/5800 [5:55:03<19:58:29, 19.38s/it]                                                        {'loss': 0.019, 'grad_norm': 8.125077247619629, 'learning_rate': 2.9613927207162922e-05, 'epoch': 18.01}
- 36%|███▌      | 2089/5800 [5:55:03<19:58:29, 19.38s/it]score1 tensor([[0.4414],
-        [0.5508],
-        [0.6172],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5508, 0.6055, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:41:40,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:41:40,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.95 | bwd_microstep: 4491.03 | bwd_inner_microstep: 4485.64 | bwd_allreduce_microstep: 5.32 | step_microstep: 44.39
-[2025-01-25 14:41:40,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.91 | bwd: 4491.06 | bwd_inner: 4485.64 | bwd_allreduce: 5.36 | step: 44.40
- 36%|███▌      | 2090/5800 [5:55:10<16:03:59, 15.59s/it]                                                        {'loss': 0.0063, 'grad_norm': 4.544936180114746, 'learning_rate': 2.9604132526720426e-05, 'epoch': 18.02}
- 36%|███▌      | 2090/5800 [5:55:10<16:03:59, 15.59s/it]score1 tensor([[0.4805],
-        [0.5742],
-        [0.3965],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5625, 0.4004, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:41:47,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 14:41:47,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.75 | bwd_microstep: 4586.27 | bwd_inner_microstep: 4579.72 | bwd_allreduce_microstep: 6.35 | step_microstep: 47.39
-[2025-01-25 14:41:47,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.71 | bwd: 4586.32 | bwd_inner: 4579.72 | bwd_allreduce: 6.46 | step: 47.37
- 36%|███▌      | 2091/5800 [5:55:17<13:21:31, 12.97s/it]                                                        {'loss': 0.0151, 'grad_norm': 3.915909767150879, 'learning_rate': 2.959433485154276e-05, 'epoch': 18.03}
- 36%|███▌      | 2091/5800 [5:55:17<13:21:31, 12.97s/it]score1 tensor([[0.4102],
-        [0.4844],
-        [0.5703],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4316, 0.7031, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0562, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:41:54,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 14:41:54,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.04 | bwd_microstep: 4602.90 | bwd_inner_microstep: 4595.51 | bwd_allreduce_microstep: 7.24 | step_microstep: 49.06
-[2025-01-25 14:41:54,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.00 | bwd: 4602.95 | bwd_inner: 4595.51 | bwd_allreduce: 7.31 | step: 49.03
- 36%|███▌      | 2092/5800 [5:55:24<11:28:39, 11.14s/it]                                                        {'loss': 0.0562, 'grad_norm': 4.722601413726807, 'learning_rate': 2.958453418468501e-05, 'epoch': 18.03}
- 36%|███▌      | 2092/5800 [5:55:24<11:28:39, 11.14s/it]score1 tensor([[0.4570],
-        [0.4160],
-        [0.5547],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.3750, 0.5156, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:01,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 14:42:01,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.86 | bwd_microstep: 4599.91 | bwd_inner_microstep: 4595.41 | bwd_allreduce_microstep: 4.41 | step_microstep: 41.69
-[2025-01-25 14:42:01,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.82 | bwd: 4599.93 | bwd_inner: 4595.42 | bwd_allreduce: 4.45 | step: 41.69
- 36%|███▌      | 2093/5800 [5:55:31<10:09:26,  9.86s/it]                                                        {'loss': 0.0513, 'grad_norm': 8.404861450195312, 'learning_rate': 2.9574730529203192e-05, 'epoch': 18.04}
- 36%|███▌      | 2093/5800 [5:55:31<10:09:26,  9.86s/it]score1 tensor([[0.4668],
-        [0.5195],
-        [0.5273],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4707, 0.5391, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:08,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:42:08,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.35 | bwd_microstep: 4602.01 | bwd_inner_microstep: 4597.98 | bwd_allreduce_microstep: 3.97 | step_microstep: 41.27
-[2025-01-25 14:42:08,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.32 | bwd: 4602.03 | bwd_inner: 4597.98 | bwd_allreduce: 4.00 | step: 41.27
- 36%|███▌      | 2094/5800 [5:55:38<9:13:32,  8.96s/it]                                                        {'loss': 0.0483, 'grad_norm': 4.689432144165039, 'learning_rate': 2.9564923888154267e-05, 'epoch': 18.05}
- 36%|███▌      | 2094/5800 [5:55:38<9:13:32,  8.96s/it]score1 tensor([[0.4688],
-        [0.5039],
-        [0.5938],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4863, 0.5000, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:14,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:42:14,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.64 | bwd_microstep: 4614.42 | bwd_inner_microstep: 4609.81 | bwd_allreduce_microstep: 4.51 | step_microstep: 45.24
-[2025-01-25 14:42:15,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.61 | bwd: 4614.44 | bwd_inner: 4609.81 | bwd_allreduce: 4.57 | step: 45.26
- 36%|███▌      | 2095/5800 [5:55:44<8:34:58,  8.34s/it]                                                       {'loss': 0.0425, 'grad_norm': 0.9681897163391113, 'learning_rate': 2.95551142645961e-05, 'epoch': 18.06}
- 36%|███▌      | 2095/5800 [5:55:44<8:34:58,  8.34s/it]score1 tensor([[0.6094],
-        [0.6445],
-        [0.4531],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6250, 0.4727, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:21,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:42:21,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.38 | bwd_microstep: 4605.78 | bwd_inner_microstep: 4600.92 | bwd_allreduce_microstep: 4.72 | step_microstep: 41.59
-[2025-01-25 14:42:21,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.33 | bwd: 4605.80 | bwd_inner: 4600.92 | bwd_allreduce: 4.80 | step: 41.59
- 36%|███▌      | 2096/5800 [5:55:51<8:07:33,  7.90s/it]                                                       {'loss': 0.0244, 'grad_norm': 5.292811870574951, 'learning_rate': 2.954530166158752e-05, 'epoch': 18.07}
- 36%|███▌      | 2096/5800 [5:55:51<8:07:33,  7.90s/it]score1 tensor([[0.6914],
-        [0.6055],
-        [0.4395],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.5664, 0.4609, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:28,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 14:42:28,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.94 | bwd_microstep: 4606.40 | bwd_inner_microstep: 4601.47 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.36
-[2025-01-25 14:42:28,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.90 | bwd: 4606.42 | bwd_inner: 4601.47 | bwd_allreduce: 4.88 | step: 42.36
- 36%|███▌      | 2097/5800 [5:55:58<7:48:31,  7.59s/it]                                                       {'loss': 0.0376, 'grad_norm': 6.015527248382568, 'learning_rate': 2.9535486082188244e-05, 'epoch': 18.08}
- 36%|███▌      | 2097/5800 [5:55:58<7:48:31,  7.59s/it]score1 tensor([[0.4043],
-        [0.5078],
-        [0.4824],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.2812, 0.5781, 0.6094, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:35,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:42:35,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.05 | bwd_microstep: 4606.46 | bwd_inner_microstep: 4601.74 | bwd_allreduce_microstep: 4.64 | step_microstep: 43.47
-[2025-01-25 14:42:35,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.01 | bwd: 4606.48 | bwd_inner: 4601.74 | bwd_allreduce: 4.68 | step: 43.48
- 36%|███▌      | 2098/5800 [5:56:05<7:35:02,  7.37s/it]                                                       {'loss': 0.082, 'grad_norm': 1.6269636154174805, 'learning_rate': 2.952566752945896e-05, 'epoch': 18.09}
- 36%|███▌      | 2098/5800 [5:56:05<7:35:02,  7.37s/it]score1 tensor([[0.4414],
-        [0.5000],
-        [0.4766],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.5078, 0.4688, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:42,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 14:42:42,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.60 | bwd_microstep: 4615.28 | bwd_inner_microstep: 4610.33 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.74
-[2025-01-25 14:42:42,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.56 | bwd: 4615.30 | bwd_inner: 4610.33 | bwd_allreduce: 4.89 | step: 45.75
- 36%|███▌      | 2099/5800 [5:56:12<7:25:57,  7.23s/it]                                                       {'loss': 0.0347, 'grad_norm': 0.6342516541481018, 'learning_rate': 2.9515846006461254e-05, 'epoch': 18.09}
- 36%|███▌      | 2099/5800 [5:56:12<7:25:57,  7.23s/it]score1 tensor([[0.5117],
-        [0.5117],
-        [0.4355],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5898, 0.4258, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:49,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.07 | optimizer_step: 4.37
-[2025-01-25 14:42:49,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.15 | bwd_microstep: 4608.90 | bwd_inner_microstep: 4603.60 | bwd_allreduce_microstep: 5.18 | step_microstep: 51.08
-[2025-01-25 14:42:49,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.12 | bwd: 4608.92 | bwd_inner: 4603.60 | bwd_allreduce: 5.24 | step: 51.09
- 36%|███▌      | 2100/5800 [5:56:19<7:19:35,  7.13s/it]                                                       {'loss': 0.0308, 'grad_norm': 4.7237982749938965, 'learning_rate': 2.9506021516257646e-05, 'epoch': 18.1}
- 36%|███▌      | 2100/5800 [5:56:19<7:19:35,  7.13s/it]score1 tensor([[0.5430],
-        [0.5195],
-        [0.4297],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5547, 0.4492, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:42:56,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.94 | optimizer_step: 4.55
-[2025-01-25 14:42:56,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.84 | bwd_microstep: 4610.27 | bwd_inner_microstep: 4604.85 | bwd_allreduce_microstep: 5.33 | step_microstep: 69.29
-[2025-01-25 14:42:56,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.78 | bwd: 4610.29 | bwd_inner: 4604.85 | bwd_allreduce: 5.38 | step: 69.33
- 36%|███▌      | 2101/5800 [5:56:26<7:16:03,  7.07s/it]                                                       {'loss': 0.0469, 'grad_norm': 8.683638572692871, 'learning_rate': 2.9496194061911584e-05, 'epoch': 18.11}
- 36%|███▌      | 2101/5800 [5:56:26<7:16:03,  7.07s/it]score1 tensor([[0.5859],
-        [0.4961],
-        [0.5273],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5469, 0.5195, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:03,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 14:43:03,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.17 | bwd_microstep: 4620.76 | bwd_inner_microstep: 4616.01 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.69
-[2025-01-25 14:43:03,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.14 | bwd: 4620.78 | bwd_inner: 4616.01 | bwd_allreduce: 4.71 | step: 41.69
- 36%|███▌      | 2102/5800 [5:56:33<7:13:17,  7.03s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.430288791656494, 'learning_rate': 2.9486363646487443e-05, 'epoch': 18.12}
- 36%|███▌      | 2102/5800 [5:56:33<7:13:17,  7.03s/it]score1 tensor([[0.3711],
-        [0.4844],
-        [0.5195],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4980, 0.5586, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:10,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 14:43:10,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.41 | bwd_microstep: 4619.28 | bwd_inner_microstep: 4614.41 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.25
-[2025-01-25 14:43:10,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.35 | bwd: 4619.31 | bwd_inner: 4614.41 | bwd_allreduce: 4.83 | step: 43.26
- 36%|███▋      | 2103/5800 [5:56:40<7:10:35,  6.99s/it]                                                       {'loss': 0.0264, 'grad_norm': 8.454900741577148, 'learning_rate': 2.947653027305052e-05, 'epoch': 18.13}
- 36%|███▋      | 2103/5800 [5:56:40<7:10:35,  6.99s/it]score1 tensor([[0.4648],
-        [0.4863],
-        [0.4648],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4844, 0.5312, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:17,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:43:17,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.90 | bwd_microstep: 4622.35 | bwd_inner_microstep: 4616.91 | bwd_allreduce_microstep: 5.34 | step_microstep: 44.15
-[2025-01-25 14:43:17,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.87 | bwd: 4622.37 | bwd_inner: 4616.91 | bwd_allreduce: 5.39 | step: 44.16
- 36%|███▋      | 2104/5800 [5:56:47<7:08:45,  6.96s/it]                                                       {'loss': 0.0479, 'grad_norm': 0.6945760846138, 'learning_rate': 2.946669394466702e-05, 'epoch': 18.14}
- 36%|███▋      | 2104/5800 [5:56:47<7:08:45,  6.96s/it]score1 tensor([[0.5469],
-        [0.5312],
-        [0.4746],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4844, 0.4941, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:23,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:43:23,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.45 | bwd_microstep: 4614.37 | bwd_inner_microstep: 4609.40 | bwd_allreduce_microstep: 4.89 | step_microstep: 41.87
-[2025-01-25 14:43:23,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.41 | bwd: 4614.40 | bwd_inner: 4609.40 | bwd_allreduce: 4.93 | step: 41.87
- 36%|███▋      | 2105/5800 [5:56:53<7:07:11,  6.94s/it]                                                       {'loss': 0.0303, 'grad_norm': 4.426111698150635, 'learning_rate': 2.945685466440409e-05, 'epoch': 18.15}
- 36%|███▋      | 2105/5800 [5:56:53<7:07:11,  6.94s/it]score1 tensor([[0.6211],
-        [0.6172],
-        [0.5938],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5156, 0.4570, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0864, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:30,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:43:30,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.36 | bwd_microstep: 4616.59 | bwd_inner_microstep: 4612.01 | bwd_allreduce_microstep: 4.50 | step_microstep: 41.75
-[2025-01-25 14:43:30,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.33 | bwd: 4616.62 | bwd_inner: 4612.01 | bwd_allreduce: 4.54 | step: 41.76
- 36%|███▋      | 2106/5800 [5:57:00<7:06:13,  6.92s/it]                                                       {'loss': 0.0864, 'grad_norm': 5.186394214630127, 'learning_rate': 2.944701243532978e-05, 'epoch': 18.16}
- 36%|███▋      | 2106/5800 [5:57:00<7:06:13,  6.92s/it]score1 tensor([[0.5078],
-        [0.6523],
-        [0.4023],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5195, 0.3613, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0815, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:37,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 14:43:37,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.24 | bwd_microstep: 4622.94 | bwd_inner_microstep: 4617.36 | bwd_allreduce_microstep: 5.47 | step_microstep: 44.15
-[2025-01-25 14:43:37,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.19 | bwd: 4622.97 | bwd_inner: 4617.36 | bwd_allreduce: 5.53 | step: 44.16
- 36%|███▋      | 2107/5800 [5:57:07<7:05:41,  6.92s/it]                                                       {'loss': 0.0815, 'grad_norm': 8.960063934326172, 'learning_rate': 2.9437167260513077e-05, 'epoch': 18.16}
- 36%|███▋      | 2107/5800 [5:57:07<7:05:41,  6.92s/it]score1 tensor([[0.5391],
-        [0.4551],
-        [0.5703],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.3750, 0.5039, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:44,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:43:44,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.23 | bwd_microstep: 4620.55 | bwd_inner_microstep: 4615.74 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.09
-[2025-01-25 14:43:44,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.21 | bwd: 4620.57 | bwd_inner: 4615.74 | bwd_allreduce: 4.77 | step: 42.09
- 36%|███▋      | 2108/5800 [5:57:14<7:05:11,  6.91s/it]                                                       {'loss': 0.0781, 'grad_norm': 8.924513816833496, 'learning_rate': 2.942731914302387e-05, 'epoch': 18.17}
- 36%|███▋      | 2108/5800 [5:57:14<7:05:11,  6.91s/it]score1 tensor([[0.6211],
-        [0.6172],
-        [0.5117],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4980, 0.5000, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0757, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:51,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 14:43:51,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.78 | bwd_microstep: 4622.21 | bwd_inner_microstep: 4610.96 | bwd_allreduce_microstep: 11.12 | step_microstep: 54.08
-[2025-01-25 14:43:51,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.75 | bwd: 4622.23 | bwd_inner: 4610.96 | bwd_allreduce: 11.20 | step: 54.09
- 36%|███▋      | 2109/5800 [5:57:21<7:05:25,  6.92s/it]                                                       {'loss': 0.0757, 'grad_norm': 9.622743606567383, 'learning_rate': 2.941746808593298e-05, 'epoch': 18.18}
- 36%|███▋      | 2109/5800 [5:57:21<7:05:25,  6.92s/it]score1 tensor([[0.4238],
-        [0.6680],
-        [0.5703],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5508, 0.4922, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:43:58,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.03 | optimizer_step: 4.36
-[2025-01-25 14:43:58,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.24 | bwd_microstep: 4624.30 | bwd_inner_microstep: 4616.60 | bwd_allreduce_microstep: 7.45 | step_microstep: 79.49
-[2025-01-25 14:43:58,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.20 | bwd: 4624.36 | bwd_inner: 4616.60 | bwd_allreduce: 7.58 | step: 79.50
- 36%|███▋      | 2110/5800 [5:57:28<7:06:19,  6.93s/it]                                                       {'loss': 0.0771, 'grad_norm': 9.031569480895996, 'learning_rate': 2.940761409231213e-05, 'epoch': 18.19}
- 36%|███▋      | 2110/5800 [5:57:28<7:06:19,  6.93s/it]score1 tensor([[0.4062],
-        [0.5195],
-        [0.4961],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.4844, 0.4277, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:05,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 14:44:05,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.40 | bwd_microstep: 4627.16 | bwd_inner_microstep: 4621.99 | bwd_allreduce_microstep: 5.09 | step_microstep: 57.75
-[2025-01-25 14:44:05,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.32 | bwd: 4627.19 | bwd_inner: 4621.99 | bwd_allreduce: 5.13 | step: 57.76
- 36%|███▋      | 2111/5800 [5:57:35<7:06:20,  6.93s/it]                                                       {'loss': 0.0503, 'grad_norm': 8.785032272338867, 'learning_rate': 2.939775716523397e-05, 'epoch': 18.2}
- 36%|███▋      | 2111/5800 [5:57:35<7:06:20,  6.93s/it]score1 tensor([[0.4922],
-        [0.4629],
-        [0.6719],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4707, 0.6289, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:12,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 14:44:12,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.59 | bwd_microstep: 4626.89 | bwd_inner_microstep: 4619.37 | bwd_allreduce_microstep: 7.37 | step_microstep: 48.60
-[2025-01-25 14:44:12,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4626.94 | bwd_inner: 4619.37 | bwd_allreduce: 7.44 | step: 48.56
- 36%|███▋      | 2112/5800 [5:57:42<7:05:45,  6.93s/it]                                                       {'loss': 0.022, 'grad_norm': 4.985817909240723, 'learning_rate': 2.938789730777206e-05, 'epoch': 18.21}
- 36%|███▋      | 2112/5800 [5:57:42<7:05:45,  6.93s/it]score1 tensor([[0.3867],
-        [0.4512],
-        [0.4043],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4238, 0.4805, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:19,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:44:19,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.58 | bwd_microstep: 4615.73 | bwd_inner_microstep: 4610.54 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.00
-[2025-01-25 14:44:19,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4615.75 | bwd_inner: 4610.55 | bwd_allreduce: 5.14 | step: 43.01
- 36%|███▋      | 2113/5800 [5:57:49<7:05:08,  6.92s/it]                                                       {'loss': 0.0435, 'grad_norm': 0.6039568781852722, 'learning_rate': 2.9378034523000875e-05, 'epoch': 18.22}
- 36%|███▋      | 2113/5800 [5:57:49<7:05:08,  6.92s/it]score1 tensor([[0.4336],
-        [0.4844],
-        [0.4453],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5430, 0.4941, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:26,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 14:44:26,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.68 | bwd_microstep: 4615.73 | bwd_inner_microstep: 4610.79 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.60
-[2025-01-25 14:44:26,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.66 | bwd: 4615.75 | bwd_inner: 4610.79 | bwd_allreduce: 4.88 | step: 42.60
- 36%|███▋      | 2114/5800 [5:57:56<7:04:33,  6.91s/it]                                                       {'loss': 0.0352, 'grad_norm': 0.6537477970123291, 'learning_rate': 2.9368168813995806e-05, 'epoch': 18.22}
- 36%|███▋      | 2114/5800 [5:57:56<7:04:33,  6.91s/it]score1 tensor([[0.4375],
-        [0.4453],
-        [0.5156],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4805, 0.6016, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:33,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 14:44:33,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4624.15 | bwd_inner_microstep: 4618.52 | bwd_allreduce_microstep: 5.56 | step_microstep: 47.63
-[2025-01-25 14:44:33,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4624.17 | bwd_inner: 4618.52 | bwd_allreduce: 5.59 | step: 47.62
- 36%|███▋      | 2115/5800 [5:58:03<7:04:29,  6.91s/it]                                                       {'loss': 0.0547, 'grad_norm': 8.225518226623535, 'learning_rate': 2.935830018383315e-05, 'epoch': 18.23}
- 36%|███▋      | 2115/5800 [5:58:03<7:04:29,  6.91s/it]score1 tensor([[0.4043],
-        [0.4668],
-        [0.4414],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4941, 0.4707, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:39,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 14:44:39,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.81 | bwd_microstep: 4620.24 | bwd_inner_microstep: 4615.44 | bwd_allreduce_microstep: 4.70 | step_microstep: 46.54
-[2025-01-25 14:44:39,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.77 | bwd: 4620.27 | bwd_inner: 4615.44 | bwd_allreduce: 4.75 | step: 46.54
- 36%|███▋      | 2116/5800 [5:58:09<7:04:19,  6.91s/it]                                                       {'loss': 0.0479, 'grad_norm': 8.22128677368164, 'learning_rate': 2.934842863559011e-05, 'epoch': 18.24}
- 36%|███▋      | 2116/5800 [5:58:09<7:04:19,  6.91s/it]score1 tensor([[0.5977],
-        [0.4492],
-        [0.6250],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5195, 0.6172, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:46,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:44:46,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4621.89 | bwd_inner_microstep: 4616.40 | bwd_allreduce_microstep: 5.40 | step_microstep: 43.59
-[2025-01-25 14:44:46,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4621.92 | bwd_inner: 4616.40 | bwd_allreduce: 5.45 | step: 43.59
- 36%|███▋      | 2117/5800 [5:58:16<7:04:00,  6.91s/it]                                                       {'loss': 0.0283, 'grad_norm': 1.131232500076294, 'learning_rate': 2.9338554172344813e-05, 'epoch': 18.25}
- 36%|███▋      | 2117/5800 [5:58:16<7:04:00,  6.91s/it]score1 tensor([[0.4805],
-        [0.5312],
-        [0.4180],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5547, 0.4785, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:44:53,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 14:44:53,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.85 | bwd_microstep: 4615.00 | bwd_inner_microstep: 4609.29 | bwd_allreduce_microstep: 5.50 | step_microstep: 47.39
-[2025-01-25 14:44:53,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.82 | bwd: 4615.06 | bwd_inner: 4609.29 | bwd_allreduce: 5.60 | step: 47.40
- 37%|███▋      | 2118/5800 [5:58:23<7:03:56,  6.91s/it]                                                       {'loss': 0.0444, 'grad_norm': 8.468426704406738, 'learning_rate': 2.932867679717629e-05, 'epoch': 18.26}
- 37%|███▋      | 2118/5800 [5:58:23<7:03:56,  6.91s/it]score1 tensor([[0.5547],
-        [0.3906],
-        [0.5312],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4746, 0.5273, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:00,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.51 | optimizer_step: 4.46
-[2025-01-25 14:45:00,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4618.48 | bwd_inner_microstep: 4609.93 | bwd_allreduce_microstep: 8.29 | step_microstep: 79.65
-[2025-01-25 14:45:00,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.10 | bwd: 4618.53 | bwd_inner: 4609.93 | bwd_allreduce: 8.43 | step: 79.65
- 37%|███▋      | 2119/5800 [5:58:30<7:04:47,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 0.7342262268066406, 'learning_rate': 2.9318796513164476e-05, 'epoch': 18.27}
- 37%|███▋      | 2119/5800 [5:58:30<7:04:47,  6.92s/it]score1 tensor([[0.5898],
-        [0.3652],
-        [0.5664],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.3652, 0.6328, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:07,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 14:45:07,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.60 | bwd_microstep: 4574.45 | bwd_inner_microstep: 4569.15 | bwd_allreduce_microstep: 5.21 | step_microstep: 46.88
-[2025-01-25 14:45:07,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.57 | bwd: 4574.47 | bwd_inner: 4569.15 | bwd_allreduce: 5.26 | step: 46.89
- 37%|███▋      | 2120/5800 [5:58:37<7:03:41,  6.91s/it]                                                       {'loss': 0.0244, 'grad_norm': 2.8762192726135254, 'learning_rate': 2.930891332339021e-05, 'epoch': 18.28}
- 37%|███▋      | 2120/5800 [5:58:37<7:03:41,  6.91s/it]score1 tensor([[0.3828],
-        [0.4043],
-        [0.4199],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.4004, 0.4180, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:14,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:45:14,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4624.66 | bwd_inner_microstep: 4619.61 | bwd_allreduce_microstep: 4.94 | step_microstep: 48.28
-[2025-01-25 14:45:14,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.63 | bwd: 4624.69 | bwd_inner: 4619.61 | bwd_allreduce: 5.01 | step: 48.29
- 37%|███▋      | 2121/5800 [5:58:44<7:03:38,  6.91s/it]                                                       {'loss': 0.042, 'grad_norm': 3.732563018798828, 'learning_rate': 2.929902723093525e-05, 'epoch': 18.28}
- 37%|███▋      | 2121/5800 [5:58:44<7:03:38,  6.91s/it]score1 tensor([[0.5469],
-        [0.4062],
-        [0.4102],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.3984, 0.3926, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:21,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:45:21,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.43 | bwd_microstep: 4622.99 | bwd_inner_microstep: 4618.23 | bwd_allreduce_microstep: 4.67 | step_microstep: 47.80
-[2025-01-25 14:45:21,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.39 | bwd: 4623.02 | bwd_inner: 4618.23 | bwd_allreduce: 4.72 | step: 47.81
- 37%|███▋      | 2122/5800 [5:58:51<7:03:29,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 1.1202125549316406, 'learning_rate': 2.9289138238882245e-05, 'epoch': 18.29}
- 37%|███▋      | 2122/5800 [5:58:51<7:03:29,  6.91s/it]score1 tensor([[0.5898],
-        [0.4570],
-        [0.4922],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4785, 0.5078, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:28,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:45:28,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.11 | bwd_microstep: 4627.43 | bwd_inner_microstep: 4622.60 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.38
-[2025-01-25 14:45:28,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.07 | bwd: 4627.46 | bwd_inner: 4622.60 | bwd_allreduce: 4.79 | step: 42.40
- 37%|███▋      | 2123/5800 [5:58:58<7:03:24,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 8.391228675842285, 'learning_rate': 2.9279246350314756e-05, 'epoch': 18.3}
- 37%|███▋      | 2123/5800 [5:58:58<7:03:24,  6.91s/it]score1 tensor([[0.5195],
-        [0.4785],
-        [0.5156],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5156, 0.5703, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:35,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 14:45:35,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.77 | bwd_microstep: 4623.73 | bwd_inner_microstep: 4619.01 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.78
-[2025-01-25 14:45:35,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.71 | bwd: 4623.75 | bwd_inner: 4619.01 | bwd_allreduce: 4.67 | step: 41.78
- 37%|███▋      | 2124/5800 [5:59:05<7:03:10,  6.91s/it]                                                       {'loss': 0.0332, 'grad_norm': 4.344761848449707, 'learning_rate': 2.926935156831725e-05, 'epoch': 18.31}
- 37%|███▋      | 2124/5800 [5:59:05<7:03:10,  6.91s/it]score1 tensor([[0.5039],
-        [0.5391],
-        [0.5352],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6484, 0.5977, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:42,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 14:45:42,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.62 | bwd_microstep: 4624.73 | bwd_inner_microstep: 4619.39 | bwd_allreduce_microstep: 5.25 | step_microstep: 44.07
-[2025-01-25 14:45:42,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.59 | bwd: 4624.75 | bwd_inner: 4619.39 | bwd_allreduce: 5.29 | step: 44.07
- 37%|███▋      | 2125/5800 [5:59:12<7:03:02,  6.91s/it]                                                       {'loss': 0.0532, 'grad_norm': 0.6310330033302307, 'learning_rate': 2.925945389597508e-05, 'epoch': 18.32}
- 37%|███▋      | 2125/5800 [5:59:12<7:03:02,  6.91s/it]score1 tensor([[0.5430],
-        [0.5430],
-        [0.4375],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5469, 0.4121, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:49,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 14:45:49,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.66 | bwd_microstep: 4624.49 | bwd_inner_microstep: 4619.62 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.53
-[2025-01-25 14:45:49,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.63 | bwd: 4624.51 | bwd_inner: 4619.62 | bwd_allreduce: 4.82 | step: 43.54
- 37%|███▋      | 2126/5800 [5:59:19<7:02:55,  6.91s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.309808254241943, 'learning_rate': 2.9249553336374527e-05, 'epoch': 18.33}
- 37%|███▋      | 2126/5800 [5:59:19<7:02:55,  6.91s/it]score1 tensor([[0.6250],
-        [0.6016],
-        [0.4141],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5664, 0.4141, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:45:55,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 14:45:55,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.93 | bwd_microstep: 4581.86 | bwd_inner_microstep: 4576.69 | bwd_allreduce_microstep: 5.08 | step_microstep: 44.03
-[2025-01-25 14:45:55,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.88 | bwd: 4581.88 | bwd_inner: 4576.69 | bwd_allreduce: 5.12 | step: 44.04
- 37%|███▋      | 2127/5800 [5:59:25<7:02:07,  6.90s/it]                                                       {'loss': 0.0186, 'grad_norm': 2.1198039054870605, 'learning_rate': 2.9239649892602752e-05, 'epoch': 18.34}
- 37%|███▋      | 2127/5800 [5:59:25<7:02:07,  6.90s/it]score1 tensor([[0.4785],
-        [0.5508],
-        [0.5977],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5781, 0.6211, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:02,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.89 | optimizer_step: 4.36
-[2025-01-25 14:46:02,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4619.90 | bwd_inner_microstep: 4611.29 | bwd_allreduce_microstep: 8.39 | step_microstep: 71.05
-[2025-01-25 14:46:02,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.16 | bwd: 4619.95 | bwd_inner: 4611.29 | bwd_allreduce: 8.50 | step: 71.07
- 37%|███▋      | 2128/5800 [5:59:32<7:03:07,  6.91s/it]                                                       {'loss': 0.0273, 'grad_norm': 0.6782734394073486, 'learning_rate': 2.9229743567747814e-05, 'epoch': 18.34}
- 37%|███▋      | 2128/5800 [5:59:32<7:03:07,  6.91s/it]score1 tensor([[0.5156],
-        [0.5742],
-        [0.5625],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5898, 0.5391, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:09,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 14:46:09,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.90 | bwd_microstep: 4612.75 | bwd_inner_microstep: 4607.28 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.51
-[2025-01-25 14:46:09,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.86 | bwd: 4612.78 | bwd_inner: 4607.28 | bwd_allreduce: 5.42 | step: 44.52
- 37%|███▋      | 2129/5800 [5:59:39<7:02:44,  6.91s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.518746852874756, 'learning_rate': 2.921983436489868e-05, 'epoch': 18.35}
- 37%|███▋      | 2129/5800 [5:59:39<7:02:44,  6.91s/it]score1 tensor([[0.5820],
-        [0.4551],
-        [0.4551],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4551, 0.4180, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:16,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 14:46:16,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.14 | bwd_microstep: 4579.97 | bwd_inner_microstep: 4574.93 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.85
-[2025-01-25 14:46:16,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.10 | bwd: 4579.99 | bwd_inner: 4574.93 | bwd_allreduce: 5.00 | step: 43.86
- 37%|███▋      | 2130/5800 [5:59:46<7:01:51,  6.90s/it]                                                       {'loss': 0.0308, 'grad_norm': 2.0000038146972656, 'learning_rate': 2.9209922287145212e-05, 'epoch': 18.36}
- 37%|███▋      | 2130/5800 [5:59:46<7:01:51,  6.90s/it]score1 tensor([[0.5820],
-        [0.5547],
-        [0.4766],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5664, 0.5156, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:23,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 14:46:23,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4624.55 | bwd_inner_microstep: 4618.95 | bwd_allreduce_microstep: 5.51 | step_microstep: 41.48
-[2025-01-25 14:46:23,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.30 | bwd: 4624.57 | bwd_inner: 4618.95 | bwd_allreduce: 5.55 | step: 41.49
- 37%|███▋      | 2131/5800 [5:59:53<7:01:56,  6.90s/it]                                                       {'loss': 0.0366, 'grad_norm': 0.5996444821357727, 'learning_rate': 2.9200007337578172e-05, 'epoch': 18.37}
- 37%|███▋      | 2131/5800 [5:59:53<7:01:56,  6.90s/it]score1 tensor([[0.5234],
-        [0.5781],
-        [0.4570],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5508, 0.4941, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:30,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 14:46:30,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.34 | bwd_microstep: 4625.29 | bwd_inner_microstep: 4620.00 | bwd_allreduce_microstep: 5.20 | step_microstep: 43.11
-[2025-01-25 14:46:30,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.29 | bwd: 4625.32 | bwd_inner: 4620.00 | bwd_allreduce: 5.24 | step: 43.11
- 37%|███▋      | 2132/5800 [6:00:00<7:01:59,  6.90s/it]                                                       {'loss': 0.0664, 'grad_norm': 3.873702049255371, 'learning_rate': 2.9190089519289212e-05, 'epoch': 18.38}
- 37%|███▋      | 2132/5800 [6:00:00<7:01:59,  6.90s/it]score1 tensor([[0.4043],
-        [0.5391],
-        [0.4961],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5703, 0.5273, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:37,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:46:37,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4622.74 | bwd_inner_microstep: 4617.94 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.86
-[2025-01-25 14:46:37,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4622.77 | bwd_inner: 4617.94 | bwd_allreduce: 4.76 | step: 42.86
- 37%|███▋      | 2133/5800 [6:00:07<7:01:47,  6.90s/it]                                                       {'loss': 0.0479, 'grad_norm': 4.333792209625244, 'learning_rate': 2.9180168835370874e-05, 'epoch': 18.39}
- 37%|███▋      | 2133/5800 [6:00:07<7:01:47,  6.90s/it]score1 tensor([[0.3496],
-        [0.3789],
-        [0.5039],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4297, 0.4824, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:44,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:46:44,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.71 | bwd_microstep: 4621.63 | bwd_inner_microstep: 4616.48 | bwd_allreduce_microstep: 5.04 | step_microstep: 46.37
-[2025-01-25 14:46:44,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.66 | bwd: 4621.66 | bwd_inner: 4616.48 | bwd_allreduce: 5.10 | step: 46.38
- 37%|███▋      | 2134/5800 [6:00:14<7:01:41,  6.90s/it]                                                       {'loss': 0.0469, 'grad_norm': 1.216817021369934, 'learning_rate': 2.9170245288916606e-05, 'epoch': 18.4}
- 37%|███▋      | 2134/5800 [6:00:14<7:01:41,  6.90s/it]score1 tensor([[0.4570],
-        [0.4688],
-        [0.4980],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5195, 0.4941, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:51,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.28 | optimizer_step: 4.37
-[2025-01-25 14:46:51,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.67 | bwd_microstep: 4616.93 | bwd_inner_microstep: 4611.81 | bwd_allreduce_microstep: 5.03 | step_microstep: 46.42
-[2025-01-25 14:46:51,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.62 | bwd: 4616.95 | bwd_inner: 4611.81 | bwd_allreduce: 5.07 | step: 46.43
- 37%|███▋      | 2135/5800 [6:00:21<7:01:51,  6.91s/it]                                                       {'loss': 0.0435, 'grad_norm': 4.375899314880371, 'learning_rate': 2.9160318883020736e-05, 'epoch': 18.41}
- 37%|███▋      | 2135/5800 [6:00:21<7:01:51,  6.91s/it]score1 tensor([[0.5391],
-        [0.4531],
-        [0.4805],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4629, 0.4941, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:46:58,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 14:46:58,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.62 | bwd_microstep: 4627.12 | bwd_inner_microstep: 4619.28 | bwd_allreduce_microstep: 7.64 | step_microstep: 78.20
-[2025-01-25 14:46:58,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.59 | bwd: 4627.17 | bwd_inner: 4619.28 | bwd_allreduce: 7.76 | step: 78.18
- 37%|███▋      | 2136/5800 [6:00:28<7:03:04,  6.93s/it]                                                       {'loss': 0.0166, 'grad_norm': 4.215536594390869, 'learning_rate': 2.9150389620778495e-05, 'epoch': 18.41}
- 37%|███▋      | 2136/5800 [6:00:28<7:03:04,  6.93s/it]score1 tensor([[0.6055],
-        [0.5234],
-        [0.5312],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5117, 0.5430, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:05,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:47:05,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.57 | bwd_microstep: 4619.45 | bwd_inner_microstep: 4612.05 | bwd_allreduce_microstep: 7.32 | step_microstep: 43.45
-[2025-01-25 14:47:05,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.46 | bwd: 4619.47 | bwd_inner: 4612.05 | bwd_allreduce: 7.36 | step: 43.46
- 37%|███▋      | 2137/5800 [6:00:35<7:02:50,  6.93s/it]                                                       {'loss': 0.022, 'grad_norm': 0.8985069990158081, 'learning_rate': 2.9140457505285998e-05, 'epoch': 18.42}
- 37%|███▋      | 2137/5800 [6:00:35<7:02:50,  6.93s/it]score1 tensor([[0.4766],
-        [0.4590],
-        [0.5898],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4473, 0.4961, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:12,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 14:47:12,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.79 | bwd_microstep: 4616.84 | bwd_inner_microstep: 4611.59 | bwd_allreduce_microstep: 5.17 | step_microstep: 50.41
-[2025-01-25 14:47:12,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.75 | bwd: 4616.87 | bwd_inner: 4611.59 | bwd_allreduce: 5.21 | step: 50.42
- 37%|███▋      | 2138/5800 [6:00:41<7:02:19,  6.92s/it]                                                       {'loss': 0.0381, 'grad_norm': 8.653103828430176, 'learning_rate': 2.9130522539640247e-05, 'epoch': 18.43}
- 37%|███▋      | 2138/5800 [6:00:41<7:02:19,  6.92s/it]score1 tensor([[0.4375],
-        [0.5742],
-        [0.4199],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4922, 0.3750, 0.6836], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:18,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 14:47:18,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.27 | bwd_microstep: 4619.24 | bwd_inner_microstep: 4614.27 | bwd_allreduce_microstep: 4.89 | step_microstep: 41.15
-[2025-01-25 14:47:18,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.24 | bwd: 4619.26 | bwd_inner: 4614.27 | bwd_allreduce: 4.93 | step: 41.15
- 37%|███▋      | 2139/5800 [6:00:48<7:01:45,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 0.8067142367362976, 'learning_rate': 2.9120584726939145e-05, 'epoch': 18.44}
- 37%|��██▋      | 2139/5800 [6:00:48<7:01:45,  6.91s/it]score1 tensor([[0.6367],
-        [0.6094],
-        [0.5938],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.6250, 0.6562, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:25,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:47:25,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.81 | bwd_microstep: 4620.03 | bwd_inner_microstep: 4614.28 | bwd_allreduce_microstep: 5.62 | step_microstep: 72.20
-[2025-01-25 14:47:25,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.78 | bwd: 4620.05 | bwd_inner: 4614.28 | bwd_allreduce: 5.70 | step: 72.22
- 37%|███▋      | 2140/5800 [6:00:55<7:02:02,  6.92s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.53428316116333, 'learning_rate': 2.911064407028147e-05, 'epoch': 18.45}
- 37%|███▋      | 2140/5800 [6:00:55<7:02:02,  6.92s/it]score1 tensor([[0.5742],
-        [0.5938],
-        [0.4648],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5039, 0.4219, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:32,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.00 | optimizer_step: 4.36
-[2025-01-25 14:47:32,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.88 | bwd_microstep: 4624.83 | bwd_inner_microstep: 4619.78 | bwd_allreduce_microstep: 4.95 | step_microstep: 52.61
-[2025-01-25 14:47:32,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.84 | bwd: 4624.85 | bwd_inner: 4619.78 | bwd_allreduce: 5.00 | step: 52.62
- 37%|███▋      | 2141/5800 [6:01:02<7:02:08,  6.92s/it]                                                       {'loss': 0.0532, 'grad_norm': 8.809808731079102, 'learning_rate': 2.9100700572766888e-05, 'epoch': 18.46}
- 37%|███▋      | 2141/5800 [6:01:02<7:02:08,  6.92s/it]score1 tensor([[0.5312],
-        [0.6367],
-        [0.5039],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.6641, 0.4844, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:39,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.28 | optimizer_step: 4.36
-[2025-01-25 14:47:39,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.11 | bwd_microstep: 4625.05 | bwd_inner_microstep: 4619.88 | bwd_allreduce_microstep: 5.09 | step_microstep: 53.67
-[2025-01-25 14:47:39,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.01 | bwd: 4625.08 | bwd_inner: 4619.88 | bwd_allreduce: 5.13 | step: 53.68
- 37%|███▋      | 2142/5800 [6:01:09<7:02:08,  6.92s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.243535041809082, 'learning_rate': 2.9090754237495953e-05, 'epoch': 18.47}
- 37%|███▋      | 2142/5800 [6:01:09<7:02:08,  6.92s/it]score1 tensor([[0.4219],
-        [0.6328],
-        [0.4746],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.5625, 0.4805, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:46,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:47:46,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.20 | bwd_microstep: 4625.02 | bwd_inner_microstep: 4619.96 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.21
-[2025-01-25 14:47:46,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.17 | bwd: 4625.05 | bwd_inner: 4619.96 | bwd_allreduce: 5.00 | step: 42.21
- 37%|███▋      | 2143/5800 [6:01:16<7:01:44,  6.92s/it]                                                       {'loss': 0.0503, 'grad_norm': 0.8775203824043274, 'learning_rate': 2.9080805067570113e-05, 'epoch': 18.47}
- 37%|███▋      | 2143/5800 [6:01:16<7:01:44,  6.92s/it]score1 tensor([[0.6992],
-        [0.6289],
-        [0.6172],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6094, 0.6602, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:47:53,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:47:53,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.82 | bwd_microstep: 4617.49 | bwd_inner_microstep: 4612.43 | bwd_allreduce_microstep: 4.97 | step_microstep: 42.65
-[2025-01-25 14:47:53,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.79 | bwd: 4617.51 | bwd_inner: 4612.43 | bwd_allreduce: 5.02 | step: 42.66
- 37%|███▋      | 2144/5800 [6:01:23<7:01:09,  6.91s/it]                                                       {'loss': 0.04, 'grad_norm': 1.0131632089614868, 'learning_rate': 2.9070853066091685e-05, 'epoch': 18.48}
- 37%|███▋      | 2144/5800 [6:01:23<7:01:09,  6.91s/it]score1 tensor([[0.4062],
-        [0.4102],
-        [0.6797],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4043, 0.6211, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:00,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:48:00,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.46 | bwd_microstep: 4621.03 | bwd_inner_microstep: 4616.13 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.70
-[2025-01-25 14:48:00,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.42 | bwd: 4621.05 | bwd_inner: 4616.13 | bwd_allreduce: 4.85 | step: 42.71
- 37%|███▋      | 2145/5800 [6:01:30<7:00:51,  6.91s/it]                                                       {'loss': 0.041, 'grad_norm': 1.0440598726272583, 'learning_rate': 2.9060898236163874e-05, 'epoch': 18.49}
- 37%|███▋      | 2145/5800 [6:01:30<7:00:51,  6.91s/it]score1 tensor([[0.5156],
-        [0.5820],
-        [0.6016],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6172, 0.6133, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:07,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 14:48:07,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.43 | bwd_microstep: 4615.77 | bwd_inner_microstep: 4611.14 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.97
-[2025-01-25 14:48:07,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.39 | bwd: 4615.79 | bwd_inner: 4611.14 | bwd_allreduce: 4.59 | step: 41.96
- 37%|███▋      | 2146/5800 [6:01:37<7:00:31,  6.91s/it]                                                       {'loss': 0.0249, 'grad_norm': 8.970314979553223, 'learning_rate': 2.9050940580890783e-05, 'epoch': 18.5}
- 37%|███▋      | 2146/5800 [6:01:37<7:00:31,  6.91s/it]score1 tensor([[0.5508],
-        [0.5586],
-        [0.5273],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6133, 0.5508, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:14,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:48:14,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.96 | bwd_microstep: 4623.96 | bwd_inner_microstep: 4619.12 | bwd_allreduce_microstep: 4.77 | step_microstep: 40.99
-[2025-01-25 14:48:14,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.93 | bwd: 4623.98 | bwd_inner: 4619.12 | bwd_allreduce: 4.80 | step: 41.00
- 37%|███▋      | 2147/5800 [6:01:44<7:00:19,  6.90s/it]                                                       {'loss': 0.0342, 'grad_norm': 0.7850308418273926, 'learning_rate': 2.9040980103377367e-05, 'epoch': 18.51}
- 37%|███▋      | 2147/5800 [6:01:44<7:00:19,  6.90s/it]score1 tensor([[0.6133],
-        [0.6523],
-        [0.4062],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6875, 0.4453, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:21,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 14:48:21,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.17 | bwd_microstep: 4613.68 | bwd_inner_microstep: 4608.91 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.37
-[2025-01-25 14:48:21,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.11 | bwd: 4613.70 | bwd_inner: 4608.91 | bwd_allreduce: 4.72 | step: 42.37
- 37%|███▋      | 2148/5800 [6:01:51<7:00:08,  6.90s/it]                                                       {'loss': 0.0366, 'grad_norm': 0.6839064955711365, 'learning_rate': 2.9031016806729474e-05, 'epoch': 18.52}
- 37%|███▋      | 2148/5800 [6:01:51<7:00:08,  6.90s/it]score1 tensor([[0.6602],
-        [0.4707],
-        [0.4219],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.6094, 0.4980, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:28,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.50 | optimizer_step: 4.37
-[2025-01-25 14:48:28,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.34 | bwd_microstep: 4620.10 | bwd_inner_microstep: 4610.76 | bwd_allreduce_microstep: 9.26 | step_microstep: 50.22
-[2025-01-25 14:48:28,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.31 | bwd: 4620.12 | bwd_inner: 4610.76 | bwd_allreduce: 9.30 | step: 50.22
- 37%|███▋      | 2149/5800 [6:01:57<7:00:12,  6.91s/it]                                                       {'loss': 0.0781, 'grad_norm': 8.654504776000977, 'learning_rate': 2.9021050694053846e-05, 'epoch': 18.53}
- 37%|███▋      | 2149/5800 [6:01:57<7:00:12,  6.91s/it]score1 tensor([[0.4082],
-        [0.4219],
-        [0.4180],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.3652, 0.4199, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:34,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.68 | optimizer_step: 4.36
-[2025-01-25 14:48:34,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.89 | bwd_microstep: 4617.75 | bwd_inner_microstep: 4610.08 | bwd_allreduce_microstep: 7.49 | step_microstep: 72.97
-[2025-01-25 14:48:34,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.83 | bwd: 4617.81 | bwd_inner: 4610.08 | bwd_allreduce: 7.58 | step: 72.94
- 37%|███▋      | 2150/5800 [6:02:04<7:01:16,  6.93s/it]                                                       {'loss': 0.0347, 'grad_norm': 3.9785666465759277, 'learning_rate': 2.9011081768458077e-05, 'epoch': 18.53}
- 37%|███▋      | 2150/5800 [6:02:04<7:01:16,  6.93s/it]score1 tensor([[0.6094],
-        [0.4238],
-        [0.6250],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4160, 0.6367, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:41,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:48:41,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.90 | bwd_microstep: 4617.20 | bwd_inner_microstep: 4612.28 | bwd_allreduce_microstep: 4.81 | step_microstep: 54.17
-[2025-01-25 14:48:41,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.78 | bwd: 4617.22 | bwd_inner: 4612.28 | bwd_allreduce: 4.86 | step: 54.18
- 37%|███▋      | 2151/5800 [6:02:11<7:01:17,  6.93s/it]                                                       {'loss': 0.0127, 'grad_norm': 1.2499873638153076, 'learning_rate': 2.900111003305066e-05, 'epoch': 18.54}
- 37%|███▋      | 2151/5800 [6:02:11<7:01:17,  6.93s/it]score1 tensor([[0.4883],
-        [0.4414],
-        [0.4434],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4863, 0.4551, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:48,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:48:48,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.06 | bwd_microstep: 4620.80 | bwd_inner_microstep: 4615.94 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.20
-[2025-01-25 14:48:48,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.02 | bwd: 4620.83 | bwd_inner: 4615.94 | bwd_allreduce: 4.82 | step: 41.20
- 37%|███▋      | 2152/5800 [6:02:18<7:00:32,  6.92s/it]                                                       {'loss': 0.019, 'grad_norm': 8.166888236999512, 'learning_rate': 2.899113549094095e-05, 'epoch': 18.55}
- 37%|███▋      | 2152/5800 [6:02:18<7:00:32,  6.92s/it]score1 tensor([[0.4746],
-        [0.5352],
-        [0.5195],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.6016, 0.5000, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:48:55,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 14:48:55,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.65 | bwd_microstep: 4621.42 | bwd_inner_microstep: 4616.67 | bwd_allreduce_microstep: 4.65 | step_microstep: 41.67
-[2025-01-25 14:48:55,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.62 | bwd: 4621.44 | bwd_inner: 4616.67 | bwd_allreduce: 4.70 | step: 41.68
- 37%|███▋      | 2153/5800 [6:02:25<7:00:06,  6.91s/it]                                                       {'loss': 0.0415, 'grad_norm': 0.5900768637657166, 'learning_rate': 2.8981158145239186e-05, 'epoch': 18.56}
- 37%|███▋      | 2153/5800 [6:02:25<7:00:06,  6.91s/it]score1 tensor([[0.3887],
-        [0.4863],
-        [0.5039],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.4727, 0.4570, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:02,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 14:49:02,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.73 | bwd_microstep: 4613.38 | bwd_inner_microstep: 4608.30 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.53
-[2025-01-25 14:49:02,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.69 | bwd: 4613.40 | bwd_inner: 4608.30 | bwd_allreduce: 5.02 | step: 42.53
- 37%|███▋      | 2154/5800 [6:02:32<6:59:43,  6.91s/it]                                                       {'loss': 0.0361, 'grad_norm': 3.9640135765075684, 'learning_rate': 2.8971177999056474e-05, 'epoch': 18.57}
- 37%|███▋      | 2154/5800 [6:02:32<6:59:43,  6.91s/it]score1 tensor([[0.4355],
-        [0.4668],
-        [0.5586],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4473, 0.6055, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:09,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:49:09,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.00 | bwd_microstep: 4621.48 | bwd_inner_microstep: 4616.78 | bwd_allreduce_microstep: 4.61 | step_microstep: 41.61
-[2025-01-25 14:49:09,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.96 | bwd: 4621.50 | bwd_inner: 4616.78 | bwd_allreduce: 4.65 | step: 41.60
- 37%|███▋      | 2155/5800 [6:02:39<6:59:36,  6.91s/it]                                                       {'loss': 0.0537, 'grad_norm': 3.9707789421081543, 'learning_rate': 2.89611950555048e-05, 'epoch': 18.58}
- 37%|███▋      | 2155/5800 [6:02:39<6:59:36,  6.91s/it]score1 tensor([[0.6367],
-        [0.4180],
-        [0.4824],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4844, 0.4141, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:16,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:49:16,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4614.63 | bwd_inner_microstep: 4609.88 | bwd_allreduce_microstep: 4.67 | step_microstep: 41.87
-[2025-01-25 14:49:16,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4614.65 | bwd_inner: 4609.88 | bwd_allreduce: 4.71 | step: 41.87
- 37%|███▋      | 2156/5800 [6:02:46<6:59:11,  6.90s/it]                                                       {'loss': 0.0542, 'grad_norm': 0.7306159138679504, 'learning_rate': 2.895120931769702e-05, 'epoch': 18.59}
- 37%|███▋      | 2156/5800 [6:02:46<6:59:11,  6.90s/it]score1 tensor([[0.6602],
-        [0.5977],
-        [0.5703],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6055, 0.6406, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:23,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 14:49:23,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.01 | bwd_microstep: 4616.20 | bwd_inner_microstep: 4611.19 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.30
-[2025-01-25 14:49:23,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.96 | bwd: 4616.22 | bwd_inner: 4611.18 | bwd_allreduce: 4.97 | step: 42.32
- 37%|███▋      | 2157/5800 [6:02:53<6:58:56,  6.90s/it]                                                       {'loss': 0.0391, 'grad_norm': 0.6134146451950073, 'learning_rate': 2.894122078874685e-05, 'epoch': 18.59}
- 37%|███▋      | 2157/5800 [6:02:53<6:58:56,  6.90s/it]score1 tensor([[0.4922],
-        [0.4590],
-        [0.4727],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4512, 0.4844, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:30,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 14:49:30,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.74 | bwd_microstep: 4619.99 | bwd_inner_microstep: 4614.62 | bwd_allreduce_microstep: 5.24 | step_microstep: 44.96
-[2025-01-25 14:49:30,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.70 | bwd: 4620.02 | bwd_inner: 4614.62 | bwd_allreduce: 5.31 | step: 44.95
- 37%|███▋      | 2158/5800 [6:03:00<6:59:07,  6.90s/it]                                                       {'loss': 0.0059, 'grad_norm': 0.5902155637741089, 'learning_rate': 2.89312294717689e-05, 'epoch': 18.6}
- 37%|███▋      | 2158/5800 [6:03:00<6:59:07,  6.90s/it]score1 tensor([[0.6445],
-        [0.5781],
-        [0.5273],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.6445, 0.5312, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:37,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 14:49:37,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.29 | bwd_microstep: 4617.47 | bwd_inner_microstep: 4609.50 | bwd_allreduce_microstep: 7.76 | step_microstep: 44.51
-[2025-01-25 14:49:37,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.25 | bwd: 4617.52 | bwd_inner: 4609.50 | bwd_allreduce: 7.87 | step: 44.50
- 37%|███▋      | 2159/5800 [6:03:07<6:58:58,  6.90s/it]                                                       {'loss': 0.0322, 'grad_norm': 9.353821754455566, 'learning_rate': 2.8921235369878637e-05, 'epoch': 18.61}
- 37%|███▋      | 2159/5800 [6:03:07<6:58:58,  6.90s/it]score1 tensor([[0.5273],
-        [0.4180],
-        [0.5195],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4199, 0.5664, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:44,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:49:44,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.76 | bwd_microstep: 4617.20 | bwd_inner_microstep: 4610.11 | bwd_allreduce_microstep: 6.94 | step_microstep: 47.50
-[2025-01-25 14:49:44,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.66 | bwd: 4617.24 | bwd_inner: 4610.11 | bwd_allreduce: 7.02 | step: 47.48
- 37%|███▋      | 2160/5800 [6:03:14<6:59:07,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 8.414176940917969, 'learning_rate': 2.891123848619238e-05, 'epoch': 18.62}
- 37%|███▋      | 2160/5800 [6:03:14<6:59:07,  6.91s/it]score1 tensor([[0.3828],
-        [0.4102],
-        [0.4219],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4043, 0.4551, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:50,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:49:50,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.69 | bwd_microstep: 4619.04 | bwd_inner_microstep: 4613.79 | bwd_allreduce_microstep: 5.16 | step_microstep: 43.01
-[2025-01-25 14:49:50,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.66 | bwd: 4619.06 | bwd_inner: 4613.79 | bwd_allreduce: 5.20 | step: 43.02
- 37%|███▋      | 2161/5800 [6:03:20<6:58:48,  6.91s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.036696910858154, 'learning_rate': 2.8901238823827342e-05, 'epoch': 18.63}
- 37%|███▋      | 2161/5800 [6:03:20<6:58:48,  6.91s/it]score1 tensor([[0.6836],
-        [0.3848],
-        [0.5625],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5391, 0.6094, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:49:57,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 14:49:57,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.22 | bwd_microstep: 4623.16 | bwd_inner_microstep: 4618.30 | bwd_allreduce_microstep: 4.76 | step_microstep: 41.57
-[2025-01-25 14:49:57,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.18 | bwd: 4623.22 | bwd_inner: 4618.30 | bwd_allreduce: 4.81 | step: 41.58
- 37%|███▋      | 2162/5800 [6:03:27<6:58:40,  6.90s/it]                                                       {'loss': 0.0698, 'grad_norm': 4.1636128425598145, 'learning_rate': 2.8891236385901584e-05, 'epoch': 18.64}
- 37%|███▋      | 2162/5800 [6:03:27<6:58:40,  6.90s/it]score1 tensor([[0.5273],
-        [0.6484],
-        [0.6094],
-        [0.3613]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.6484, 0.5781, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:04,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 14:50:04,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.99 | bwd_microstep: 4574.03 | bwd_inner_microstep: 4569.20 | bwd_allreduce_microstep: 4.72 | step_microstep: 41.24
-[2025-01-25 14:50:04,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4574.05 | bwd_inner: 4569.20 | bwd_allreduce: 4.78 | step: 41.25
- 37%|███▋      | 2163/5800 [6:03:34<6:57:41,  6.89s/it]                                                       {'loss': 0.0391, 'grad_norm': 2.0711748600006104, 'learning_rate': 2.8881231175534048e-05, 'epoch': 18.65}
- 37%|███▋      | 2163/5800 [6:03:34<6:57:41,  6.89s/it]score1 tensor([[0.6211],
-        [0.5977],
-        [0.5352],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.5508, 0.4980, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:11,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:50:11,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.71 | bwd_microstep: 4619.35 | bwd_inner_microstep: 4614.34 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.15
-[2025-01-25 14:50:11,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.68 | bwd: 4619.38 | bwd_inner: 4614.35 | bwd_allreduce: 4.96 | step: 43.15
- 37%|███▋      | 2164/5800 [6:03:41<6:57:34,  6.89s/it]                                                       {'loss': 0.0527, 'grad_norm': 9.385016441345215, 'learning_rate': 2.8871223195844533e-05, 'epoch': 18.66}
- 37%|███▋      | 2164/5800 [6:03:41<6:57:34,  6.89s/it]score1 tensor([[0.4746],
-        [0.3770],
-        [0.6211],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4258, 0.5664, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:18,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 14:50:18,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.01 | bwd_microstep: 4614.31 | bwd_inner_microstep: 4609.28 | bwd_allreduce_microstep: 4.91 | step_microstep: 42.65
-[2025-01-25 14:50:18,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.96 | bwd: 4614.33 | bwd_inner: 4609.28 | bwd_allreduce: 4.97 | step: 42.65
- 37%|███▋      | 2165/5800 [6:03:48<6:57:36,  6.89s/it]                                                       {'loss': 0.0425, 'grad_norm': 0.9222514033317566, 'learning_rate': 2.8861212449953694e-05, 'epoch': 18.66}
- 37%|███▋      | 2165/5800 [6:03:48<6:57:36,  6.89s/it]score1 tensor([[0.5312],
-        [0.6328],
-        [0.4609],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6797, 0.3945, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:25,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:50:25,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.22 | bwd_microstep: 4612.83 | bwd_inner_microstep: 4608.21 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.09
-[2025-01-25 14:50:25,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.18 | bwd: 4612.85 | bwd_inner: 4608.21 | bwd_allreduce: 4.57 | step: 41.10
- 37%|███▋      | 2166/5800 [6:03:55<6:57:22,  6.89s/it]                                                       {'loss': 0.0459, 'grad_norm': 0.7871248722076416, 'learning_rate': 2.8851198940983054e-05, 'epoch': 18.67}
- 37%|███▋      | 2166/5800 [6:03:55<6:57:22,  6.89s/it]score1 tensor([[0.5391],
-        [0.3145],
-        [0.4082],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.3086, 0.3906, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:32,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:50:32,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.13 | bwd_microstep: 4622.43 | bwd_inner_microstep: 4617.75 | bwd_allreduce_microstep: 4.61 | step_microstep: 41.14
-[2025-01-25 14:50:32,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.09 | bwd: 4622.46 | bwd_inner: 4617.75 | bwd_allreduce: 4.64 | step: 41.15
- 37%|███▋      | 2167/5800 [6:04:02<6:57:30,  6.90s/it]                                                       {'loss': 0.0513, 'grad_norm': 3.7362287044525146, 'learning_rate': 2.8841182672055007e-05, 'epoch': 18.68}
- 37%|███▋      | 2167/5800 [6:04:02<6:57:30,  6.90s/it]score1 tensor([[0.5508],
-        [0.4863],
-        [0.3867],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4961, 0.4180, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:39,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:50:39,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.53 | bwd_microstep: 4616.74 | bwd_inner_microstep: 4611.38 | bwd_allreduce_microstep: 5.18 | step_microstep: 54.26
-[2025-01-25 14:50:39,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.50 | bwd: 4616.76 | bwd_inner: 4611.38 | bwd_allreduce: 5.31 | step: 54.28
- 37%|███▋      | 2168/5800 [6:04:09<6:57:34,  6.90s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.6258161067962646, 'learning_rate': 2.883116364629279e-05, 'epoch': 18.69}
- 37%|███▋      | 2168/5800 [6:04:09<6:57:34,  6.90s/it]score1 tensor([[0.4590],
-        [0.5742],
-        [0.4121],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5625, 0.4121, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:46,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.89 | optimizer_step: 4.36
-[2025-01-25 14:50:46,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.92 | bwd_microstep: 4579.96 | bwd_inner_microstep: 4572.75 | bwd_allreduce_microstep: 7.02 | step_microstep: 76.77
-[2025-01-25 14:50:46,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.89 | bwd: 4580.02 | bwd_inner: 4572.75 | bwd_allreduce: 7.12 | step: 76.75
- 37%|███▋      | 2169/5800 [6:04:16<6:57:34,  6.90s/it]                                                       {'loss': 0.0229, 'grad_norm': 2.325784921646118, 'learning_rate': 2.8821141866820518e-05, 'epoch': 18.7}
- 37%|███▋      | 2169/5800 [6:04:16<6:57:34,  6.90s/it]score1 tensor([[0.4199],
-        [0.4824],
-        [0.4414],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5273, 0.5117, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:52,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:50:52,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.96 | bwd_microstep: 4625.02 | bwd_inner_microstep: 4620.28 | bwd_allreduce_microstep: 4.66 | step_microstep: 40.50
-[2025-01-25 14:50:52,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.90 | bwd: 4625.04 | bwd_inner: 4620.28 | bwd_allreduce: 4.70 | step: 40.52
- 37%|███▋      | 2170/5800 [6:04:22<6:57:44,  6.90s/it]                                                       {'loss': 0.0605, 'grad_norm': 8.17789363861084, 'learning_rate': 2.881111733676315e-05, 'epoch': 18.71}
- 37%|██��▋      | 2170/5800 [6:04:22<6:57:44,  6.90s/it]score1 tensor([[0.3203],
-        [0.4746],
-        [0.3789],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.5391, 0.4824, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0737, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:50:59,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.36 | optimizer_step: 4.36
-[2025-01-25 14:50:59,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.48 | bwd_microstep: 4626.42 | bwd_inner_microstep: 4621.57 | bwd_allreduce_microstep: 4.76 | step_microstep: 37.90
-[2025-01-25 14:50:59,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.44 | bwd: 4626.44 | bwd_inner: 4621.58 | bwd_allreduce: 4.80 | step: 37.91
- 37%|███▋      | 2171/5800 [6:04:29<6:57:23,  6.90s/it]                                                       {'loss': 0.0737, 'grad_norm': 3.7879035472869873, 'learning_rate': 2.8801090059246513e-05, 'epoch': 18.72}
- 37%|███▋      | 2171/5800 [6:04:29<6:57:23,  6.90s/it]score1 tensor([[0.4590],
-        [0.4668],
-        [0.6328],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5312, 0.5508, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0679, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:06,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 14:51:06,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.75 | bwd_microstep: 4614.62 | bwd_inner_microstep: 4609.46 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.32
-[2025-01-25 14:51:06,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.71 | bwd: 4614.65 | bwd_inner: 4609.46 | bwd_allreduce: 5.09 | step: 43.33
- 37%|███▋      | 2172/5800 [6:04:36<6:57:04,  6.90s/it]                                                       {'loss': 0.0679, 'grad_norm': 3.754185438156128, 'learning_rate': 2.879106003739728e-05, 'epoch': 18.72}
- 37%|███▋      | 2172/5800 [6:04:36<6:57:04,  6.90s/it]score1 tensor([[0.5000],
-        [0.4824],
-        [0.4141],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5195, 0.4141, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:13,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:51:13,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.11 | bwd_microstep: 4583.35 | bwd_inner_microstep: 4578.46 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.96
-[2025-01-25 14:51:13,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.09 | bwd: 4583.37 | bwd_inner: 4578.46 | bwd_allreduce: 4.84 | step: 42.98
- 37%|███▋      | 2173/5800 [6:04:43<6:56:21,  6.89s/it]                                                       {'loss': 0.0337, 'grad_norm': 2.072221279144287, 'learning_rate': 2.878102727434299e-05, 'epoch': 18.73}
- 37%|███▋      | 2173/5800 [6:04:43<6:56:21,  6.89s/it]score1 tensor([[0.5039],
-        [0.5547],
-        [0.4785],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6172, 0.5273, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:20,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:51:20,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.59 | bwd_microstep: 4626.97 | bwd_inner_microstep: 4621.96 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.29
-[2025-01-25 14:51:20,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4627.00 | bwd_inner: 4621.96 | bwd_allreduce: 4.96 | step: 41.29
- 37%|███▋      | 2174/5800 [6:04:50<6:56:32,  6.89s/it]                                                       {'loss': 0.0547, 'grad_norm': 4.15493631362915, 'learning_rate': 2.8770991773212032e-05, 'epoch': 18.74}
- 37%|███▋      | 2174/5800 [6:04:50<6:56:32,  6.89s/it]score1 tensor([[0.4375],
-        [0.4219],
-        [0.4570],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4414, 0.6055, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:27,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:51:27,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.00 | bwd_microstep: 4627.41 | bwd_inner_microstep: 4621.78 | bwd_allreduce_microstep: 5.55 | step_microstep: 43.69
-[2025-01-25 14:51:27,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.96 | bwd: 4627.43 | bwd_inner: 4621.78 | bwd_allreduce: 5.59 | step: 43.69
- 38%|███▊      | 2175/5800 [6:04:57<6:56:34,  6.89s/it]                                                       {'loss': 0.0493, 'grad_norm': 8.32327938079834, 'learning_rate': 2.876095353713365e-05, 'epoch': 18.75}
- 38%|███▊      | 2175/5800 [6:04:57<6:56:34,  6.89s/it]score1 tensor([[0.5625],
-        [0.5234],
-        [0.4121],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5352, 0.4785, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:34,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 14:51:34,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4614.61 | bwd_inner_microstep: 4609.48 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.10
-[2025-01-25 14:51:34,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.54 | bwd: 4614.64 | bwd_inner: 4609.48 | bwd_allreduce: 5.07 | step: 43.11
- 38%|███▊      | 2176/5800 [6:05:04<6:56:17,  6.89s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.7599920034408569, 'learning_rate': 2.8750912569237937e-05, 'epoch': 18.76}
- 38%|███▊      | 2176/5800 [6:05:04<6:56:17,  6.89s/it]score1 tensor([[0.4258],
-        [0.4434],
-        [0.6094],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4629, 0.5703, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:41,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.94 | optimizer_step: 4.36
-[2025-01-25 14:51:41,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.83 | bwd_microstep: 4614.84 | bwd_inner_microstep: 4609.82 | bwd_allreduce_microstep: 4.92 | step_microstep: 64.55
-[2025-01-25 14:51:41,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.78 | bwd: 4614.87 | bwd_inner: 4609.82 | bwd_allreduce: 4.97 | step: 64.58
- 38%|███▊      | 2177/5800 [6:05:11<6:56:52,  6.90s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.508767127990723, 'learning_rate': 2.874086887265585e-05, 'epoch': 18.77}
- 38%|███▊      | 2177/5800 [6:05:11<6:56:52,  6.90s/it]score1 tensor([[0.4941],
-        [0.4824],
-        [0.5469],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4590, 0.5156, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:48,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 14:51:48,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.00 | bwd_microstep: 4625.98 | bwd_inner_microstep: 4618.09 | bwd_allreduce_microstep: 7.73 | step_microstep: 60.96
-[2025-01-25 14:51:48,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.96 | bwd: 4626.07 | bwd_inner: 4618.09 | bwd_allreduce: 7.80 | step: 60.95
- 38%|███▊      | 2178/5800 [6:05:18<6:57:04,  6.91s/it]                                                       {'loss': 0.0327, 'grad_norm': 4.354470729827881, 'learning_rate': 2.8730822450519172e-05, 'epoch': 18.78}
- 38%|███▊      | 2178/5800 [6:05:18<6:57:04,  6.91s/it]score1 tensor([[0.4766],
-        [0.6211],
-        [0.6328],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.5508, 0.6445, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0771, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:51:55,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.07 | optimizer_step: 4.37
-[2025-01-25 14:51:55,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.48 | bwd_microstep: 4613.67 | bwd_inner_microstep: 4608.85 | bwd_allreduce_microstep: 4.75 | step_microstep: 44.05
-[2025-01-25 14:51:55,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.44 | bwd: 4613.70 | bwd_inner: 4608.85 | bwd_allreduce: 4.78 | step: 44.05
- 38%|███▊      | 2179/5800 [6:05:25<6:56:43,  6.91s/it]                                                       {'loss': 0.0771, 'grad_norm': 4.499443054199219, 'learning_rate': 2.8720773305960566e-05, 'epoch': 18.78}
- 38%|███▊      | 2179/5800 [6:05:25<6:56:43,  6.91s/it]score1 tensor([[0.4570],
-        [0.5898],
-        [0.4199],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.5664, 0.1787, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1064, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:01,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 14:52:01,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4621.57 | bwd_inner_microstep: 4616.72 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.10
-[2025-01-25 14:52:01,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.32 | bwd: 4621.60 | bwd_inner: 4616.72 | bwd_allreduce: 4.81 | step: 44.11
- 38%|███▊      | 2180/5800 [6:05:31<6:56:32,  6.90s/it]                                                       {'loss': 0.1064, 'grad_norm': 8.566215515136719, 'learning_rate': 2.8710721442113523e-05, 'epoch': 18.79}
- 38%|███▊      | 2180/5800 [6:05:31<6:56:32,  6.90s/it]score1 tensor([[0.5938],
-        [0.5234],
-        [0.4531],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4453, 0.3398, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:08,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:52:08,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.81 | bwd_microstep: 4614.69 | bwd_inner_microstep: 4609.74 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.92
-[2025-01-25 14:52:08,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.78 | bwd: 4614.72 | bwd_inner: 4609.74 | bwd_allreduce: 4.92 | step: 43.92
- 38%|███▊      | 2181/5800 [6:05:38<6:56:03,  6.90s/it]                                                       {'loss': 0.0723, 'grad_norm': 8.748095512390137, 'learning_rate': 2.8700666862112392e-05, 'epoch': 18.8}
- 38%|███▊      | 2181/5800 [6:05:38<6:56:03,  6.90s/it]score1 tensor([[0.5078],
-        [0.3906],
-        [0.4570],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.3750, 0.4375, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:15,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:52:15,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.75 | bwd_microstep: 4613.20 | bwd_inner_microstep: 4608.30 | bwd_allreduce_microstep: 4.83 | step_microstep: 40.09
-[2025-01-25 14:52:15,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.71 | bwd: 4613.23 | bwd_inner: 4608.30 | bwd_allreduce: 4.87 | step: 40.11
- 38%|███▊      | 2182/5800 [6:05:45<6:55:45,  6.89s/it]                                                       {'loss': 0.04, 'grad_norm': 8.150338172912598, 'learning_rate': 2.869060956909236e-05, 'epoch': 18.81}
- 38%|███▊      | 2182/5800 [6:05:45<6:55:45,  6.89s/it]score1 tensor([[0.4902],
-        [0.5039],
-        [0.6289],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5625, 0.6797, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:22,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.05 | optimizer_step: 4.37
-[2025-01-25 14:52:22,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.43 | bwd_microstep: 4623.27 | bwd_inner_microstep: 4618.26 | bwd_allreduce_microstep: 4.91 | step_microstep: 47.78
-[2025-01-25 14:52:22,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.40 | bwd: 4623.29 | bwd_inner: 4618.26 | bwd_allreduce: 4.96 | step: 47.79
- 38%|███▊      | 2183/5800 [6:05:52<6:55:48,  6.90s/it]                                                       {'loss': 0.042, 'grad_norm': 0.9755801558494568, 'learning_rate': 2.868054956618948e-05, 'epoch': 18.82}
- 38%|███▊      | 2183/5800 [6:05:52<6:55:48,  6.90s/it]score1 tensor([[0.5742],
-        [0.4395],
-        [0.3887],
-        [0.3613]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4980, 0.4492, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:29,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:52:29,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.75 | bwd_microstep: 4620.62 | bwd_inner_microstep: 4615.77 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.56
-[2025-01-25 14:52:29,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.72 | bwd: 4620.64 | bwd_inner: 4615.76 | bwd_allreduce: 4.81 | step: 43.57
- 38%|███▊      | 2184/5800 [6:05:59<6:55:34,  6.90s/it]                                                       {'loss': 0.0381, 'grad_norm': 3.526916980743408, 'learning_rate': 2.8670486856540612e-05, 'epoch': 18.83}
- 38%|███▊      | 2184/5800 [6:05:59<6:55:34,  6.90s/it]score1 tensor([[0.4453],
-        [0.4492],
-        [0.4922],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4648, 0.6133, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:36,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 14:52:36,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.03 | bwd_microstep: 4614.29 | bwd_inner_microstep: 4609.38 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.25
-[2025-01-25 14:52:36,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.98 | bwd: 4614.31 | bwd_inner: 4609.38 | bwd_allreduce: 4.86 | step: 43.26
- 38%|███▊      | 2185/5800 [6:06:06<6:55:17,  6.89s/it]                                                       {'loss': 0.0522, 'grad_norm': 4.121516704559326, 'learning_rate': 2.86604214432835e-05, 'epoch': 18.84}
- 38%|███▊      | 2185/5800 [6:06:06<6:55:17,  6.89s/it]score1 tensor([[0.4316],
-        [0.5273],
-        [0.5273],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5664, 0.5352, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:43,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 14:52:43,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.30 | bwd_microstep: 4623.77 | bwd_inner_microstep: 4618.73 | bwd_allreduce_microstep: 4.96 | step_microstep: 45.76
-[2025-01-25 14:52:43,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.26 | bwd: 4623.80 | bwd_inner: 4618.73 | bwd_allreduce: 5.00 | step: 45.79
- 38%|███▊      | 2186/5800 [6:06:13<6:55:29,  6.90s/it]                                                       {'loss': 0.0415, 'grad_norm': 8.33730697631836, 'learning_rate': 2.865035332955671e-05, 'epoch': 18.84}
- 38%|███▊      | 2186/5800 [6:06:13<6:55:29,  6.90s/it]score1 tensor([[0.5117],
-        [0.5586],
-        [0.5430],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.7070, 0.5938, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0776, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:50,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.58 | optimizer_step: 4.40
-[2025-01-25 14:52:50,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4633.45 | bwd_inner_microstep: 4622.99 | bwd_allreduce_microstep: 10.23 | step_microstep: 87.36
-[2025-01-25 14:52:50,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4633.55 | bwd_inner: 4622.99 | bwd_allreduce: 10.35 | step: 87.34
- 38%|███▊      | 2187/5800 [6:06:20<6:56:57,  6.92s/it]                                                       {'loss': 0.0776, 'grad_norm': 4.789454936981201, 'learning_rate': 2.8640282518499657e-05, 'epoch': 18.85}
- 38%|███▊      | 2187/5800 [6:06:20<6:56:57,  6.92s/it]score1 tensor([[0.5078],
-        [0.5039],
-        [0.3984],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5156, 0.3711, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:52:57,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.84 | optimizer_step: 4.37
-[2025-01-25 14:52:57,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.18 | bwd_microstep: 4615.97 | bwd_inner_microstep: 4610.87 | bwd_allreduce_microstep: 4.98 | step_microstep: 65.02
-[2025-01-25 14:52:57,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.09 | bwd: 4615.99 | bwd_inner: 4610.87 | bwd_allreduce: 5.05 | step: 65.06
- 38%|███▊      | 2188/5800 [6:06:27<6:56:55,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 0.6369653940200806, 'learning_rate': 2.8630209013252593e-05, 'epoch': 18.86}
- 38%|███▊      | 2188/5800 [6:06:27<6:56:55,  6.93s/it]score1 tensor([[0.5547],
-        [0.5977],
-        [0.4727],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.6875, 0.4609, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0610, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:04,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:53:04,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.05 | bwd_microstep: 4624.53 | bwd_inner_microstep: 4619.46 | bwd_allreduce_microstep: 4.98 | step_microstep: 41.28
-[2025-01-25 14:53:04,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4624.56 | bwd_inner: 4619.46 | bwd_allreduce: 5.03 | step: 41.29
- 38%|███▊      | 2189/5800 [6:06:34<6:56:21,  6.92s/it]                                                       {'loss': 0.061, 'grad_norm': 4.540035724639893, 'learning_rate': 2.8620132816956608e-05, 'epoch': 18.87}
- 38%|███▊      | 2189/5800 [6:06:34<6:56:21,  6.92s/it]score1 tensor([[0.4980],
-        [0.5664],
-        [0.4531],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4785, 0.5273, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0952, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:11,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 14:53:11,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.30 | bwd_microstep: 4615.34 | bwd_inner_microstep: 4610.57 | bwd_allreduce_microstep: 4.70 | step_microstep: 40.03
-[2025-01-25 14:53:11,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.27 | bwd: 4615.36 | bwd_inner: 4610.57 | bwd_allreduce: 4.73 | step: 40.03
- 38%|███▊      | 2190/5800 [6:06:41<6:55:37,  6.91s/it]                                                       {'loss': 0.0952, 'grad_norm': 4.425891876220703, 'learning_rate': 2.861005393275364e-05, 'epoch': 18.88}
- 38%|███▊      | 2190/5800 [6:06:41<6:55:37,  6.91s/it]score1 tensor([[0.5781],
-        [0.5195],
-        [0.6016],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4043, 0.5391, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:17,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 14:53:17,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.47 | bwd_microstep: 4621.36 | bwd_inner_microstep: 4616.61 | bwd_allreduce_microstep: 4.65 | step_microstep: 43.48
-[2025-01-25 14:53:17,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.43 | bwd: 4621.38 | bwd_inner: 4616.61 | bwd_allreduce: 4.70 | step: 43.49
- 38%|███▊      | 2191/5800 [6:06:47<6:55:15,  6.90s/it]                                                       {'loss': 0.0732, 'grad_norm': 4.516332149505615, 'learning_rate': 2.8599972363786462e-05, 'epoch': 18.89}
- 38%|███▊      | 2191/5800 [6:06:47<6:55:15,  6.90s/it]score1 tensor([[0.4707],
-        [0.5898],
-        [0.5312],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5820, 0.5430, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:24,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 14:53:24,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.39 | bwd_microstep: 4619.21 | bwd_inner_microstep: 4614.43 | bwd_allreduce_microstep: 4.66 | step_microstep: 43.45
-[2025-01-25 14:53:24,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.35 | bwd: 4619.23 | bwd_inner: 4614.43 | bwd_allreduce: 4.72 | step: 43.46
- 38%|███▊      | 2192/5800 [6:06:54<6:55:05,  6.90s/it]                                                       {'loss': 0.0308, 'grad_norm': 4.46588134765625, 'learning_rate': 2.8589888113198675e-05, 'epoch': 18.9}
- 38%|███▊      | 2192/5800 [6:06:54<6:55:05,  6.90s/it]score1 tensor([[0.5195],
-        [0.5195],
-        [0.5039],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4492, 0.4863, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:31,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 14:53:31,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.74 | bwd_microstep: 4625.01 | bwd_inner_microstep: 4620.15 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.25
-[2025-01-25 14:53:31,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.71 | bwd: 4625.03 | bwd_inner: 4620.15 | bwd_allreduce: 4.81 | step: 41.26
- 38%|███▊      | 2193/5800 [6:07:01<6:54:48,  6.90s/it]                                                       {'loss': 0.0459, 'grad_norm': 8.631394386291504, 'learning_rate': 2.8579801184134727e-05, 'epoch': 18.91}
- 38%|███▊      | 2193/5800 [6:07:01<6:54:48,  6.90s/it]score1 tensor([[0.4648],
-        [0.3867],
-        [0.3633],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4551, 0.4082, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:38,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 14:53:38,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.59 | bwd_microstep: 4623.17 | bwd_inner_microstep: 4617.78 | bwd_allreduce_microstep: 5.28 | step_microstep: 46.17
-[2025-01-25 14:53:38,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4623.20 | bwd_inner: 4617.78 | bwd_allreduce: 5.34 | step: 46.19
- 38%|███▊      | 2194/5800 [6:07:08<6:54:38,  6.90s/it]                                                       {'loss': 0.0508, 'grad_norm': 3.6016063690185547, 'learning_rate': 2.85697115797399e-05, 'epoch': 18.91}
- 38%|███▊      | 2194/5800 [6:07:08<6:54:38,  6.90s/it]score1 tensor([[0.6328],
-        [0.4629],
-        [0.5781],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4941, 0.5742, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:45,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.10 | optimizer_step: 4.36
-[2025-01-25 14:53:45,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.47 | bwd_microstep: 4619.95 | bwd_inner_microstep: 4615.09 | bwd_allreduce_microstep: 4.76 | step_microstep: 49.19
-[2025-01-25 14:53:45,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.43 | bwd: 4619.98 | bwd_inner: 4615.09 | bwd_allreduce: 4.81 | step: 49.20
- 38%|███▊      | 2195/5800 [6:07:15<6:54:55,  6.91s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.7952841520309448, 'learning_rate': 2.85596193031603e-05, 'epoch': 18.92}
- 38%|███▊      | 2195/5800 [6:07:15<6:54:55,  6.91s/it]score1 tensor([[0.4980],
-        [0.4531],
-        [0.5156],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4648, 0.5391, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:52,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:53:52,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.98 | bwd_microstep: 4622.44 | bwd_inner_microstep: 4614.93 | bwd_allreduce_microstep: 7.33 | step_microstep: 62.30
-[2025-01-25 14:53:52,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4622.49 | bwd_inner: 4614.93 | bwd_allreduce: 7.41 | step: 62.32
- 38%|███▊      | 2196/5800 [6:07:22<6:55:30,  6.92s/it]                                                       {'loss': 0.0195, 'grad_norm': 8.375274658203125, 'learning_rate': 2.8549524357542887e-05, 'epoch': 18.93}
- 38%|███▊      | 2196/5800 [6:07:22<6:55:30,  6.92s/it]score1 tensor([[0.5781],
-        [0.4434],
-        [0.6016],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4023, 0.5742, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:53:59,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 14:53:59,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.81 | bwd_microstep: 4627.87 | bwd_inner_microstep: 4622.22 | bwd_allreduce_microstep: 5.51 | step_microstep: 53.66
-[2025-01-25 14:53:59,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.74 | bwd: 4627.92 | bwd_inner: 4622.22 | bwd_allreduce: 5.58 | step: 53.70
- 38%|███▊      | 2197/5800 [6:07:29<6:55:46,  6.92s/it]                                                       {'loss': 0.0601, 'grad_norm': 0.7773016095161438, 'learning_rate': 2.8539426746035426e-05, 'epoch': 18.94}
- 38%|███▊      | 2197/5800 [6:07:29<6:55:46,  6.92s/it]score1 tensor([[0.4590],
-        [0.4180],
-        [0.5391],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4727, 0.5508, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:54:06,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.36
-[2025-01-25 14:54:06,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.41 | bwd_microstep: 4623.52 | bwd_inner_microstep: 4618.83 | bwd_allreduce_microstep: 4.62 | step_microstep: 41.13
-[2025-01-25 14:54:06,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.37 | bwd: 4623.54 | bwd_inner: 4618.83 | bwd_allreduce: 4.66 | step: 41.14
- 38%|███▊      | 2198/5800 [6:07:36<6:55:22,  6.92s/it]                                                       {'loss': 0.0234, 'grad_norm': 8.277384757995605, 'learning_rate': 2.8529326471786536e-05, 'epoch': 18.95}
- 38%|███▊      | 2198/5800 [6:07:36<6:55:22,  6.92s/it]score1 tensor([[0.4531],
-        [0.4473],
-        [0.3691],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4297, 0.4180, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:54:13,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:54:13,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4623.64 | bwd_inner_microstep: 4619.19 | bwd_allreduce_microstep: 4.37 | step_microstep: 41.37
-[2025-01-25 14:54:13,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.10 | bwd: 4623.67 | bwd_inner: 4619.19 | bwd_allreduce: 4.41 | step: 41.38
- 38%|███▊      | 2199/5800 [6:07:43<6:54:52,  6.91s/it]                                                       {'loss': 0.0381, 'grad_norm': 3.9822051525115967, 'learning_rate': 2.8519223537945654e-05, 'epoch': 18.96}
- 38%|███▊      | 2199/5800 [6:07:43<6:54:52,  6.91s/it]score1 tensor([[0.3613],
-        [0.5273],
-        [0.4121],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3438, 0.5039, 0.3340, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:54:20,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 14:54:20,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.59 | bwd_microstep: 4615.23 | bwd_inner_microstep: 4610.46 | bwd_allreduce_microstep: 4.67 | step_microstep: 41.00
-[2025-01-25 14:54:20,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.55 | bwd: 4615.26 | bwd_inner: 4610.46 | bwd_allreduce: 4.72 | step: 41.01
- 38%|███▊      | 2200/5800 [6:07:50<6:54:19,  6.91s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.666203022003174, 'learning_rate': 2.850911794766305e-05, 'epoch': 18.97}
- 38%|███▊      | 2200/5800 [6:07:50<6:54:19,  6.91s/it]score1 tensor([[0.5273],
-        [0.5156],
-        [0.5547],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5156, 0.4434, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:54:26,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 14:54:26,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.62 | bwd_microstep: 4576.71 | bwd_inner_microstep: 4571.65 | bwd_allreduce_microstep: 4.97 | step_microstep: 45.67
-[2025-01-25 14:54:26,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.59 | bwd: 4576.73 | bwd_inner: 4571.65 | bwd_allreduce: 5.01 | step: 45.68
- 38%|███▊      | 2201/5800 [6:07:56<6:53:13,  6.89s/it]                                                       {'loss': 0.0547, 'grad_norm': 6.611939430236816, 'learning_rate': 2.8499009704089826e-05, 'epoch': 18.97}
- 38%|███▊      | 2201/5800 [6:07:56<6:53:13,  6.89s/it]score1 tensor([[0.5586],
-        [0.4219],
-        [0.5547],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4648, 0.4961, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:54:33,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:54:33,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.47 | bwd_microstep: 4619.24 | bwd_inner_microstep: 4614.02 | bwd_allreduce_microstep: 5.06 | step_microstep: 45.34
-[2025-01-25 14:54:33,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4619.27 | bwd_inner: 4614.02 | bwd_allreduce: 5.15 | step: 45.34
- 38%|███▊      | 2202/5800 [6:08:03<6:53:23,  6.89s/it]                                                       {'loss': 0.0317, 'grad_norm': 0.6211073994636536, 'learning_rate': 2.8488898810377907e-05, 'epoch': 18.98}
- 38%|███▊      | 2202/5800 [6:08:03<6:53:23,  6.89s/it]score1 tensor([[0.4512],
-        [0.4375],
-        [0.4902],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4434, 0.4512, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:54:40,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 14:54:40,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.65 | bwd_microstep: 4618.53 | bwd_inner_microstep: 4613.71 | bwd_allreduce_microstep: 4.74 | step_microstep: 47.64
-[2025-01-25 14:54:40,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4618.55 | bwd_inner: 4613.71 | bwd_allreduce: 4.78 | step: 47.65
- 38%|███▊      | 2203/5800 [6:08:10<6:53:23,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 0.6983546018600464, 'learning_rate': 2.8478785269680042e-05, 'epoch': 18.99}
- 38%|███▊      | 2203/5800 [6:08:10<6:53:23,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:54:44,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 14:54:44,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 570.85 | bwd_microstep: 1221.15 | bwd_inner_microstep: 1216.59 | bwd_allreduce_microstep: 4.49 | step_microstep: 41.21
-[2025-01-25 14:54:44,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 570.81 | bwd: 1221.17 | bwd_inner: 1216.59 | bwd_allreduce: 4.52 | step: 41.22
- 38%|███▊      | 2204/5800 [6:08:14<6:04:23,  6.08s/it]                                                       {'loss': 0.0352, 'grad_norm': 8.752372741699219, 'learning_rate': 2.8468669085149812e-05, 'epoch': 19.0}
- 38%|███▊      | 2204/5800 [6:08:14<6:04:23,  6.08s/it][2025-01-25 14:54:49,313] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:54:59,317] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:55:09,536] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 14:55:19,277] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4590],
-        [0.5000],
-        [0.5117],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4297, 0.4883, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:55:35,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 14:55:35,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.85 | bwd_microstep: 4571.73 | bwd_inner_microstep: 4566.96 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.78
-[2025-01-25 14:55:35,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.79 | bwd: 4571.76 | bwd_inner: 4566.96 | bwd_allreduce: 4.72 | step: 42.78
- 38%|███▊      | 2205/5800 [6:09:05<19:27:33, 19.49s/it]                                                        {'loss': 0.0371, 'grad_norm': 4.560696601867676, 'learning_rate': 2.8458550259941607e-05, 'epoch': 19.01}
- 38%|███▊      | 2205/5800 [6:09:05<19:27:33, 19.49s/it]score1 tensor([[0.4219],
-        [0.4766],
-        [0.5742],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4473, 0.5547, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:55:42,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 14:55:42,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.36 | bwd_microstep: 4578.54 | bwd_inner_microstep: 4573.05 | bwd_allreduce_microstep: 5.38 | step_microstep: 44.64
-[2025-01-25 14:55:42,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.32 | bwd: 4578.56 | bwd_inner: 4573.05 | bwd_allreduce: 5.44 | step: 44.64
- 38%|███▊      | 2206/5800 [6:09:12<15:39:49, 15.69s/it]                                                        {'loss': 0.019, 'grad_norm': 4.164273262023926, 'learning_rate': 2.8448428797210673e-05, 'epoch': 19.02}
- 38%|███▊      | 2206/5800 [6:09:12<15:39:49, 15.69s/it]score1 tensor([[0.5039],
-        [0.5703],
-        [0.5234],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4961, 0.5273, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:55:49,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:55:49,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.94 | bwd_microstep: 4594.87 | bwd_inner_microstep: 4590.41 | bwd_allreduce_microstep: 4.40 | step_microstep: 46.77
-[2025-01-25 14:55:49,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.90 | bwd: 4594.90 | bwd_inner: 4590.41 | bwd_allreduce: 4.43 | step: 46.78
- 38%|███▊      | 2207/5800 [6:09:19<13:00:50, 13.04s/it]                                                        {'loss': 0.0308, 'grad_norm': 4.397565841674805, 'learning_rate': 2.8438304700113046e-05, 'epoch': 19.03}
- 38%|███▊      | 2207/5800 [6:09:19<13:00:50, 13.04s/it]score1 tensor([[0.3730],
-        [0.5703],
-        [0.4629],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.6289, 0.5195, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:55:56,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 14:55:56,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.67 | bwd_microstep: 4589.95 | bwd_inner_microstep: 4584.90 | bwd_allreduce_microstep: 4.93 | step_microstep: 41.99
-[2025-01-25 14:55:56,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.63 | bwd: 4589.97 | bwd_inner: 4584.90 | bwd_allreduce: 4.99 | step: 42.00
- 38%|███▊      | 2208/5800 [6:09:26<11:09:39, 11.19s/it]                                                        {'loss': 0.0649, 'grad_norm': 8.321041107177734, 'learning_rate': 2.8428177971805593e-05, 'epoch': 19.03}
- 38%|███▊      | 2208/5800 [6:09:26<11:09:39, 11.19s/it]score1 tensor([[0.3242],
-        [0.6914],
-        [0.4551],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.6875, 0.4688, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:03,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 14:56:03,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.18 | bwd_microstep: 4603.71 | bwd_inner_microstep: 4599.16 | bwd_allreduce_microstep: 4.47 | step_microstep: 40.93
-[2025-01-25 14:56:03,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.13 | bwd: 4603.73 | bwd_inner: 4599.16 | bwd_allreduce: 4.51 | step: 40.94
- 38%|███▊      | 2209/5800 [6:09:33<9:51:56,  9.89s/it]                                                        {'loss': 0.021, 'grad_norm': 1.3979588747024536, 'learning_rate': 2.841804861544601e-05, 'epoch': 19.04}
- 38%|███▊      | 2209/5800 [6:09:33<9:51:56,  9.89s/it]score1 tensor([[0.5273],
-        [0.4141],
-        [0.6406],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4922, 0.6719, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0537, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:10,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:56:10,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.29 | bwd_microstep: 4608.22 | bwd_inner_microstep: 4603.28 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.91
-[2025-01-25 14:56:10,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.26 | bwd: 4608.25 | bwd_inner: 4603.28 | bwd_allreduce: 4.87 | step: 46.91
- 38%|███▊      | 2210/5800 [6:09:39<8:57:38,  8.99s/it]                                                       {'loss': 0.0537, 'grad_norm': 4.301681995391846, 'learning_rate': 2.8407916634192802e-05, 'epoch': 19.05}
- 38%|███▊      | 2210/5800 [6:09:39<8:57:38,  8.99s/it]score1 tensor([[0.3848],
-        [0.5898],
-        [0.4883],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.6211, 0.5703, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:16,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.26 | optimizer_step: 4.37
-[2025-01-25 14:56:16,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.92 | bwd_microstep: 4602.65 | bwd_inner_microstep: 4597.77 | bwd_allreduce_microstep: 4.78 | step_microstep: 45.62
-[2025-01-25 14:56:16,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.89 | bwd: 4602.67 | bwd_inner: 4597.77 | bwd_allreduce: 4.84 | step: 45.62
- 38%|███▊      | 2211/5800 [6:09:46<8:19:36,  8.35s/it]                                                       {'loss': 0.0342, 'grad_norm': 1.136397361755371, 'learning_rate': 2.8397782031205295e-05, 'epoch': 19.06}
- 38%|███▊      | 2211/5800 [6:09:46<8:19:36,  8.35s/it]score1 tensor([[0.6992],
-        [0.4492],
-        [0.5703],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4121, 0.5664, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:23,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 14:56:23,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.66 | bwd_microstep: 4595.07 | bwd_inner_microstep: 4589.99 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.03
-[2025-01-25 14:56:23,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.63 | bwd: 4595.09 | bwd_inner: 4590.00 | bwd_allreduce: 5.00 | step: 42.04
- 38%|███▊      | 2212/5800 [6:09:53<7:52:47,  7.91s/it]                                                       {'loss': 0.0352, 'grad_norm': 4.817817687988281, 'learning_rate': 2.8387644809643635e-05, 'epoch': 19.07}
- 38%|███▊      | 2212/5800 [6:09:53<7:52:47,  7.91s/it]score1 tensor([[0.4102],
-        [0.3789],
-        [0.4609],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4082, 0.5000, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:30,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:56:30,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.04 | bwd_microstep: 4597.59 | bwd_inner_microstep: 4592.94 | bwd_allreduce_microstep: 4.58 | step_microstep: 41.56
-[2025-01-25 14:56:30,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.99 | bwd: 4597.62 | bwd_inner: 4592.94 | bwd_allreduce: 4.61 | step: 41.57
- 38%|███▊      | 2213/5800 [6:10:00<7:34:01,  7.59s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.000285148620605, 'learning_rate': 2.837750497266879e-05, 'epoch': 19.08}
- 38%|███▊      | 2213/5800 [6:10:00<7:34:01,  7.59s/it]score1 tensor([[0.5078],
-        [0.3809],
-        [0.5859],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.3652, 0.5586, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:37,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 14:56:37,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.90 | bwd_microstep: 4605.78 | bwd_inner_microstep: 4600.85 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.40
-[2025-01-25 14:56:37,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.86 | bwd: 4605.81 | bwd_inner: 4600.85 | bwd_allreduce: 4.88 | step: 42.41
- 38%|███▊      | 2214/5800 [6:10:07<7:20:57,  7.38s/it]                                                       {'loss': 0.0288, 'grad_norm': 8.57210922241211, 'learning_rate': 2.8367362523442534e-05, 'epoch': 19.09}
- 38%|███▊      | 2214/5800 [6:10:07<7:20:57,  7.38s/it]score1 tensor([[0.6523],
-        [0.4414],
-        [0.6133],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4844, 0.5469, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0732, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:44,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 14:56:44,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.05 | bwd_microstep: 4602.97 | bwd_inner_microstep: 4598.26 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.16
-[2025-01-25 14:56:44,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.01 | bwd: 4603.00 | bwd_inner: 4598.26 | bwd_allreduce: 4.66 | step: 43.17
- 38%|███▊      | 2215/5800 [6:10:14<7:11:45,  7.23s/it]                                                       {'loss': 0.0732, 'grad_norm': 5.196300029754639, 'learning_rate': 2.835721746512745e-05, 'epoch': 19.09}
- 38%|███▊      | 2215/5800 [6:10:14<7:11:45,  7.23s/it]score1 tensor([[0.5859],
-        [0.4004],
-        [0.3438],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5391, 0.3789, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:51,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 14:56:51,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.43 | bwd_microstep: 4610.38 | bwd_inner_microstep: 4605.58 | bwd_allreduce_microstep: 4.71 | step_microstep: 47.52
-[2025-01-25 14:56:51,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.40 | bwd: 4610.41 | bwd_inner: 4605.58 | bwd_allreduce: 4.76 | step: 47.52
- 38%|███▊      | 2216/5800 [6:10:21<7:05:30,  7.12s/it]                                                       {'loss': 0.0493, 'grad_norm': 3.7114808559417725, 'learning_rate': 2.8347069800886955e-05, 'epoch': 19.1}
- 38%|███▊      | 2216/5800 [6:10:21<7:05:30,  7.12s/it]score1 tensor([[0.3789],
-        [0.5000],
-        [0.5625],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4785, 0.5156, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:56:58,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:56:58,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.82 | bwd_microstep: 4614.66 | bwd_inner_microstep: 4609.86 | bwd_allreduce_microstep: 4.72 | step_microstep: 40.42
-[2025-01-25 14:56:58,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.78 | bwd: 4614.69 | bwd_inner: 4609.86 | bwd_allreduce: 4.76 | step: 40.44
- 38%|███▊      | 2217/5800 [6:10:28<7:01:13,  7.05s/it]                                                       {'loss': 0.0376, 'grad_norm': 1.0627377033233643, 'learning_rate': 2.833691953388526e-05, 'epoch': 19.11}
- 38%|███▊      | 2217/5800 [6:10:28<7:01:13,  7.05s/it]score1 tensor([[0.4453],
-        [0.5820],
-        [0.3027],
-        [0.3008]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5781, 0.3691, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:05,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.46 | optimizer_step: 4.37
-[2025-01-25 14:57:05,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.09 | bwd_microstep: 4612.44 | bwd_inner_microstep: 4607.57 | bwd_allreduce_microstep: 4.77 | step_microstep: 45.29
-[2025-01-25 14:57:05,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.04 | bwd: 4612.47 | bwd_inner: 4607.57 | bwd_allreduce: 4.82 | step: 45.30
- 38%|███▊      | 2218/5800 [6:10:35<6:58:14,  7.01s/it]                                                       {'loss': 0.0342, 'grad_norm': 3.334578037261963, 'learning_rate': 2.8326766667287394e-05, 'epoch': 19.12}
- 38%|███▊      | 2218/5800 [6:10:35<6:58:14,  7.01s/it]score1 tensor([[0.4844],
-        [0.4102],
-        [0.4824],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.3789, 0.5000, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:11,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 14:57:11,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.71 | bwd_microstep: 4617.53 | bwd_inner_microstep: 4612.81 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.66
-[2025-01-25 14:57:11,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.67 | bwd: 4617.55 | bwd_inner: 4612.81 | bwd_allreduce: 4.68 | step: 41.67
- 38%|███▊      | 2219/5800 [6:10:41<6:55:57,  6.97s/it]                                                       {'loss': 0.0264, 'grad_norm': 3.980302333831787, 'learning_rate': 2.8316611204259203e-05, 'epoch': 19.13}
- 38%|███▊      | 2219/5800 [6:10:41<6:55:57,  6.97s/it]score1 tensor([[0.3789],
-        [0.5977],
-        [0.4785],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3340, 0.5977, 0.4570, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:18,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 14:57:18,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.27 | bwd_microstep: 4564.76 | bwd_inner_microstep: 4559.81 | bwd_allreduce_microstep: 4.80 | step_microstep: 41.10
-[2025-01-25 14:57:18,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.24 | bwd: 4564.79 | bwd_inner: 4559.81 | bwd_allreduce: 4.88 | step: 41.11
- 38%|███▊      | 2220/5800 [6:10:48<6:53:33,  6.93s/it]                                                       {'loss': 0.0176, 'grad_norm': 6.279852867126465, 'learning_rate': 2.830645314796733e-05, 'epoch': 19.14}
- 38%|███▊      | 2220/5800 [6:10:48<6:53:33,  6.93s/it]score1 tensor([[0.5625],
-        [0.4980],
-        [0.4512],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4980, 0.4355, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:25,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:57:25,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.89 | bwd_microstep: 4560.38 | bwd_inner_microstep: 4555.50 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.82
-[2025-01-25 14:57:25,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.85 | bwd: 4560.40 | bwd_inner: 4555.50 | bwd_allreduce: 4.84 | step: 42.83
- 38%|███▊      | 2221/5800 [6:10:55<6:51:36,  6.90s/it]                                                       {'loss': 0.0156, 'grad_norm': 6.577620029449463, 'learning_rate': 2.8296292501579237e-05, 'epoch': 19.15}
- 38%|███▊      | 2221/5800 [6:10:55<6:51:36,  6.90s/it]score1 tensor([[0.3828],
-        [0.4551],
-        [0.4766],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4160, 0.4766, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:32,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:57:32,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.23 | bwd_microstep: 4574.17 | bwd_inner_microstep: 4569.23 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.21
-[2025-01-25 14:57:32,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.20 | bwd: 4574.19 | bwd_inner: 4569.23 | bwd_allreduce: 4.89 | step: 42.21
- 38%|███▊      | 2222/5800 [6:11:02<6:50:27,  6.88s/it]                                                       {'loss': 0.0161, 'grad_norm': 2.9451255798339844, 'learning_rate': 2.8286129268263188e-05, 'epoch': 19.16}
- 38%|███▊      | 2222/5800 [6:11:02<6:50:27,  6.88s/it]score1 tensor([[0.3730],
-        [0.5039],
-        [0.5742],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5586, 0.5508, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:39,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 14:57:39,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4612.46 | bwd_inner_microstep: 4607.55 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.86
-[2025-01-25 14:57:39,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4612.49 | bwd_inner: 4607.55 | bwd_allreduce: 4.87 | step: 42.87
- 38%|███▊      | 2223/5800 [6:11:09<6:50:29,  6.89s/it]                                                       {'loss': 0.0273, 'grad_norm': 3.8785102367401123, 'learning_rate': 2.8275963451188254e-05, 'epoch': 19.16}
- 38%|███▊      | 2223/5800 [6:11:09<6:50:29,  6.89s/it]score1 tensor([[0.5039],
-        [0.5391],
-        [0.4902],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6484, 0.4492, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:46,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 14:57:46,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.77 | bwd_microstep: 4563.25 | bwd_inner_microstep: 4558.21 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.69
-[2025-01-25 14:57:46,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.74 | bwd: 4563.27 | bwd_inner: 4558.21 | bwd_allreduce: 4.99 | step: 43.70
- 38%|███▊      | 2224/5800 [6:11:16<6:49:28,  6.87s/it]                                                       {'loss': 0.04, 'grad_norm': 1.9524003267288208, 'learning_rate': 2.826579505352432e-05, 'epoch': 19.17}
- 38%|███▊      | 2224/5800 [6:11:16<6:49:28,  6.87s/it]score1 tensor([[0.4941],
-        [0.4512],
-        [0.3984],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4395, 0.4004, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:53,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 14:57:53,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.01 | bwd_microstep: 4607.15 | bwd_inner_microstep: 4602.22 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.22
-[2025-01-25 14:57:53,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.97 | bwd: 4607.17 | bwd_inner: 4602.22 | bwd_allreduce: 4.88 | step: 42.23
- 38%|███▊      | 2225/5800 [6:11:23<6:49:39,  6.88s/it]                                                       {'loss': 0.0234, 'grad_norm': 0.7020133137702942, 'learning_rate': 2.8255624078442064e-05, 'epoch': 19.18}
- 38%|███▊      | 2225/5800 [6:11:23<6:49:39,  6.88s/it]score1 tensor([[0.6094],
-        [0.5742],
-        [0.5234],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.6367, 0.5391, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:57:59,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:57:59,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.37 | bwd_microstep: 4609.78 | bwd_inner_microstep: 4604.68 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.03
-[2025-01-25 14:57:59,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.34 | bwd: 4609.80 | bwd_inner: 4604.68 | bwd_allreduce: 5.05 | step: 44.04
- 38%|███▊      | 2226/5800 [6:11:29<6:49:53,  6.88s/it]                                                       {'loss': 0.0449, 'grad_norm': 9.171908378601074, 'learning_rate': 2.824545052911297e-05, 'epoch': 19.19}
- 38%|███▊      | 2226/5800 [6:11:29<6:49:53,  6.88s/it]score1 tensor([[0.5312],
-        [0.5742],
-        [0.4980],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6211, 0.5625, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0542, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:06,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 14:58:06,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.92 | bwd_microstep: 4615.42 | bwd_inner_microstep: 4610.37 | bwd_allreduce_microstep: 4.95 | step_microstep: 41.53
-[2025-01-25 14:58:06,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.86 | bwd: 4615.44 | bwd_inner: 4610.37 | bwd_allreduce: 5.00 | step: 41.54
- 38%|███▊      | 2227/5800 [6:11:36<6:50:05,  6.89s/it]                                                       {'loss': 0.0542, 'grad_norm': 8.836403846740723, 'learning_rate': 2.8235274408709338e-05, 'epoch': 19.2}
- 38%|███▊      | 2227/5800 [6:11:36<6:50:05,  6.89s/it]score1 tensor([[0.4336],
-        [0.5469],
-        [0.4961],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6445, 0.5508, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0713, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:13,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 14:58:13,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.28 | bwd_microstep: 4621.34 | bwd_inner_microstep: 4616.60 | bwd_allreduce_microstep: 4.66 | step_microstep: 50.65
-[2025-01-25 14:58:13,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.25 | bwd: 4621.36 | bwd_inner: 4616.60 | bwd_allreduce: 4.70 | step: 50.66
- 38%|███▊      | 2228/5800 [6:11:43<6:50:19,  6.89s/it]                                                       {'loss': 0.0713, 'grad_norm': 8.584227561950684, 'learning_rate': 2.8225095720404244e-05, 'epoch': 19.21}
- 38%|███▊      | 2228/5800 [6:11:43<6:50:19,  6.89s/it]score1 tensor([[0.3770],
-        [0.5195],
-        [0.3945],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.5781, 0.4492, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:20,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 14:58:20,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.22 | bwd_microstep: 4615.95 | bwd_inner_microstep: 4611.22 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.79
-[2025-01-25 14:58:20,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.18 | bwd: 4615.97 | bwd_inner: 4611.22 | bwd_allreduce: 4.69 | step: 43.80
- 38%|███▊      | 2229/5800 [6:11:50<6:50:08,  6.89s/it]                                                       {'loss': 0.0386, 'grad_norm': 7.893351078033447, 'learning_rate': 2.8214914467371586e-05, 'epoch': 19.22}
- 38%|███▊      | 2229/5800 [6:11:50<6:50:08,  6.89s/it]score1 tensor([[0.4531],
-        [0.4590],
-        [0.4297],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4941, 0.4121, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:27,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 14:58:27,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.18 | bwd_microstep: 4609.64 | bwd_inner_microstep: 4604.35 | bwd_allreduce_microstep: 5.20 | step_microstep: 47.00
-[2025-01-25 14:58:27,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4609.67 | bwd_inner: 4604.35 | bwd_allreduce: 5.25 | step: 47.01
- 38%|███▊      | 2230/5800 [6:11:57<6:50:06,  6.89s/it]                                                       {'loss': 0.0464, 'grad_norm': 4.321402072906494, 'learning_rate': 2.8204730652786056e-05, 'epoch': 19.22}
- 38%|███▊      | 2230/5800 [6:11:57<6:50:06,  6.89s/it]score1 tensor([[0.6250],
-        [0.4609],
-        [0.5312],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4648, 0.5352, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:34,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 14:58:34,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.97 | bwd_microstep: 4567.38 | bwd_inner_microstep: 4562.76 | bwd_allreduce_microstep: 4.54 | step_microstep: 43.77
-[2025-01-25 14:58:34,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.94 | bwd: 4567.41 | bwd_inner: 4562.76 | bwd_allreduce: 4.58 | step: 43.80
- 38%|███▊      | 2231/5800 [6:12:04<6:49:09,  6.88s/it]                                                       {'loss': 0.0029, 'grad_norm': 6.2578444480896, 'learning_rate': 2.8194544279823148e-05, 'epoch': 19.23}
- 38%|███▊      | 2231/5800 [6:12:04<6:49:09,  6.88s/it]score1 tensor([[0.5703],
-        [0.4688],
-        [0.5078],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4570, 0.4805, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:41,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 14:58:41,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4618.69 | bwd_inner_microstep: 4613.71 | bwd_allreduce_microstep: 4.88 | step_microstep: 41.27
-[2025-01-25 14:58:41,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.33 | bwd: 4618.72 | bwd_inner: 4613.71 | bwd_allreduce: 4.93 | step: 41.28
- 38%|███▊      | 2232/5800 [6:12:11<6:49:22,  6.88s/it]                                                       {'loss': 0.0161, 'grad_norm': 8.515228271484375, 'learning_rate': 2.818435535165914e-05, 'epoch': 19.24}
- 38%|███▊      | 2232/5800 [6:12:11<6:49:22,  6.88s/it]score1 tensor([[0.4297],
-        [0.5664],
-        [0.5703],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.5820, 0.5625, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:48,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 14:58:48,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4611.11 | bwd_inner_microstep: 4606.32 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.90
-[2025-01-25 14:58:48,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4611.14 | bwd_inner: 4606.32 | bwd_allreduce: 4.74 | step: 42.90
- 38%|███▊      | 2233/5800 [6:12:18<6:49:24,  6.89s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.11782169342041, 'learning_rate': 2.817416387147113e-05, 'epoch': 19.25}
- 38%|███▊      | 2233/5800 [6:12:18<6:49:24,  6.89s/it]score1 tensor([[0.5273],
-        [0.5156],
-        [0.5898],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5039, 0.6445, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:58:55,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:58:55,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.93 | bwd_microstep: 4611.13 | bwd_inner_microstep: 4605.94 | bwd_allreduce_microstep: 5.11 | step_microstep: 41.91
-[2025-01-25 14:58:55,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.90 | bwd: 4611.15 | bwd_inner: 4605.94 | bwd_allreduce: 5.15 | step: 41.91
- 39%|███▊      | 2234/5800 [6:12:25<6:49:15,  6.89s/it]                                                       {'loss': 0.0391, 'grad_norm': 4.358062267303467, 'learning_rate': 2.8163969842436986e-05, 'epoch': 19.26}
- 39%|███▊      | 2234/5800 [6:12:25<6:49:15,  6.89s/it]score1 tensor([[0.5742],
-        [0.5078],
-        [0.4453],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4707, 0.4199, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:01,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.87 | optimizer_step: 4.36
-[2025-01-25 14:59:01,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.10 | bwd_microstep: 4611.28 | bwd_inner_microstep: 4606.36 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.73
-[2025-01-25 14:59:01,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.07 | bwd: 4611.31 | bwd_inner: 4606.36 | bwd_allreduce: 4.87 | step: 45.74
- 39%|███▊      | 2235/5800 [6:12:31<6:49:11,  6.89s/it]                                                       {'loss': 0.0234, 'grad_norm': 4.301511764526367, 'learning_rate': 2.8153773267735386e-05, 'epoch': 19.27}
- 39%|███▊      | 2235/5800 [6:12:31<6:49:11,  6.89s/it]score1 tensor([[0.5312],
-        [0.4688],
-        [0.4473],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4746, 0.3750, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:08,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 14:59:08,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.32 | bwd_microstep: 4617.08 | bwd_inner_microstep: 4612.25 | bwd_allreduce_microstep: 4.74 | step_microstep: 38.32
-[2025-01-25 14:59:08,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.27 | bwd: 4617.11 | bwd_inner: 4612.25 | bwd_allreduce: 4.79 | step: 38.33
- 39%|███▊      | 2236/5800 [6:12:38<6:49:03,  6.89s/it]                                                       {'loss': 0.0527, 'grad_norm': 4.342129707336426, 'learning_rate': 2.8143574150545803e-05, 'epoch': 19.28}
- 39%|███▊      | 2236/5800 [6:12:38<6:49:03,  6.89s/it]score1 tensor([[0.5312],
-        [0.5508],
-        [0.4824],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4883, 0.4980, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:15,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 14:59:15,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4560.71 | bwd_inner_microstep: 4555.90 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.23
-[2025-01-25 14:59:15,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4560.74 | bwd_inner: 4555.90 | bwd_allreduce: 4.77 | step: 43.24
- 39%|███▊      | 2237/5800 [6:12:45<6:48:11,  6.87s/it]                                                       {'loss': 0.0205, 'grad_norm': 2.226322650909424, 'learning_rate': 2.8133372494048498e-05, 'epoch': 19.28}
- 39%|███▊      | 2237/5800 [6:12:45<6:48:11,  6.87s/it]score1 tensor([[0.4453],
-        [0.4141],
-        [0.5195],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.3418, 0.5156, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:22,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 14:59:22,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.92 | bwd_microstep: 4620.45 | bwd_inner_microstep: 4615.54 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.72
-[2025-01-25 14:59:22,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.89 | bwd: 4620.47 | bwd_inner: 4615.54 | bwd_allreduce: 4.86 | step: 42.74
- 39%|███▊      | 2238/5800 [6:12:52<6:48:28,  6.88s/it]                                                       {'loss': 0.0327, 'grad_norm': 8.23653793334961, 'learning_rate': 2.812316830142452e-05, 'epoch': 19.29}
- 39%|███▊      | 2238/5800 [6:12:52<6:48:28,  6.88s/it]score1 tensor([[0.5664],
-        [0.3535],
-        [0.4961],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.3516, 0.4902, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:29,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 14:59:29,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.62 | bwd_microstep: 4611.78 | bwd_inner_microstep: 4606.94 | bwd_allreduce_microstep: 4.71 | step_microstep: 44.38
-[2025-01-25 14:59:29,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.57 | bwd: 4611.81 | bwd_inner: 4606.94 | bwd_allreduce: 4.80 | step: 44.39
- 39%|███▊      | 2239/5800 [6:12:59<6:48:23,  6.88s/it]                                                       {'loss': 0.0078, 'grad_norm': 8.310556411743164, 'learning_rate': 2.8112961575855715e-05, 'epoch': 19.3}
- 39%|███▊      | 2239/5800 [6:12:59<6:48:23,  6.88s/it]score1 tensor([[0.4629],
-        [0.5156],
-        [0.4062],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5078, 0.4160, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:36,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 14:59:36,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.32 | bwd_microstep: 4615.53 | bwd_inner_microstep: 4610.77 | bwd_allreduce_microstep: 4.64 | step_microstep: 44.85
-[2025-01-25 14:59:36,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.28 | bwd: 4615.56 | bwd_inner: 4610.76 | bwd_allreduce: 4.69 | step: 44.86
- 39%|███▊      | 2240/5800 [6:13:06<6:48:35,  6.89s/it]                                                       {'loss': 0.0264, 'grad_norm': 3.939436674118042, 'learning_rate': 2.8102752320524725e-05, 'epoch': 19.31}
- 39%|███▊      | 2240/5800 [6:13:06<6:48:35,  6.89s/it]score1 tensor([[0.5547],
-        [0.5117],
-        [0.4434],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.6094, 0.5352, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0796, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:43,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:59:43,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.99 | bwd_microstep: 4618.32 | bwd_inner_microstep: 4613.19 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.62
-[2025-01-25 14:59:43,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.95 | bwd: 4618.34 | bwd_inner: 4613.19 | bwd_allreduce: 5.09 | step: 43.63
- 39%|███▊      | 2241/5800 [6:13:13<6:48:39,  6.89s/it]                                                       {'loss': 0.0796, 'grad_norm': 8.157084465026855, 'learning_rate': 2.809254053861496e-05, 'epoch': 19.32}
- 39%|███▊      | 2241/5800 [6:13:13<6:48:39,  6.89s/it]score1 tensor([[0.4492],
-        [0.3848],
-        [0.4512],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4688, 0.4863, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0630, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:50,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 14:59:50,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.73 | bwd_microstep: 4615.37 | bwd_inner_microstep: 4610.38 | bwd_allreduce_microstep: 4.91 | step_microstep: 41.80
-[2025-01-25 14:59:50,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.69 | bwd: 4615.39 | bwd_inner: 4610.38 | bwd_allreduce: 4.95 | step: 41.81
- 39%|███▊      | 2242/5800 [6:13:20<6:48:37,  6.89s/it]                                                       {'loss': 0.063, 'grad_norm': 8.06696891784668, 'learning_rate': 2.8082326233310636e-05, 'epoch': 19.33}
- 39%|███▊      | 2242/5800 [6:13:20<6:48:37,  6.89s/it]score1 tensor([[0.4551],
-        [0.3867],
-        [0.4434],
-        [0.2910]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4609, 0.4648, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 14:59:57,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 14:59:57,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.32 | bwd_microstep: 4615.45 | bwd_inner_microstep: 4610.35 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.21
-[2025-01-25 14:59:57,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.27 | bwd: 4615.48 | bwd_inner: 4610.35 | bwd_allreduce: 5.05 | step: 43.22
- 39%|███▊      | 2243/5800 [6:13:27<6:48:30,  6.89s/it]                                                       {'loss': 0.0664, 'grad_norm': 4.216248035430908, 'learning_rate': 2.807210940779676e-05, 'epoch': 19.34}
- 39%|███▊      | 2243/5800 [6:13:27<6:48:30,  6.89s/it]score1 tensor([[0.5234],
-        [0.5039],
-        [0.4082],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6172, 0.4961, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0952, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:03,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 15:00:03,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.67 | bwd_microstep: 4613.97 | bwd_inner_microstep: 4609.16 | bwd_allreduce_microstep: 4.73 | step_microstep: 42.36
-[2025-01-25 15:00:03,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.59 | bwd: 4613.99 | bwd_inner: 4609.16 | bwd_allreduce: 4.77 | step: 42.37
- 39%|███▊      | 2244/5800 [6:13:33<6:48:24,  6.89s/it]                                                       {'loss': 0.0952, 'grad_norm': 8.40127182006836, 'learning_rate': 2.8061890065259104e-05, 'epoch': 19.34}
- 39%|███▊      | 2244/5800 [6:13:33<6:48:24,  6.89s/it]score1 tensor([[0.4844],
-        [0.4609],
-        [0.4316],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4941, 0.4590, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:10,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 15:00:10,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.73 | bwd_microstep: 4620.63 | bwd_inner_microstep: 4615.35 | bwd_allreduce_microstep: 5.21 | step_microstep: 43.23
-[2025-01-25 15:00:10,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.69 | bwd: 4620.65 | bwd_inner: 4615.35 | bwd_allreduce: 5.24 | step: 43.24
- 39%|███▊      | 2245/5800 [6:13:40<6:48:24,  6.89s/it]                                                       {'loss': 0.0186, 'grad_norm': 3.8440778255462646, 'learning_rate': 2.805166820888424e-05, 'epoch': 19.35}
- 39%|███▊      | 2245/5800 [6:13:40<6:48:24,  6.89s/it]score1 tensor([[0.5664],
-        [0.6055],
-        [0.5391],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5508, 0.6211, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0493, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:17,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:00:17,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.29 | bwd_microstep: 4612.51 | bwd_inner_microstep: 4607.78 | bwd_allreduce_microstep: 4.65 | step_microstep: 40.99
-[2025-01-25 15:00:17,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4612.53 | bwd_inner: 4607.78 | bwd_allreduce: 4.68 | step: 41.00
- 39%|███▊      | 2246/5800 [6:13:47<6:48:14,  6.89s/it]                                                       {'loss': 0.0493, 'grad_norm': 4.042176246643066, 'learning_rate': 2.804144384185952e-05, 'epoch': 19.36}
- 39%|███▊      | 2246/5800 [6:13:47<6:48:14,  6.89s/it]score1 tensor([[0.4121],
-        [0.5820],
-        [0.4453],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.6445, 0.4551, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0591, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:24,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 15:00:24,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.52 | bwd_microstep: 4611.80 | bwd_inner_microstep: 4607.15 | bwd_allreduce_microstep: 4.57 | step_microstep: 42.43
-[2025-01-25 15:00:24,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.48 | bwd: 4611.83 | bwd_inner: 4607.15 | bwd_allreduce: 4.61 | step: 42.44
- 39%|███▊      | 2247/5800 [6:13:54<6:48:03,  6.89s/it]                                                       {'loss': 0.0591, 'grad_norm': 0.8458038568496704, 'learning_rate': 2.8031216967373077e-05, 'epoch': 19.37}
- 39%|███▊      | 2247/5800 [6:13:54<6:48:03,  6.89s/it]score1 tensor([[0.5742],
-        [0.5352],
-        [0.5273],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5156, 0.4844, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:31,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:00:31,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.95 | bwd_microstep: 4613.64 | bwd_inner_microstep: 4608.63 | bwd_allreduce_microstep: 4.90 | step_microstep: 46.65
-[2025-01-25 15:00:31,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.91 | bwd: 4613.67 | bwd_inner: 4608.63 | bwd_allreduce: 4.94 | step: 46.66
- 39%|███▉      | 2248/5800 [6:14:01<6:48:05,  6.89s/it]                                                       {'loss': 0.0386, 'grad_norm': 4.150024890899658, 'learning_rate': 2.802098758861383e-05, 'epoch': 19.38}
- 39%|███▉      | 2248/5800 [6:14:01<6:48:05,  6.89s/it]score1 tensor([[0.5742],
-        [0.4570],
-        [0.5820],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4629, 0.6250, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:38,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 15:00:38,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.29 | bwd_microstep: 4610.48 | bwd_inner_microstep: 4605.68 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.60
-[2025-01-25 15:00:38,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.24 | bwd: 4610.50 | bwd_inner: 4605.68 | bwd_allreduce: 4.76 | step: 43.56
- 39%|███▉      | 2249/5800 [6:14:08<6:47:54,  6.89s/it]                                                       {'loss': 0.019, 'grad_norm': 0.8088418245315552, 'learning_rate': 2.8010755708771474e-05, 'epoch': 19.39}
- 39%|███▉      | 2249/5800 [6:14:08<6:47:54,  6.89s/it]score1 tensor([[0.4883],
-        [0.4941],
-        [0.5391],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4375, 0.4707, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:45,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 15:00:45,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.68 | bwd_microstep: 4611.64 | bwd_inner_microstep: 4606.56 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.90
-[2025-01-25 15:00:45,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.63 | bwd: 4611.66 | bwd_inner: 4606.56 | bwd_allreduce: 5.02 | step: 42.91
- 39%|███▉      | 2250/5800 [6:14:15<6:47:45,  6.89s/it]                                                       {'loss': 0.043, 'grad_norm': 0.5275598168373108, 'learning_rate': 2.8000521331036496e-05, 'epoch': 19.4}
- 39%|███▉      | 2250/5800 [6:14:15<6:47:45,  6.89s/it]score1 tensor([[0.4609],
-        [0.4629],
-        [0.6406],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4199, 0.6055, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:52,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:00:52,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.18 | bwd_microstep: 4611.29 | bwd_inner_microstep: 4606.34 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.20
-[2025-01-25 15:00:52,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.14 | bwd: 4611.32 | bwd_inner: 4606.34 | bwd_allreduce: 4.91 | step: 43.21
- 39%|███▉      | 2251/5800 [6:14:22<6:47:41,  6.89s/it]                                                       {'loss': 0.0371, 'grad_norm': 8.531221389770508, 'learning_rate': 2.7990284458600143e-05, 'epoch': 19.41}
- 39%|███▉      | 2251/5800 [6:14:22<6:47:41,  6.89s/it]score1 tensor([[0.5898],
-        [0.5391],
-        [0.5820],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4844, 0.6406, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:00:59,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 15:00:59,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.64 | bwd_microstep: 4619.40 | bwd_inner_microstep: 4614.65 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.36
-[2025-01-25 15:00:59,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.59 | bwd: 4619.43 | bwd_inner: 4614.65 | bwd_allreduce: 4.72 | step: 43.37
- 39%|███▉      | 2252/5800 [6:14:29<6:47:42,  6.89s/it]                                                       {'loss': 0.0488, 'grad_norm': 0.6146556735038757, 'learning_rate': 2.7980045094654458e-05, 'epoch': 19.41}
- 39%|███▉      | 2252/5800 [6:14:29<6:47:42,  6.89s/it]score1 tensor([[0.4980],
-        [0.5039],
-        [0.5273],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4395, 0.5469, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:05,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:01:05,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.61 | bwd_microstep: 4616.85 | bwd_inner_microstep: 4612.09 | bwd_allreduce_microstep: 4.67 | step_microstep: 43.48
-[2025-01-25 15:01:05,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.57 | bwd: 4616.88 | bwd_inner: 4612.09 | bwd_allreduce: 4.72 | step: 43.49
- 39%|███▉      | 2253/5800 [6:14:35<6:47:30,  6.89s/it]                                                       {'loss': 0.043, 'grad_norm': 4.046074867248535, 'learning_rate': 2.7969803242392236e-05, 'epoch': 19.42}
- 39%|███▉      | 2253/5800 [6:14:35<6:47:30,  6.89s/it]score1 tensor([[0.5742],
-        [0.4336],
-        [0.5664],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4043, 0.6172, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:12,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 15:01:12,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.40 | bwd_microstep: 4611.67 | bwd_inner_microstep: 4606.59 | bwd_allreduce_microstep: 5.00 | step_microstep: 41.39
-[2025-01-25 15:01:12,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.35 | bwd: 4611.70 | bwd_inner: 4606.59 | bwd_allreduce: 5.04 | step: 41.39
- 39%|███▉      | 2254/5800 [6:14:42<6:47:18,  6.89s/it]                                                       {'loss': 0.0347, 'grad_norm': 0.7030783891677856, 'learning_rate': 2.795955890500708e-05, 'epoch': 19.43}
- 39%|███▉      | 2254/5800 [6:14:42<6:47:18,  6.89s/it]score1 tensor([[0.5898],
-        [0.4785],
-        [0.5273],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4590, 0.6094, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:19,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:01:19,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.67 | bwd_microstep: 4618.18 | bwd_inner_microstep: 4613.72 | bwd_allreduce_microstep: 4.37 | step_microstep: 42.20
-[2025-01-25 15:01:19,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.63 | bwd: 4618.20 | bwd_inner: 4613.72 | bwd_allreduce: 4.41 | step: 42.21
- 39%|███▉      | 2255/5800 [6:14:49<6:47:30,  6.90s/it]                                                       {'loss': 0.0386, 'grad_norm': 0.6508904099464417, 'learning_rate': 2.7949312085693344e-05, 'epoch': 19.44}
- 39%|███▉      | 2255/5800 [6:14:49<6:47:30,  6.90s/it]score1 tensor([[0.4648],
-        [0.4609],
-        [0.4316],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4160, 0.3887, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:26,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 15:01:26,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.71 | bwd_microstep: 4618.24 | bwd_inner_microstep: 4613.47 | bwd_allreduce_microstep: 4.68 | step_microstep: 45.08
-[2025-01-25 15:01:26,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.68 | bwd: 4618.26 | bwd_inner: 4613.47 | bwd_allreduce: 4.73 | step: 45.08
- 39%|███▉      | 2256/5800 [6:14:56<6:47:18,  6.90s/it]                                                       {'loss': 0.0449, 'grad_norm': 3.764953851699829, 'learning_rate': 2.793906278764617e-05, 'epoch': 19.45}
- 39%|███▉      | 2256/5800 [6:14:56<6:47:18,  6.90s/it]score1 tensor([[0.4727],
-        [0.4023],
-        [0.5078],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.3926, 0.5117, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:33,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 15:01:33,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.41 | bwd_microstep: 4607.26 | bwd_inner_microstep: 4602.83 | bwd_allreduce_microstep: 4.34 | step_microstep: 42.47
-[2025-01-25 15:01:33,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.37 | bwd: 4607.28 | bwd_inner: 4602.83 | bwd_allreduce: 4.37 | step: 42.47
- 39%|███▉      | 2257/5800 [6:15:03<6:46:56,  6.89s/it]                                                       {'loss': 0.0186, 'grad_norm': 4.09108829498291, 'learning_rate': 2.7928811014061465e-05, 'epoch': 19.46}
- 39%|███▉      | 2257/5800 [6:15:03<6:46:56,  6.89s/it]score1 tensor([[0.4023],
-        [0.3438],
-        [0.5312],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.3652, 0.5078, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:40,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 15:01:40,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.21 | bwd_microstep: 4608.24 | bwd_inner_microstep: 4603.56 | bwd_allreduce_microstep: 4.59 | step_microstep: 43.19
-[2025-01-25 15:01:40,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.17 | bwd: 4608.26 | bwd_inner: 4603.55 | bwd_allreduce: 4.64 | step: 43.20
- 39%|███▉      | 2258/5800 [6:15:10<6:46:37,  6.89s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.9481746554374695, 'learning_rate': 2.7918556768135908e-05, 'epoch': 19.47}
- 39%|███▉      | 2258/5800 [6:15:10<6:46:37,  6.89s/it]score1 tensor([[0.4766],
-        [0.4238],
-        [0.4277],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4375, 0.4844, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:47,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 15:01:47,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.86 | bwd_microstep: 4619.49 | bwd_inner_microstep: 4614.77 | bwd_allreduce_microstep: 4.64 | step_microstep: 46.17
-[2025-01-25 15:01:47,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.82 | bwd: 4619.51 | bwd_inner: 4614.77 | bwd_allreduce: 4.68 | step: 46.18
- 39%|███▉      | 2259/5800 [6:15:17<6:46:54,  6.89s/it]                                                       {'loss': 0.0254, 'grad_norm': 8.03952693939209, 'learning_rate': 2.790830005306695e-05, 'epoch': 19.47}
- 39%|███▉      | 2259/5800 [6:15:17<6:46:54,  6.89s/it]score1 tensor([[0.3965],
-        [0.4746],
-        [0.4414],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4863, 0.4258, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:01:54,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 15:01:54,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4626.93 | bwd_inner_microstep: 4621.81 | bwd_allreduce_microstep: 5.04 | step_microstep: 44.32
-[2025-01-25 15:01:54,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.21 | bwd: 4626.96 | bwd_inner: 4621.80 | bwd_allreduce: 5.08 | step: 44.33
- 39%|███▉      | 2260/5800 [6:15:24<6:46:56,  6.90s/it]                                                       {'loss': 0.0239, 'grad_norm': 4.003660202026367, 'learning_rate': 2.7898040872052815e-05, 'epoch': 19.48}
- 39%|███▉      | 2260/5800 [6:15:24<6:46:56,  6.90s/it]score1 tensor([[0.5391],
-        [0.4492],
-        [0.4883],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4824, 0.5078, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:01,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 15:02:01,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.63 | bwd_microstep: 4631.87 | bwd_inner_microstep: 4627.04 | bwd_allreduce_microstep: 4.73 | step_microstep: 41.87
-[2025-01-25 15:02:01,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.60 | bwd: 4631.90 | bwd_inner: 4627.04 | bwd_allreduce: 4.78 | step: 41.89
- 39%|███▉      | 2261/5800 [6:15:31<6:47:01,  6.90s/it]                                                       {'loss': 0.0483, 'grad_norm': 8.56633472442627, 'learning_rate': 2.7887779228292493e-05, 'epoch': 19.49}
- 39%|███▉      | 2261/5800 [6:15:31<6:47:01,  6.90s/it]score1 tensor([[0.4844],
-        [0.5039],
-        [0.4961],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5547, 0.5469, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:08,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:02:08,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.74 | bwd_microstep: 4631.60 | bwd_inner_microstep: 4626.66 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.47
-[2025-01-25 15:02:08,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4631.63 | bwd_inner: 4626.66 | bwd_allreduce: 4.89 | step: 43.48
- 39%|███▉      | 2262/5800 [6:15:38<6:47:12,  6.91s/it]                                                       {'loss': 0.043, 'grad_norm': 8.383588790893555, 'learning_rate': 2.7877515124985745e-05, 'epoch': 19.5}
- 39%|███▉      | 2262/5800 [6:15:38<6:47:12,  6.91s/it]score1 tensor([[0.4395],
-        [0.4727],
-        [0.5547],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.4648, 0.5625, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:14,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:02:14,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.04 | bwd_microstep: 4631.95 | bwd_inner_microstep: 4627.25 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.70
-[2025-01-25 15:02:14,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4631.97 | bwd_inner: 4627.25 | bwd_allreduce: 4.66 | step: 42.72
- 39%|███▉      | 2263/5800 [6:15:44<6:47:14,  6.91s/it]                                                       {'loss': 0.0078, 'grad_norm': 0.7049365043640137, 'learning_rate': 2.7867248565333094e-05, 'epoch': 19.51}
- 39%|███▉      | 2263/5800 [6:15:44<6:47:14,  6.91s/it]score1 tensor([[0.3867],
-        [0.4473],
-        [0.5039],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4570, 0.5430, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:21,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:02:21,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.34 | bwd_microstep: 4640.40 | bwd_inner_microstep: 4635.54 | bwd_allreduce_microstep: 4.78 | step_microstep: 42.86
-[2025-01-25 15:02:21,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.30 | bwd: 4640.43 | bwd_inner: 4635.54 | bwd_allreduce: 4.82 | step: 42.87
- 39%|███▉      | 2264/5800 [6:15:51<6:47:20,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 0.560341477394104, 'learning_rate': 2.7856979552535835e-05, 'epoch': 19.52}
- 39%|███▉      | 2264/5800 [6:15:51<6:47:20,  6.91s/it]score1 tensor([[0.5195],
-        [0.5820],
-        [0.4434],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5273, 0.4023, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:28,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:02:28,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.63 | bwd_microstep: 4639.96 | bwd_inner_microstep: 4634.91 | bwd_allreduce_microstep: 4.91 | step_microstep: 41.96
-[2025-01-25 15:02:28,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.60 | bwd: 4639.98 | bwd_inner: 4634.91 | bwd_allreduce: 4.99 | step: 41.97
- 39%|███▉      | 2265/5800 [6:15:58<6:47:20,  6.91s/it]                                                       {'loss': 0.0337, 'grad_norm': 0.6059259176254272, 'learning_rate': 2.7846708089796017e-05, 'epoch': 19.53}
- 39%|███▉      | 2265/5800 [6:15:58<6:47:20,  6.91s/it]score1 tensor([[0.4023],
-        [0.6211],
-        [0.4590],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.7070, 0.4141, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:35,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 15:02:35,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4641.32 | bwd_inner_microstep: 4636.84 | bwd_allreduce_microstep: 4.39 | step_microstep: 42.16
-[2025-01-25 15:02:35,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4641.35 | bwd_inner: 4636.84 | bwd_allreduce: 4.42 | step: 42.17
- 39%|███▉      | 2266/5800 [6:16:05<6:47:22,  6.92s/it]                                                       {'loss': 0.0508, 'grad_norm': 3.4273951053619385, 'learning_rate': 2.783643418031646e-05, 'epoch': 19.53}
- 39%|███▉      | 2266/5800 [6:16:05<6:47:22,  6.92s/it]score1 tensor([[0.5195],
-        [0.5430],
-        [0.5234],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5312, 0.4980, 0.6836], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:42,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 15:02:42,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4630.59 | bwd_inner_microstep: 4626.01 | bwd_allreduce_microstep: 4.47 | step_microstep: 42.55
-[2025-01-25 15:02:42,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4630.61 | bwd_inner: 4626.01 | bwd_allreduce: 4.53 | step: 42.56
- 39%|███▉      | 2267/5800 [6:16:12<6:47:00,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 4.09790563583374, 'learning_rate': 2.7826157827300756e-05, 'epoch': 19.54}
- 39%|███▉      | 2267/5800 [6:16:12<6:47:00,  6.91s/it]score1 tensor([[0.5938],
-        [0.4727],
-        [0.5117],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5000, 0.4609, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:49,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 15:02:49,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4632.01 | bwd_inner_microstep: 4627.03 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.07
-[2025-01-25 15:02:49,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.09 | bwd: 4632.04 | bwd_inner: 4627.03 | bwd_allreduce: 4.92 | step: 43.09
- 39%|███▉      | 2268/5800 [6:16:19<6:46:50,  6.91s/it]                                                       {'loss': 0.0273, 'grad_norm': 0.6134623289108276, 'learning_rate': 2.7815879033953246e-05, 'epoch': 19.55}
- 39%|███▉      | 2268/5800 [6:16:19<6:46:50,  6.91s/it]score1 tensor([[0.4258],
-        [0.5820],
-        [0.4707],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5664, 0.4551, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:02:56,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:02:56,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.98 | bwd_microstep: 4636.45 | bwd_inner_microstep: 4631.85 | bwd_allreduce_microstep: 4.53 | step_microstep: 41.79
-[2025-01-25 15:02:56,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.95 | bwd: 4636.47 | bwd_inner: 4631.85 | bwd_allreduce: 4.56 | step: 41.80
- 39%|███▉      | 2269/5800 [6:16:26<6:46:48,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 0.5603498816490173, 'learning_rate': 2.7805597803479037e-05, 'epoch': 19.56}
- 39%|███▉      | 2269/5800 [6:16:26<6:46:48,  6.91s/it]score1 tensor([[0.5156],
-        [0.5039],
-        [0.4648],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5000, 0.4980, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:03,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 15:03:03,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.92 | bwd_microstep: 4640.96 | bwd_inner_microstep: 4635.77 | bwd_allreduce_microstep: 5.09 | step_microstep: 41.33
-[2025-01-25 15:03:03,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.87 | bwd: 4640.99 | bwd_inner: 4635.77 | bwd_allreduce: 5.14 | step: 41.34
- 39%|███▉      | 2270/5800 [6:16:33<6:46:46,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.5839961767196655, 'learning_rate': 2.7795314139083992e-05, 'epoch': 19.57}
- 39%|███▉      | 2270/5800 [6:16:33<6:46:46,  6.91s/it]score1 tensor([[0.4668],
-        [0.5312],
-        [0.5078],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5977, 0.5469, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0601, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:10,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 15:03:10,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.63 | bwd_microstep: 4641.63 | bwd_inner_microstep: 4636.82 | bwd_allreduce_microstep: 4.72 | step_microstep: 46.47
-[2025-01-25 15:03:10,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.60 | bwd: 4641.65 | bwd_inner: 4636.82 | bwd_allreduce: 4.76 | step: 46.48
- 39%|███▉      | 2271/5800 [6:16:40<6:46:48,  6.92s/it]                                                       {'loss': 0.0601, 'grad_norm': 4.467309951782227, 'learning_rate': 2.778502804397474e-05, 'epoch': 19.58}
- 39%|███▉      | 2271/5800 [6:16:40<6:46:48,  6.92s/it]score1 tensor([[0.5117],
-        [0.4922],
-        [0.4824],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5117, 0.5469, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:17,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 15:03:17,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.23 | bwd_microstep: 4634.15 | bwd_inner_microstep: 4628.32 | bwd_allreduce_microstep: 5.75 | step_microstep: 43.09
-[2025-01-25 15:03:17,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.18 | bwd: 4634.17 | bwd_inner: 4628.32 | bwd_allreduce: 5.79 | step: 43.10
- 39%|███▉      | 2272/5800 [6:16:47<6:46:39,  6.92s/it]                                                       {'loss': 0.0503, 'grad_norm': 4.044907569885254, 'learning_rate': 2.777473952135866e-05, 'epoch': 19.59}
- 39%|███▉      | 2272/5800 [6:16:47<6:46:39,  6.92s/it]score1 tensor([[0.5312],
-        [0.4883],
-        [0.4180],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4863, 0.4258, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:24,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:03:24,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.97 | bwd_microstep: 4636.69 | bwd_inner_microstep: 4631.44 | bwd_allreduce_microstep: 5.15 | step_microstep: 45.58
-[2025-01-25 15:03:24,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.93 | bwd: 4636.72 | bwd_inner: 4631.44 | bwd_allreduce: 5.21 | step: 45.59
- 39%|███▉      | 2273/5800 [6:16:54<6:46:35,  6.92s/it]                                                       {'loss': 0.0083, 'grad_norm': 0.62522292137146, 'learning_rate': 2.7764448574443893e-05, 'epoch': 19.59}
- 39%|███▉      | 2273/5800 [6:16:54<6:46:35,  6.92s/it]score1 tensor([[0.5547],
-        [0.4277],
-        [0.5508],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4180, 0.5391, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:31,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:03:31,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.47 | bwd_microstep: 4635.65 | bwd_inner_microstep: 4631.06 | bwd_allreduce_microstep: 4.51 | step_microstep: 43.78
-[2025-01-25 15:03:31,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.44 | bwd: 4635.67 | bwd_inner: 4631.06 | bwd_allreduce: 4.55 | step: 43.79
- 39%|███▉      | 2274/5800 [6:17:01<6:46:28,  6.92s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.277435302734375, 'learning_rate': 2.7754155206439337e-05, 'epoch': 19.6}
- 39%|███▉      | 2274/5800 [6:17:01<6:46:28,  6.92s/it]score1 tensor([[0.4941],
-        [0.5469],
-        [0.5820],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4941, 0.6055, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:37,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 15:03:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4638.85 | bwd_inner_microstep: 4633.78 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.14
-[2025-01-25 15:03:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.10 | bwd: 4638.87 | bwd_inner: 4633.78 | bwd_allreduce: 5.03 | step: 42.15
- 39%|███▉      | 2275/5800 [6:17:07<6:46:16,  6.92s/it]                                                       {'loss': 0.0352, 'grad_norm': 0.5709355473518372, 'learning_rate': 2.774385942055464e-05, 'epoch': 19.61}
- 39%|███▉      | 2275/5800 [6:17:07<6:46:16,  6.92s/it]score1 tensor([[0.5547],
-        [0.5156],
-        [0.4727],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4629, 0.5000, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:44,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 15:03:44,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4641.75 | bwd_inner_microstep: 4637.01 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.07
-[2025-01-25 15:03:44,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.09 | bwd: 4641.78 | bwd_inner: 4637.01 | bwd_allreduce: 4.68 | step: 42.08
- 39%|███▉      | 2276/5800 [6:17:14<6:46:14,  6.92s/it]                                                       {'loss': 0.0298, 'grad_norm': 4.338747501373291, 'learning_rate': 2.773356122000021e-05, 'epoch': 19.62}
- 39%|███▉      | 2276/5800 [6:17:14<6:46:14,  6.92s/it]score1 tensor([[0.3770],
-        [0.5938],
-        [0.5117],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.6211, 0.4902, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:51,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:03:51,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.21 | bwd_microstep: 4637.08 | bwd_inner_microstep: 4632.34 | bwd_allreduce_microstep: 4.66 | step_microstep: 46.21
-[2025-01-25 15:03:51,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.18 | bwd: 4637.10 | bwd_inner: 4632.34 | bwd_allreduce: 4.70 | step: 46.22
- 39%|█��█▉      | 2277/5800 [6:17:21<6:46:11,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.5390425324440002, 'learning_rate': 2.7723260607987202e-05, 'epoch': 19.63}
- 39%|███▉      | 2277/5800 [6:17:21<6:46:11,  6.92s/it]score1 tensor([[0.4590],
-        [0.4746],
-        [0.5273],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4277, 0.5156, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:03:58,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 15:03:58,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4633.73 | bwd_inner_microstep: 4629.26 | bwd_allreduce_microstep: 4.39 | step_microstep: 41.61
-[2025-01-25 15:03:58,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.87 | bwd: 4633.75 | bwd_inner: 4629.26 | bwd_allreduce: 4.42 | step: 41.62
- 39%|███▉      | 2278/5800 [6:17:28<6:46:00,  6.92s/it]                                                       {'loss': 0.0366, 'grad_norm': 4.245129108428955, 'learning_rate': 2.771295758772753e-05, 'epoch': 19.64}
- 39%|███▉      | 2278/5800 [6:17:28<6:46:00,  6.92s/it]score1 tensor([[0.4863],
-        [0.5547],
-        [0.4668],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5586, 0.4629, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:05,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 15:04:05,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4640.18 | bwd_inner_microstep: 4634.88 | bwd_allreduce_microstep: 5.18 | step_microstep: 43.93
-[2025-01-25 15:04:05,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.03 | bwd: 4640.20 | bwd_inner: 4634.88 | bwd_allreduce: 5.24 | step: 43.94
- 39%|███▉      | 2279/5800 [6:17:35<6:45:59,  6.92s/it]                                                       {'loss': 0.0054, 'grad_norm': 0.8925101161003113, 'learning_rate': 2.770265216243384e-05, 'epoch': 19.65}
- 39%|███▉      | 2279/5800 [6:17:35<6:45:59,  6.92s/it]score1 tensor([[0.4727],
-        [0.4375],
-        [0.4141],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.3906, 0.4141, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0571, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:12,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 15:04:12,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.19 | bwd_microstep: 4585.92 | bwd_inner_microstep: 4581.13 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.15
-[2025-01-25 15:04:12,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.16 | bwd: 4585.95 | bwd_inner: 4581.13 | bwd_allreduce: 4.75 | step: 41.16
- 39%|███▉      | 2280/5800 [6:17:42<6:44:52,  6.90s/it]                                                       {'loss': 0.0571, 'grad_norm': 6.118075847625732, 'learning_rate': 2.7692344335319564e-05, 'epoch': 19.66}
- 39%|███▉      | 2280/5800 [6:17:42<6:44:52,  6.90s/it]score1 tensor([[0.5977],
-        [0.4551],
-        [0.5273],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4609, 0.5312, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:19,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:04:19,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.76 | bwd_microstep: 4632.46 | bwd_inner_microstep: 4627.76 | bwd_allreduce_microstep: 4.63 | step_microstep: 41.13
-[2025-01-25 15:04:19,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.73 | bwd: 4632.49 | bwd_inner: 4627.75 | bwd_allreduce: 4.67 | step: 41.14
- 39%|███▉      | 2281/5800 [6:17:49<6:44:59,  6.91s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.731369972229004, 'learning_rate': 2.7682034109598843e-05, 'epoch': 19.66}
- 39%|███▉      | 2281/5800 [6:17:49<6:44:59,  6.91s/it]score1 tensor([[0.5117],
-        [0.5156],
-        [0.3906],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5703, 0.4004, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:26,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 15:04:26,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4639.50 | bwd_inner_microstep: 4634.66 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.74
-[2025-01-25 15:04:26,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.33 | bwd: 4639.52 | bwd_inner: 4634.66 | bwd_allreduce: 4.79 | step: 41.75
- 39%|███▉      | 2282/5800 [6:17:56<6:45:10,  6.91s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.5626809000968933, 'learning_rate': 2.7671721488486593e-05, 'epoch': 19.67}
- 39%|███▉      | 2282/5800 [6:17:56<6:45:10,  6.91s/it]score1 tensor([[0.4121],
-        [0.4434],
-        [0.4082],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.4570, 0.4512, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:33,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:04:33,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.46 | bwd_microstep: 4637.35 | bwd_inner_microstep: 4632.71 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.07
-[2025-01-25 15:04:33,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.43 | bwd: 4637.38 | bwd_inner: 4632.71 | bwd_allreduce: 4.60 | step: 41.08
- 39%|███▉      | 2283/5800 [6:18:03<6:45:07,  6.91s/it]                                                       {'loss': 0.0342, 'grad_norm': 4.0156168937683105, 'learning_rate': 2.7661406475198472e-05, 'epoch': 19.68}
- 39%|███▉      | 2283/5800 [6:18:03<6:45:07,  6.91s/it]score1 tensor([[0.4844],
-        [0.4902],
-        [0.4316],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4648, 0.4473, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:40,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:04:40,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.11 | bwd_microstep: 4629.59 | bwd_inner_microstep: 4624.69 | bwd_allreduce_microstep: 4.80 | step_microstep: 39.14
-[2025-01-25 15:04:40,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.06 | bwd: 4629.61 | bwd_inner: 4624.69 | bwd_allreduce: 4.85 | step: 39.15
- 39%|███▉      | 2284/5800 [6:18:10<6:44:48,  6.91s/it]                                                       {'loss': 0.0142, 'grad_norm': 0.5650109052658081, 'learning_rate': 2.7651089072950875e-05, 'epoch': 19.69}
- 39%|███▉      | 2284/5800 [6:18:10<6:44:48,  6.91s/it]score1 tensor([[0.5195],
-        [0.5391],
-        [0.4512],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5234, 0.3789, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:47,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 15:04:47,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.78 | bwd_microstep: 4631.09 | bwd_inner_microstep: 4626.42 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.54
-[2025-01-25 15:04:47,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.74 | bwd: 4631.11 | bwd_inner: 4626.42 | bwd_allreduce: 4.62 | step: 41.55
- 39%|███▉      | 2285/5800 [6:18:17<6:44:46,  6.91s/it]                                                       {'loss': 0.0596, 'grad_norm': 4.011119842529297, 'learning_rate': 2.764076928496094e-05, 'epoch': 19.7}
- 39%|███▉      | 2285/5800 [6:18:17<6:44:46,  6.91s/it]score1 tensor([[0.5156],
-        [0.6289],
-        [0.5234],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6484, 0.4746, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:04:53,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:04:53,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.92 | bwd_microstep: 4633.49 | bwd_inner_microstep: 4628.69 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.68
-[2025-01-25 15:04:53,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.90 | bwd: 4633.51 | bwd_inner: 4628.69 | bwd_allreduce: 4.75 | step: 42.69
- 39%|███▉      | 2286/5800 [6:18:23<6:44:39,  6.91s/it]                                                       {'loss': 0.0454, 'grad_norm': 0.7099324464797974, 'learning_rate': 2.763044711444657e-05, 'epoch': 19.71}
- 39%|███▉      | 2286/5800 [6:18:23<6:44:39,  6.91s/it]score1 tensor([[0.4902],
-        [0.4941],
-        [0.5234],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4922, 0.5664, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:00,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 15:05:00,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.73 | bwd_microstep: 4634.53 | bwd_inner_microstep: 4629.97 | bwd_allreduce_microstep: 4.48 | step_microstep: 42.69
-[2025-01-25 15:05:00,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.68 | bwd: 4634.56 | bwd_inner: 4629.97 | bwd_allreduce: 4.52 | step: 42.69
- 39%|███▉      | 2287/5800 [6:18:30<6:44:32,  6.91s/it]                                                       {'loss': 0.021, 'grad_norm': 0.502299964427948, 'learning_rate': 2.7620122564626383e-05, 'epoch': 19.72}
- 39%|███▉      | 2287/5800 [6:18:30<6:44:32,  6.91s/it]score1 tensor([[0.6250],
-        [0.5430],
-        [0.5078],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.5273, 0.4492, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:07,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 15:05:07,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.87 | bwd_microstep: 4642.63 | bwd_inner_microstep: 4636.86 | bwd_allreduce_microstep: 5.70 | step_microstep: 43.47
-[2025-01-25 15:05:07,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.84 | bwd: 4642.65 | bwd_inner: 4636.85 | bwd_allreduce: 5.73 | step: 43.48
- 39%|███▉      | 2288/5800 [6:18:37<6:44:35,  6.91s/it]                                                       {'loss': 0.0391, 'grad_norm': 0.783743143081665, 'learning_rate': 2.7609795638719767e-05, 'epoch': 19.72}
- 39%|███▉      | 2288/5800 [6:18:37<6:44:35,  6.91s/it]score1 tensor([[0.4453],
-        [0.4648],
-        [0.4961],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4512, 0.4492, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:14,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:05:14,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.16 | bwd_microstep: 4632.05 | bwd_inner_microstep: 4627.35 | bwd_allreduce_microstep: 4.60 | step_microstep: 40.96
-[2025-01-25 15:05:14,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.11 | bwd: 4632.08 | bwd_inner: 4627.35 | bwd_allreduce: 4.65 | step: 40.97
- 39%|███▉      | 2289/5800 [6:18:44<6:44:31,  6.91s/it]                                                       {'loss': 0.0293, 'grad_norm': 4.047839641571045, 'learning_rate': 2.759946633994683e-05, 'epoch': 19.73}
- 39%|███▉      | 2289/5800 [6:18:44<6:44:31,  6.91s/it]score1 tensor([[0.5664],
-        [0.4531],
-        [0.4883],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4023, 0.4688, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:21,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:05:21,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.17 | bwd_microstep: 4636.57 | bwd_inner_microstep: 4631.67 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.85
-[2025-01-25 15:05:21,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.14 | bwd: 4636.59 | bwd_inner: 4631.67 | bwd_allreduce: 4.86 | step: 44.86
- 39%|███▉      | 2290/5800 [6:18:51<6:44:25,  6.91s/it]                                                       {'loss': 0.0371, 'grad_norm': 4.131101608276367, 'learning_rate': 2.758913467152842e-05, 'epoch': 19.74}
- 39%|███▉      | 2290/5800 [6:18:51<6:44:25,  6.91s/it]score1 tensor([[0.6055],
-        [0.4453],
-        [0.6094],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4492, 0.6445, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:28,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 15:05:28,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.43 | bwd_microstep: 4634.03 | bwd_inner_microstep: 4629.28 | bwd_allreduce_microstep: 4.66 | step_microstep: 40.64
-[2025-01-25 15:05:28,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.39 | bwd: 4634.06 | bwd_inner: 4629.28 | bwd_allreduce: 4.71 | step: 40.65
- 40%|███▉      | 2291/5800 [6:18:58<6:44:17,  6.91s/it]                                                       {'loss': 0.0283, 'grad_norm': 9.07674789428711, 'learning_rate': 2.757880063668614e-05, 'epoch': 19.75}
- 40%|███▉      | 2291/5800 [6:18:58<6:44:17,  6.91s/it]score1 tensor([[0.5586],
-        [0.4395],
-        [0.4980],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3809, 0.5234, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:35,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 15:05:35,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.53 | bwd_microstep: 4639.77 | bwd_inner_microstep: 4634.99 | bwd_allreduce_microstep: 4.71 | step_microstep: 46.94
-[2025-01-25 15:05:35,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.50 | bwd: 4639.80 | bwd_inner: 4634.99 | bwd_allreduce: 4.74 | step: 46.96
- 40%|███▉      | 2292/5800 [6:19:05<6:44:20,  6.92s/it]                                                       {'loss': 0.0264, 'grad_norm': 0.48910990357398987, 'learning_rate': 2.7568464238642314e-05, 'epoch': 19.76}
- 40%|███▉      | 2292/5800 [6:19:05<6:44:20,  6.92s/it]score1 tensor([[0.5352],
-        [0.4180],
-        [0.4844],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4551, 0.4941, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:42,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:05:42,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.23 | bwd_microstep: 4633.14 | bwd_inner_microstep: 4628.27 | bwd_allreduce_microstep: 4.76 | step_microstep: 40.59
-[2025-01-25 15:05:42,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.20 | bwd: 4633.16 | bwd_inner: 4628.27 | bwd_allreduce: 4.81 | step: 40.59
- 40%|███▉      | 2293/5800 [6:19:12<6:44:08,  6.91s/it]                                                       {'loss': 0.0332, 'grad_norm': 8.142845153808594, 'learning_rate': 2.7558125480620018e-05, 'epoch': 19.77}
- 40%|███▉      | 2293/5800 [6:19:12<6:44:08,  6.91s/it]score1 tensor([[0.4727],
-        [0.3789],
-        [0.5000],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.2812, 0.4785, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:49,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 15:05:49,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.76 | bwd_microstep: 4631.81 | bwd_inner_microstep: 4626.82 | bwd_allreduce_microstep: 4.88 | step_microstep: 40.66
-[2025-01-25 15:05:49,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.73 | bwd: 4631.83 | bwd_inner: 4626.82 | bwd_allreduce: 4.94 | step: 40.66
- 40%|███▉      | 2294/5800 [6:19:19<6:43:52,  6.91s/it]                                                       {'loss': 0.0454, 'grad_norm': 3.620849847793579, 'learning_rate': 2.7547784365843047e-05, 'epoch': 19.78}
- 40%|███▉      | 2294/5800 [6:19:19<6:43:52,  6.91s/it]score1 tensor([[0.5195],
-        [0.4277],
-        [0.4883],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4883, 0.4961, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:05:56,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:05:56,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.99 | bwd_microstep: 4632.34 | bwd_inner_microstep: 4627.62 | bwd_allreduce_microstep: 4.63 | step_microstep: 41.95
-[2025-01-25 15:05:56,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4632.37 | bwd_inner: 4627.62 | bwd_allreduce: 4.68 | step: 41.96
- 40%|███▉      | 2295/5800 [6:19:26<6:43:54,  6.91s/it]                                                       {'loss': 0.0459, 'grad_norm': 0.5740492343902588, 'learning_rate': 2.7537440897535953e-05, 'epoch': 19.78}
- 40%|███▉      | 2295/5800 [6:19:26<6:43:54,  6.91s/it]score1 tensor([[0.3730],
-        [0.4414],
-        [0.3633],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4336, 0.3750, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:06:03,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:06:03,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.91 | bwd_microstep: 4641.46 | bwd_inner_microstep: 4636.73 | bwd_allreduce_microstep: 4.63 | step_microstep: 41.28
-[2025-01-25 15:06:03,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.88 | bwd: 4641.49 | bwd_inner: 4636.73 | bwd_allreduce: 4.67 | step: 41.29
- 40%|███▉      | 2296/5800 [6:19:33<6:43:52,  6.92s/it]                                                       {'loss': 0.0322, 'grad_norm': 3.7964284420013428, 'learning_rate': 2.7527095078923998e-05, 'epoch': 19.79}
- 40%|███▉      | 2296/5800 [6:19:33<6:43:52,  6.92s/it]score1 tensor([[0.4492],
-        [0.4844],
-        [0.5508],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5391, 0.5156, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:06:10,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 15:06:10,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.58 | bwd_microstep: 4642.62 | bwd_inner_microstep: 4637.87 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.68
-[2025-01-25 15:06:10,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.52 | bwd: 4642.65 | bwd_inner: 4637.87 | bwd_allreduce: 4.70 | step: 42.69
- 40%|███▉      | 2297/5800 [6:19:40<6:44:04,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.6514560580253601, 'learning_rate': 2.7516746913233186e-05, 'epoch': 19.8}
- 40%|███▉      | 2297/5800 [6:19:40<6:44:04,  6.92s/it]score1 tensor([[0.4395],
-        [0.4824],
-        [0.5273],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5156, 0.6172, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:06:16,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 15:06:16,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4638.52 | bwd_inner_microstep: 4633.74 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.58
-[2025-01-25 15:06:16,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.57 | bwd: 4638.55 | bwd_inner: 4633.74 | bwd_allreduce: 4.73 | step: 42.55
- 40%|███▉      | 2298/5800 [6:19:46<6:43:50,  6.92s/it]                                                       {'loss': 0.04, 'grad_norm': 4.358088493347168, 'learning_rate': 2.7506396403690265e-05, 'epoch': 19.81}
- 40%|███▉      | 2298/5800 [6:19:46<6:43:50,  6.92s/it]score1 tensor([[0.4199],
-        [0.5000],
-        [0.4316],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.5430, 0.4551, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:06:23,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:06:23,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.53 | bwd_microstep: 4633.22 | bwd_inner_microstep: 4628.18 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.11
-[2025-01-25 15:06:23,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.50 | bwd: 4633.25 | bwd_inner: 4628.18 | bwd_allreduce: 5.00 | step: 42.12
- 40%|███▉      | 2299/5800 [6:19:53<6:43:29,  6.91s/it]                                                       {'loss': 0.0459, 'grad_norm': 0.4437994360923767, 'learning_rate': 2.74960435535227e-05, 'epoch': 19.82}
- 40%|███▉      | 2299/5800 [6:19:53<6:43:29,  6.91s/it]score1 tensor([[0.5391],
-        [0.4395],
-        [0.5586],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4180, 0.5625, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:06:30,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 15:06:30,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.56 | bwd_microstep: 4636.17 | bwd_inner_microstep: 4631.52 | bwd_allreduce_microstep: 4.55 | step_microstep: 40.81
-[2025-01-25 15:06:30,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.52 | bwd: 4636.19 | bwd_inner: 4631.52 | bwd_allreduce: 4.60 | step: 40.81
- 40%|███▉      | 2300/5800 [6:20:00<6:43:20,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.5374167561531067, 'learning_rate': 2.748568836595868e-05, 'epoch': 19.83}
- 40%|███▉      | 2300/5800 [6:20:00<6:43:20,  6.91s/it]evaluate!
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3984]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1934, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6523]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6836]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6052000201544723
-PLCC_score: 0.5972978306718223
-KRCC_score: 0.428046578244685
-SRCC_level: 0.6052000201544723
-PLCC_level: 0.5972978306718223
-KRCC_level: 0.428046578244685
-score1 tensor([[0.3809],
-        [0.4961],
-        [0.6328],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3613, 0.4941, 0.6172, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:16:51,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 15:16:51,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.46 | bwd_microstep: 4597.54 | bwd_inner_microstep: 4592.49 | bwd_allreduce_microstep: 4.96 | step_microstep: 41.78
-[2025-01-25 15:16:51,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.42 | bwd: 4597.57 | bwd_inner: 4592.49 | bwd_allreduce: 5.00 | step: 41.78
- 40%|███▉      | 2301/5800 [6:30:21<185:40:47, 191.04s/it]                                                          {'loss': 0.0122, 'grad_norm': 4.210055351257324, 'learning_rate': 2.7475330844227147e-05, 'epoch': 19.84}
- 40%|███▉      | 2301/5800 [6:30:21<185:40:47, 191.04s/it]score1 tensor([[0.5820],
-        [0.4922],
-        [0.5977],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4297, 0.6055, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:16:58,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 15:16:58,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.79 | bwd_microstep: 4575.85 | bwd_inner_microstep: 4571.02 | bwd_allreduce_microstep: 4.75 | step_microstep: 40.08
-[2025-01-25 15:16:58,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.75 | bwd: 4575.87 | bwd_inner: 4571.02 | bwd_allreduce: 4.80 | step: 40.08
- 40%|███▉      | 2302/5800 [6:30:28<131:55:48, 135.78s/it]                                                          {'loss': 0.0488, 'grad_norm': 0.5692770481109619, 'learning_rate': 2.7464970991557747e-05, 'epoch': 19.84}
- 40%|███▉      | 2302/5800 [6:30:28<131:55:48, 135.78s/it]score1 tensor([[0.4531],
-        [0.5195],
-        [0.4570],
-        [0.6875]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.5391, 0.3730, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0649, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:05,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 15:17:05,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.99 | bwd_microstep: 4583.53 | bwd_inner_microstep: 4578.43 | bwd_allreduce_microstep: 4.98 | step_microstep: 41.33
-[2025-01-25 15:17:05,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.95 | bwd: 4583.56 | bwd_inner: 4578.43 | bwd_allreduce: 5.04 | step: 41.33
- 40%|███▉      | 2303/5800 [6:30:35<94:18:57, 97.09s/it]                                                          {'loss': 0.0649, 'grad_norm': 4.39826774597168, 'learning_rate': 2.745460881118086e-05, 'epoch': 19.85}
- 40%|███▉      | 2303/5800 [6:30:35<94:18:57, 97.09s/it]score1 tensor([[0.4102],
-        [0.4238],
-        [0.4570],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4336, 0.4473, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:11,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 15:17:11,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.17 | bwd_microstep: 4583.33 | bwd_inner_microstep: 4578.07 | bwd_allreduce_microstep: 5.15 | step_microstep: 45.48
-[2025-01-25 15:17:11,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.13 | bwd: 4583.36 | bwd_inner: 4578.07 | bwd_allreduce: 5.21 | step: 45.49
- 40%|███▉      | 2304/5800 [6:30:41<67:59:47, 70.02s/it]                                                        {'loss': 0.0156, 'grad_norm': 0.49399498105049133, 'learning_rate': 2.74442443063276e-05, 'epoch': 19.86}
- 40%|███▉      | 2304/5800 [6:30:41<67:59:47, 70.02s/it]score1 tensor([[0.6211],
-        [0.6328],
-        [0.4160],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5938, 0.4043, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:18,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 15:17:18,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.06 | bwd_microstep: 4593.89 | bwd_inner_microstep: 4588.98 | bwd_allreduce_microstep: 4.82 | step_microstep: 45.37
-[2025-01-25 15:17:18,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.02 | bwd: 4593.91 | bwd_inner: 4588.98 | bwd_allreduce: 4.87 | step: 45.38
- 40%|███▉      | 2305/5800 [6:30:48<49:34:51, 51.07s/it]                                                        {'loss': 0.0249, 'grad_norm': 4.803596019744873, 'learning_rate': 2.743387748022979e-05, 'epoch': 19.87}
- 40%|███▉      | 2305/5800 [6:30:48<49:34:51, 51.07s/it]score1 tensor([[0.4355],
-        [0.5625],
-        [0.5234],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5508, 0.5508, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:25,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 15:17:25,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.06 | bwd_microstep: 4600.51 | bwd_inner_microstep: 4595.43 | bwd_allreduce_microstep: 4.97 | step_microstep: 47.08
-[2025-01-25 15:17:25,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.01 | bwd: 4600.53 | bwd_inner: 4595.44 | bwd_allreduce: 5.02 | step: 47.09
- 40%|███▉      | 2306/5800 [6:30:55<36:41:43, 37.81s/it]                                                        {'loss': 0.0146, 'grad_norm': 3.8361542224884033, 'learning_rate': 2.742350833612e-05, 'epoch': 19.88}
- 40%|███▉      | 2306/5800 [6:30:55<36:41:43, 37.81s/it]score1 tensor([[0.4824],
-        [0.5234],
-        [0.4766],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5508, 0.4453, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:32,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:17:32,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.80 | bwd_microstep: 4598.92 | bwd_inner_microstep: 4594.00 | bwd_allreduce_microstep: 4.83 | step_microstep: 47.39
-[2025-01-25 15:17:32,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.75 | bwd: 4598.96 | bwd_inner: 4594.00 | bwd_allreduce: 4.88 | step: 47.40
- 40%|███▉      | 2307/5800 [6:31:02<27:40:43, 28.53s/it]                                                        {'loss': 0.0229, 'grad_norm': 0.5079198479652405, 'learning_rate': 2.7413136877231497e-05, 'epoch': 19.89}
- 40%|███▉      | 2307/5800 [6:31:02<27:40:43, 28.53s/it]score1 tensor([[0.4082],
-        [0.3984],
-        [0.5938],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.4180, 0.6602, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:39,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 15:17:39,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.75 | bwd_microstep: 4606.79 | bwd_inner_microstep: 4601.96 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.17
-[2025-01-25 15:17:39,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.69 | bwd: 4606.81 | bwd_inner: 4601.96 | bwd_allreduce: 4.78 | step: 44.18
- 40%|███▉      | 2308/5800 [6:31:09<21:22:07, 22.03s/it]                                                        {'loss': 0.0425, 'grad_norm': 0.873447835445404, 'learning_rate': 2.7402763106798295e-05, 'epoch': 19.9}
- 40%|███▉      | 2308/5800 [6:31:09<21:22:07, 22.03s/it]score1 tensor([[0.4785],
-        [0.5273],
-        [0.4785],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5312, 0.5430, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:46,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 15:17:46,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.12 | bwd_microstep: 4605.79 | bwd_inner_microstep: 4600.84 | bwd_allreduce_microstep: 4.85 | step_microstep: 41.61
-[2025-01-25 15:17:46,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.08 | bwd: 4605.81 | bwd_inner: 4600.84 | bwd_allreduce: 4.91 | step: 41.62
- 40%|███▉      | 2309/5800 [6:31:16<16:57:19, 17.48s/it]                                                        {'loss': 0.042, 'grad_norm': 8.514618873596191, 'learning_rate': 2.73923870280551e-05, 'epoch': 19.91}
- 40%|███▉      | 2309/5800 [6:31:16<16:57:19, 17.48s/it]score1 tensor([[0.4297],
-        [0.5938],
-        [0.4629],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.6016, 0.4941, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:17:53,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 15:17:53,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.44 | bwd_microstep: 4607.48 | bwd_inner_microstep: 4602.47 | bwd_allreduce_microstep: 4.93 | step_microstep: 49.28
-[2025-01-25 15:17:53,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.39 | bwd: 4607.51 | bwd_inner: 4602.47 | bwd_allreduce: 4.97 | step: 49.28
- 40%|███▉      | 2310/5800 [6:31:23<13:52:07, 14.31s/it]                                                        {'loss': 0.0215, 'grad_norm': 0.5942332148551941, 'learning_rate': 2.7382008644237357e-05, 'epoch': 19.91}
- 40%|███▉      | 2310/5800 [6:31:23<13:52:07, 14.31s/it]score1 tensor([[0.5234],
-        [0.4414],
-        [0.4570],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4805, 0.4629, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:00,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 15:18:00,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.00 | bwd_microstep: 4606.44 | bwd_inner_microstep: 4601.08 | bwd_allreduce_microstep: 5.24 | step_microstep: 41.65
-[2025-01-25 15:18:00,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.97 | bwd: 4606.47 | bwd_inner: 4601.08 | bwd_allreduce: 5.30 | step: 41.66
- 40%|███▉      | 2311/5800 [6:31:30<11:42:21, 12.08s/it]                                                        {'loss': 0.0195, 'grad_norm': 4.1614508628845215, 'learning_rate': 2.737162795858123e-05, 'epoch': 19.92}
- 40%|███▉      | 2311/5800 [6:31:30<11:42:21, 12.08s/it]score1 tensor([[0.4648],
-        [0.4023],
-        [0.6211],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.3672, 0.6094, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:06,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 15:18:06,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.90 | bwd_microstep: 4609.71 | bwd_inner_microstep: 4604.80 | bwd_allreduce_microstep: 4.82 | step_microstep: 46.07
-[2025-01-25 15:18:06,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.87 | bwd: 4609.74 | bwd_inner: 4604.80 | bwd_allreduce: 4.87 | step: 46.08
- 40%|███▉      | 2312/5800 [6:31:36<10:11:28, 10.52s/it]                                                        {'loss': 0.0176, 'grad_norm': 0.49009695649147034, 'learning_rate': 2.7361244974323604e-05, 'epoch': 19.93}
- 40%|███▉      | 2312/5800 [6:31:36<10:11:28, 10.52s/it]score1 tensor([[0.4492],
-        [0.6797],
-        [0.6055],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.6133, 0.7031, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:13,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 15:18:13,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.74 | bwd_microstep: 4610.65 | bwd_inner_microstep: 4605.39 | bwd_allreduce_microstep: 5.14 | step_microstep: 44.48
-[2025-01-25 15:18:13,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.69 | bwd: 4610.69 | bwd_inner: 4605.39 | bwd_allreduce: 5.20 | step: 44.48
- 40%|███▉      | 2313/5800 [6:31:43<9:07:56,  9.43s/it]                                                        {'loss': 0.0684, 'grad_norm': 0.7165543437004089, 'learning_rate': 2.7350859694702065e-05, 'epoch': 19.94}
- 40%|███▉      | 2313/5800 [6:31:43<9:07:56,  9.43s/it]score1 tensor([[0.6602],
-        [0.4746],
-        [0.5195],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4648, 0.5625, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:20,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 15:18:20,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.48 | bwd_microstep: 4614.01 | bwd_inner_microstep: 4608.84 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.55
-[2025-01-25 15:18:20,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.39 | bwd: 4614.04 | bwd_inner: 4608.84 | bwd_allreduce: 5.12 | step: 43.56
- 40%|███▉      | 2314/5800 [6:31:50<8:23:23,  8.66s/it]                                                       {'loss': 0.0439, 'grad_norm': 4.6919026374816895, 'learning_rate': 2.7340472122954923e-05, 'epoch': 19.95}
- 40%|███▉      | 2314/5800 [6:31:50<8:23:23,  8.66s/it]score1 tensor([[0.5742],
-        [0.4961],
-        [0.5547],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4863, 0.5430, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:27,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 15:18:27,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.92 | bwd_microstep: 4605.99 | bwd_inner_microstep: 4601.17 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.04
-[2025-01-25 15:18:27,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.89 | bwd: 4606.02 | bwd_inner: 4601.17 | bwd_allreduce: 4.77 | step: 42.05
- 40%|███▉      | 2315/5800 [6:31:57<7:52:06,  8.13s/it]                                                       {'loss': 0.0347, 'grad_norm': 9.130524635314941, 'learning_rate': 2.7330082262321203e-05, 'epoch': 19.96}
- 40%|███▉      | 2315/5800 [6:31:57<7:52:06,  8.13s/it]score1 tensor([[0.6016],
-        [0.5312],
-        [0.5469],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4531, 0.5664, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:34,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:18:34,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.84 | bwd_microstep: 4608.16 | bwd_inner_microstep: 4603.29 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.20
-[2025-01-25 15:18:34,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.80 | bwd: 4608.20 | bwd_inner: 4603.29 | bwd_allreduce: 4.82 | step: 42.20
- 40%|███▉      | 2316/5800 [6:32:04<7:30:21,  7.76s/it]                                                       {'loss': 0.0474, 'grad_norm': 4.386194705963135, 'learning_rate': 2.731969011604065e-05, 'epoch': 19.97}
- 40%|███▉      | 2316/5800 [6:32:04<7:30:21,  7.76s/it]score1 tensor([[0.7148],
-        [0.5273],
-        [0.5039],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6016, 0.5156, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0576, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:41,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 15:18:41,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.51 | bwd_microstep: 4614.70 | bwd_inner_microstep: 4609.06 | bwd_allreduce_microstep: 5.49 | step_microstep: 48.16
-[2025-01-25 15:18:41,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.46 | bwd: 4614.73 | bwd_inner: 4609.06 | bwd_allreduce: 5.56 | step: 48.17
- 40%|███▉      | 2317/5800 [6:32:11<7:15:14,  7.50s/it]                                                       {'loss': 0.0576, 'grad_norm': 0.8071432113647461, 'learning_rate': 2.7309295687353707e-05, 'epoch': 19.97}
- 40%|███▉      | 2317/5800 [6:32:11<7:15:14,  7.50s/it]score1 tensor([[0.6602],
-        [0.5508],
-        [0.6680],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.4941, 0.6641, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:48,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 15:18:48,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.96 | bwd_microstep: 4616.59 | bwd_inner_microstep: 4611.48 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.08
-[2025-01-25 15:18:48,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4616.62 | bwd_inner: 4611.48 | bwd_allreduce: 5.06 | step: 43.09
- 40%|███▉      | 2318/5800 [6:32:18<7:04:36,  7.32s/it]                                                       {'loss': 0.021, 'grad_norm': 4.474473476409912, 'learning_rate': 2.7298898979501546e-05, 'epoch': 19.98}
- 40%|███▉      | 2318/5800 [6:32:18<7:04:36,  7.32s/it]score1 tensor([[0.6094],
-        [0.3867],
-        [0.5039],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3809, 0.5469, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0640, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:55,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 15:18:55,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.15 | bwd_microstep: 4608.33 | bwd_inner_microstep: 4603.47 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.42
-[2025-01-25 15:18:55,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.12 | bwd: 4608.35 | bwd_inner: 4603.47 | bwd_allreduce: 4.82 | step: 42.42
- 40%|███▉      | 2319/5800 [6:32:25<6:56:57,  7.19s/it]                                                       {'loss': 0.064, 'grad_norm': 0.6336334943771362, 'learning_rate': 2.728849999572603e-05, 'epoch': 19.99}
- 40%|███▉      | 2319/5800 [6:32:25<6:56:57,  7.19s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:18:59,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 15:18:59,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 571.69 | bwd_microstep: 1221.75 | bwd_inner_microstep: 1217.08 | bwd_allreduce_microstep: 4.58 | step_microstep: 48.21
-[2025-01-25 15:18:59,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.65 | bwd: 1221.78 | bwd_inner: 1217.08 | bwd_allreduce: 4.63 | step: 48.22
- 40%|████      | 2320/5800 [6:32:29<6:04:53,  6.29s/it]                                                       {'loss': 0.0117, 'grad_norm': 9.514626502990723, 'learning_rate': 2.7278098739269757e-05, 'epoch': 20.0}
- 40%|████      | 2320/5800 [6:32:29<6:04:53,  6.29s/it][2025-01-25 15:19:03,951] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:19:14,194] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:19:24,292] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:19:34,136] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.6641],
-        [0.5586],
-        [0.6016],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5664, 0.5820, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:19:56,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 15:19:56,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.00 | bwd_microstep: 4595.09 | bwd_inner_microstep: 4588.91 | bwd_allreduce_microstep: 6.08 | step_microstep: 47.05
-[2025-01-25 15:19:56,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.96 | bwd: 4595.13 | bwd_inner: 4588.91 | bwd_allreduce: 6.14 | step: 47.06
- 40%|████      | 2321/5800 [6:33:26<20:47:17, 21.51s/it]                                                        {'loss': 0.0225, 'grad_norm': 0.9898993372917175, 'learning_rate': 2.7267695213376005e-05, 'epoch': 20.01}
- 40%|████      | 2321/5800 [6:33:26<20:47:17, 21.51s/it]score1 tensor([[0.6094],
-        [0.5938],
-        [0.4141],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.6172, 0.4336, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:03,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:20:03,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.24 | bwd_microstep: 4572.90 | bwd_inner_microstep: 4567.60 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.94
-[2025-01-25 15:20:03,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.20 | bwd: 4572.93 | bwd_inner: 4567.60 | bwd_allreduce: 5.26 | step: 43.94
- 40%|████      | 2322/5800 [6:33:33<16:31:36, 17.11s/it]                                                        {'loss': 0.0161, 'grad_norm': 3.9269819259643555, 'learning_rate': 2.725728942128878e-05, 'epoch': 20.02}
- 40%|████      | 2322/5800 [6:33:33<16:31:36, 17.11s/it]score1 tensor([[0.4863],
-        [0.4922],
-        [0.4180],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4805, 0.3809, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:10,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 15:20:10,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.90 | bwd_microstep: 4588.55 | bwd_inner_microstep: 4583.57 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.14
-[2025-01-25 15:20:10,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.87 | bwd: 4588.58 | bwd_inner: 4583.57 | bwd_allreduce: 4.94 | step: 45.15
- 40%|████      | 2323/5800 [6:33:40<13:33:03, 14.03s/it]                                                        {'loss': 0.0239, 'grad_norm': 3.821211338043213, 'learning_rate': 2.7246881366252787e-05, 'epoch': 20.03}
- 40%|████      | 2323/5800 [6:33:40<13:33:03, 14.03s/it]score1 tensor([[0.5391],
-        [0.5703],
-        [0.5352],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5508, 0.5625, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:16,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:20:16,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.16 | bwd_microstep: 4594.43 | bwd_inner_microstep: 4588.88 | bwd_allreduce_microstep: 5.44 | step_microstep: 46.49
-[2025-01-25 15:20:16,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.12 | bwd: 4594.47 | bwd_inner: 4588.88 | bwd_allreduce: 5.49 | step: 46.50
- 40%|████      | 2324/5800 [6:33:46<11:28:16, 11.88s/it]                                                        {'loss': 0.0225, 'grad_norm': 0.5914301872253418, 'learning_rate': 2.7236471051513444e-05, 'epoch': 20.03}
- 40%|████      | 2324/5800 [6:33:46<11:28:16, 11.88s/it]score1 tensor([[0.5000],
-        [0.5469],
-        [0.4180],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5508, 0.4004, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:23,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 15:20:23,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.76 | bwd_microstep: 4605.58 | bwd_inner_microstep: 4599.98 | bwd_allreduce_microstep: 5.52 | step_microstep: 44.87
-[2025-01-25 15:20:23,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.72 | bwd: 4605.60 | bwd_inner: 4599.98 | bwd_allreduce: 5.56 | step: 44.88
- 40%|████      | 2325/5800 [6:33:53<10:01:16, 10.38s/it]                                                        {'loss': 0.0156, 'grad_norm': 0.7750594615936279, 'learning_rate': 2.7226058480316862e-05, 'epoch': 20.04}
- 40%|████      | 2325/5800 [6:33:53<10:01:16, 10.38s/it]score1 tensor([[0.6016],
-        [0.5781],
-        [0.3516],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5664, 0.3906, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:30,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 15:20:30,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.77 | bwd_microstep: 4604.66 | bwd_inner_microstep: 4598.92 | bwd_allreduce_microstep: 5.60 | step_microstep: 43.64
-[2025-01-25 15:20:30,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.73 | bwd: 4604.68 | bwd_inner: 4598.92 | bwd_allreduce: 5.67 | step: 43.65
- 40%|████      | 2326/5800 [6:34:00<9:00:08,  9.33s/it]                                                        {'loss': 0.0391, 'grad_norm': 3.929271697998047, 'learning_rate': 2.7215643655909865e-05, 'epoch': 20.05}
- 40%|████      | 2326/5800 [6:34:00<9:00:08,  9.33s/it]score1 tensor([[0.5117],
-        [0.5352],
-        [0.4219],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.5703, 0.5391, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:37,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.37
-[2025-01-25 15:20:37,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.59 | bwd_microstep: 4605.59 | bwd_inner_microstep: 4600.68 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.35
-[2025-01-25 15:20:37,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.55 | bwd: 4605.63 | bwd_inner: 4600.68 | bwd_allreduce: 4.85 | step: 44.36
- 40%|████      | 2327/5800 [6:34:07<8:17:25,  8.59s/it]                                                       {'loss': 0.042, 'grad_norm': 3.9522554874420166, 'learning_rate': 2.7205226581539966e-05, 'epoch': 20.06}
- 40%|████      | 2327/5800 [6:34:07<8:17:25,  8.59s/it]score1 tensor([[0.4355],
-        [0.5469],
-        [0.5273],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.5664, 0.4863, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:44,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 15:20:44,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.25 | bwd_microstep: 4604.81 | bwd_inner_microstep: 4599.26 | bwd_allreduce_microstep: 5.46 | step_microstep: 43.23
-[2025-01-25 15:20:44,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.21 | bwd: 4604.83 | bwd_inner: 4599.26 | bwd_allreduce: 5.51 | step: 43.24
- 40%|████      | 2328/5800 [6:34:14<7:47:25,  8.08s/it]                                                       {'loss': 0.0371, 'grad_norm': 0.6657881736755371, 'learning_rate': 2.7194807260455403e-05, 'epoch': 20.07}
- 40%|████      | 2328/5800 [6:34:14<7:47:25,  8.08s/it]score1 tensor([[0.5781],
-        [0.6055],
-        [0.4805],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6211, 0.4961, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:51,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 15:20:51,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.08 | bwd_microstep: 4609.04 | bwd_inner_microstep: 4603.53 | bwd_allreduce_microstep: 5.40 | step_microstep: 47.59
-[2025-01-25 15:20:51,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.05 | bwd: 4609.07 | bwd_inner: 4603.53 | bwd_allreduce: 5.46 | step: 47.59
- 40%|████      | 2329/5800 [6:34:21<7:26:31,  7.72s/it]                                                       {'loss': 0.0166, 'grad_norm': 8.72750473022461, 'learning_rate': 2.7184385695905094e-05, 'epoch': 20.08}
- 40%|████      | 2329/5800 [6:34:21<7:26:31,  7.72s/it]score1 tensor([[0.5273],
-        [0.4941],
-        [0.5430],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4395, 0.5742, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:20:58,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 15:20:58,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.72 | bwd_microstep: 4607.01 | bwd_inner_microstep: 4601.38 | bwd_allreduce_microstep: 5.52 | step_microstep: 45.00
-[2025-01-25 15:20:58,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.68 | bwd: 4607.04 | bwd_inner: 4601.38 | bwd_allreduce: 5.59 | step: 45.00
- 40%|████      | 2330/5800 [6:34:28<7:11:54,  7.47s/it]                                                       {'loss': 0.0503, 'grad_norm': 0.5110815763473511, 'learning_rate': 2.7173961891138665e-05, 'epoch': 20.09}
- 40%|████      | 2330/5800 [6:34:28<7:11:54,  7.47s/it]score1 tensor([[0.5898],
-        [0.4316],
-        [0.4805],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4180, 0.4629, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:05,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:21:05,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.22 | bwd_microstep: 4616.17 | bwd_inner_microstep: 4610.57 | bwd_allreduce_microstep: 5.45 | step_microstep: 45.83
-[2025-01-25 15:21:05,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.18 | bwd: 4616.20 | bwd_inner: 4610.57 | bwd_allreduce: 5.54 | step: 45.83
- 40%|████      | 2331/5800 [6:34:35<7:01:47,  7.30s/it]                                                       {'loss': 0.0166, 'grad_norm': 8.563591003417969, 'learning_rate': 2.716353584940644e-05, 'epoch': 20.09}
- 40%|████      | 2331/5800 [6:34:35<7:01:47,  7.30s/it]score1 tensor([[0.3672],
-        [0.5391],
-        [0.5156],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.4844, 0.4121, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:11,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.63 | optimizer_step: 4.36
-[2025-01-25 15:21:11,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.37 | bwd_microstep: 4554.31 | bwd_inner_microstep: 4549.67 | bwd_allreduce_microstep: 4.56 | step_microstep: 46.21
-[2025-01-25 15:21:11,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.33 | bwd: 4554.33 | bwd_inner: 4549.67 | bwd_allreduce: 4.60 | step: 46.22
- 40%|████      | 2332/5800 [6:34:41<6:53:43,  7.16s/it]                                                       {'loss': 0.0566, 'grad_norm': 6.522052764892578, 'learning_rate': 2.7153107573959444e-05, 'epoch': 20.1}
- 40%|████      | 2332/5800 [6:34:41<6:53:43,  7.16s/it]score1 tensor([[0.4941],
-        [0.5625],
-        [0.6523],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4941, 0.6797, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:18,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 15:21:18,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.34 | bwd_microstep: 4573.97 | bwd_inner_microstep: 4569.11 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.76
-[2025-01-25 15:21:18,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.28 | bwd: 4574.00 | bwd_inner: 4569.11 | bwd_allreduce: 4.79 | step: 44.77
- 40%|████      | 2333/5800 [6:34:48<6:48:19,  7.07s/it]                                                       {'loss': 0.0278, 'grad_norm': 2.0855607986450195, 'learning_rate': 2.7142677068049384e-05, 'epoch': 20.11}
- 40%|████      | 2333/5800 [6:34:48<6:48:19,  7.07s/it]score1 tensor([[0.5234],
-        [0.6211],
-        [0.4883],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5508, 0.4648, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:25,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 15:21:25,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.11 | bwd_microstep: 4612.94 | bwd_inner_microstep: 4607.12 | bwd_allreduce_microstep: 5.65 | step_microstep: 47.88
-[2025-01-25 15:21:25,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.07 | bwd: 4612.96 | bwd_inner: 4607.12 | bwd_allreduce: 5.75 | step: 47.89
- 40%|████      | 2334/5800 [6:34:55<6:45:14,  7.02s/it]                                                       {'loss': 0.0552, 'grad_norm': 4.26658821105957, 'learning_rate': 2.713224433492868e-05, 'epoch': 20.12}
- 40%|████      | 2334/5800 [6:34:55<6:45:14,  7.02s/it]score1 tensor([[0.5469],
-        [0.3809],
-        [0.5117],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4141, 0.4922, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:32,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 15:21:32,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.86 | bwd_microstep: 4566.13 | bwd_inner_microstep: 4561.18 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.71
-[2025-01-25 15:21:32,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.81 | bwd: 4566.16 | bwd_inner: 4561.18 | bwd_allreduce: 4.89 | step: 43.72
- 40%|████      | 2335/5800 [6:35:02<6:42:04,  6.96s/it]                                                       {'loss': 0.0181, 'grad_norm': 2.272258758544922, 'learning_rate': 2.7121809377850433e-05, 'epoch': 20.13}
- 40%|████      | 2335/5800 [6:35:02<6:42:04,  6.96s/it]score1 tensor([[0.3750],
-        [0.6055],
-        [0.5898],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.6094, 0.5664, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:39,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 15:21:39,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.89 | bwd_microstep: 4611.93 | bwd_inner_microstep: 4606.62 | bwd_allreduce_microstep: 5.23 | step_microstep: 44.10
-[2025-01-25 15:21:39,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.86 | bwd: 4611.95 | bwd_inner: 4606.62 | bwd_allreduce: 5.27 | step: 44.11
- 40%|████      | 2336/5800 [6:35:09<6:40:39,  6.94s/it]                                                       {'loss': 0.0186, 'grad_norm': 4.003902912139893, 'learning_rate': 2.711137220006845e-05, 'epoch': 20.14}
- 40%|████      | 2336/5800 [6:35:09<6:40:39,  6.94s/it]score1 tensor([[0.5312],
-        [0.5078],
-        [0.4121],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5430, 0.4238, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:46,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 15:21:46,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.92 | bwd_microstep: 4621.28 | bwd_inner_microstep: 4616.01 | bwd_allreduce_microstep: 5.10 | step_microstep: 46.12
-[2025-01-25 15:21:46,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.87 | bwd: 4621.30 | bwd_inner: 4616.01 | bwd_allreduce: 5.22 | step: 46.12
- 40%|████      | 2337/5800 [6:35:16<6:40:08,  6.93s/it]                                                       {'loss': 0.0205, 'grad_norm': 3.7897419929504395, 'learning_rate': 2.7100932804837223e-05, 'epoch': 20.15}
- 40%|████      | 2337/5800 [6:35:16<6:40:08,  6.93s/it]score1 tensor([[0.4746],
-        [0.4609],
-        [0.4766],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4980, 0.4375, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:21:53,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 15:21:53,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.52 | bwd_microstep: 4625.02 | bwd_inner_microstep: 4619.21 | bwd_allreduce_microstep: 5.71 | step_microstep: 45.18
-[2025-01-25 15:21:53,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.47 | bwd: 4625.05 | bwd_inner: 4619.21 | bwd_allreduce: 5.77 | step: 45.19
- 40%|████      | 2338/5800 [6:35:23<6:39:43,  6.93s/it]                                                       {'loss': 0.0317, 'grad_norm': 4.009610652923584, 'learning_rate': 2.7090491195411934e-05, 'epoch': 20.16}
- 40%|████      | 2338/5800 [6:35:23<6:39:43,  6.93s/it]score1 tensor([[0.5391],
-        [0.3555],
-        [0.3867],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3945, 0.3398, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:00,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 15:22:00,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4587.06 | bwd_inner_microstep: 4582.65 | bwd_allreduce_microstep: 4.35 | step_microstep: 47.38
-[2025-01-25 15:22:00,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.40 | bwd: 4587.08 | bwd_inner: 4582.65 | bwd_allreduce: 4.37 | step: 47.39
- 40%|████      | 2339/5800 [6:35:30<6:38:35,  6.91s/it]                                                       {'loss': 0.0381, 'grad_norm': 2.1574156284332275, 'learning_rate': 2.7080047375048466e-05, 'epoch': 20.16}
- 40%|████      | 2339/5800 [6:35:30<6:38:35,  6.91s/it]score1 tensor([[0.4648],
-        [0.4629],
-        [0.4355],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4785, 0.4258, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:07,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 15:22:07,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.58 | bwd_microstep: 4618.35 | bwd_inner_microstep: 4613.10 | bwd_allreduce_microstep: 5.18 | step_microstep: 42.09
-[2025-01-25 15:22:07,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.55 | bwd: 4618.38 | bwd_inner: 4613.10 | bwd_allreduce: 5.22 | step: 42.10
- 40%|████      | 2340/5800 [6:35:36<6:38:18,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 4.223642826080322, 'learning_rate': 2.706960134700337e-05, 'epoch': 20.17}
- 40%|████      | 2340/5800 [6:35:36<6:38:18,  6.91s/it]score1 tensor([[0.5898],
-        [0.3730],
-        [0.3984],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.3691, 0.3438, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:13,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 15:22:13,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.86 | bwd_microstep: 4619.42 | bwd_inner_microstep: 4614.10 | bwd_allreduce_microstep: 5.21 | step_microstep: 43.00
-[2025-01-25 15:22:13,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.82 | bwd: 4619.45 | bwd_inner: 4614.10 | bwd_allreduce: 5.27 | step: 43.00
- 40%|████      | 2341/5800 [6:35:43<6:38:11,  6.91s/it]                                                       {'loss': 0.0205, 'grad_norm': 1.1872392892837524, 'learning_rate': 2.7059153114533906e-05, 'epoch': 20.18}
- 40%|████      | 2341/5800 [6:35:43<6:38:11,  6.91s/it]score1 tensor([[0.5664],
-        [0.5898],
-        [0.4785],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5039, 0.4434, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0557, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:20,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:22:20,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.04 | bwd_microstep: 4625.31 | bwd_inner_microstep: 4619.32 | bwd_allreduce_microstep: 5.81 | step_microstep: 49.65
-[2025-01-25 15:22:20,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.99 | bwd: 4625.34 | bwd_inner: 4619.32 | bwd_allreduce: 5.91 | step: 49.66
- 40%|████      | 2342/5800 [6:35:50<6:38:02,  6.91s/it]                                                       {'loss': 0.0557, 'grad_norm': 8.590479850769043, 'learning_rate': 2.704870268089802e-05, 'epoch': 20.19}
- 40%|████      | 2342/5800 [6:35:50<6:38:02,  6.91s/it]score1 tensor([[0.6914],
-        [0.5195],
-        [0.4902],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5039, 0.4609, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:27,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:22:27,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.03 | bwd_microstep: 4578.37 | bwd_inner_microstep: 4573.41 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.05
-[2025-01-25 15:22:27,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4578.40 | bwd_inner: 4573.41 | bwd_allreduce: 4.93 | step: 42.06
- 40%|████      | 2343/5800 [6:35:57<6:37:01,  6.89s/it]                                                       {'loss': 0.0132, 'grad_norm': 6.74607515335083, 'learning_rate': 2.7038250049354324e-05, 'epoch': 20.2}
- 40%|████      | 2343/5800 [6:35:57<6:37:01,  6.89s/it]score1 tensor([[0.4551],
-        [0.4453],
-        [0.3906],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4180, 0.4082, 0.3086], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:34,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 15:22:34,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.04 | bwd_microstep: 4627.49 | bwd_inner_microstep: 4622.30 | bwd_allreduce_microstep: 5.11 | step_microstep: 42.34
-[2025-01-25 15:22:34,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.99 | bwd: 4627.52 | bwd_inner: 4622.30 | bwd_allreduce: 5.15 | step: 42.34
- 40%|████      | 2344/5800 [6:36:04<6:37:15,  6.90s/it]                                                       {'loss': 0.021, 'grad_norm': 3.8941988945007324, 'learning_rate': 2.702779522316214e-05, 'epoch': 20.21}
- 40%|████      | 2344/5800 [6:36:04<6:37:15,  6.90s/it]score1 tensor([[0.5703],
-        [0.6602],
-        [0.4707],
-        [0.2891]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6641, 0.5117, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:41,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 15:22:41,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.23 | bwd_microstep: 4617.45 | bwd_inner_microstep: 4611.86 | bwd_allreduce_microstep: 5.49 | step_microstep: 46.56
-[2025-01-25 15:22:41,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.20 | bwd: 4617.47 | bwd_inner: 4611.86 | bwd_allreduce: 5.53 | step: 46.57
- 40%|████      | 2345/5800 [6:36:11<6:37:33,  6.90s/it]                                                       {'loss': 0.0293, 'grad_norm': 3.858222007751465, 'learning_rate': 2.7017338205581466e-05, 'epoch': 20.22}
- 40%|████      | 2345/5800 [6:36:11<6:37:33,  6.90s/it]score1 tensor([[0.6602],
-        [0.5781],
-        [0.4316],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5078, 0.4785, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0498, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:48,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 15:22:48,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4619.59 | bwd_inner_microstep: 4614.14 | bwd_allreduce_microstep: 5.32 | step_microstep: 51.76
-[2025-01-25 15:22:48,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.92 | bwd: 4619.61 | bwd_inner: 4614.14 | bwd_allreduce: 5.38 | step: 51.77
- 40%|████      | 2346/5800 [6:36:18<6:37:30,  6.91s/it]                                                       {'loss': 0.0498, 'grad_norm': 4.972245693206787, 'learning_rate': 2.7006878999872975e-05, 'epoch': 20.22}
- 40%|████      | 2346/5800 [6:36:18<6:37:30,  6.91s/it]score1 tensor([[0.4805],
-        [0.5586],
-        [0.4805],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5391, 0.4863, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:22:55,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 15:22:55,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.21 | bwd_microstep: 4617.61 | bwd_inner_microstep: 4612.08 | bwd_allreduce_microstep: 5.45 | step_microstep: 44.23
-[2025-01-25 15:22:55,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.17 | bwd: 4617.63 | bwd_inner: 4612.08 | bwd_allreduce: 5.49 | step: 44.23
- 40%|████      | 2347/5800 [6:36:25<6:37:25,  6.91s/it]                                                       {'loss': 0.0117, 'grad_norm': 0.5786240696907043, 'learning_rate': 2.6996417609298037e-05, 'epoch': 20.23}
- 40%|████      | 2347/5800 [6:36:25<6:37:25,  6.91s/it]score1 tensor([[0.6016],
-        [0.5977],
-        [0.4512],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5508, 0.4180, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:02,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 15:23:02,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.24 | bwd_microstep: 4618.00 | bwd_inner_microstep: 4612.74 | bwd_allreduce_microstep: 5.16 | step_microstep: 44.62
-[2025-01-25 15:23:02,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.21 | bwd: 4618.04 | bwd_inner: 4612.74 | bwd_allreduce: 5.21 | step: 44.63
- 40%|████      | 2348/5800 [6:36:32<6:37:17,  6.91s/it]                                                       {'loss': 0.0264, 'grad_norm': 8.615926742553711, 'learning_rate': 2.698595403711868e-05, 'epoch': 20.24}
- 40%|████      | 2348/5800 [6:36:32<6:37:17,  6.91s/it]score1 tensor([[0.5859],
-        [0.6445],
-        [0.6523],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5781, 0.6875, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:09,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:23:09,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.00 | bwd_microstep: 4615.56 | bwd_inner_microstep: 4610.01 | bwd_allreduce_microstep: 5.44 | step_microstep: 42.37
-[2025-01-25 15:23:09,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.96 | bwd: 4615.58 | bwd_inner: 4610.01 | bwd_allreduce: 5.49 | step: 42.38
- 40%|████      | 2349/5800 [6:36:39<6:36:58,  6.90s/it]                                                       {'loss': 0.0449, 'grad_norm': 4.680933952331543, 'learning_rate': 2.6975488286597643e-05, 'epoch': 20.25}
- 40%|████      | 2349/5800 [6:36:39<6:36:58,  6.90s/it]score1 tensor([[0.4961],
-        [0.4863],
-        [0.4336],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4297, 0.4375, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:16,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 15:23:16,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.45 | bwd_microstep: 4612.00 | bwd_inner_microstep: 4606.06 | bwd_allreduce_microstep: 5.86 | step_microstep: 48.13
-[2025-01-25 15:23:16,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.41 | bwd: 4612.03 | bwd_inner: 4606.06 | bwd_allreduce: 5.90 | step: 48.14
- 41%|████      | 2350/5800 [6:36:45<6:36:48,  6.90s/it]                                                       {'loss': 0.0322, 'grad_norm': 4.308464050292969, 'learning_rate': 2.6965020360998325e-05, 'epoch': 20.26}
- 41%|████      | 2350/5800 [6:36:45<6:36:48,  6.90s/it]score1 tensor([[0.4648],
-        [0.3926],
-        [0.4766],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4434, 0.4219, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:22,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 15:23:22,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.07 | bwd_microstep: 4618.19 | bwd_inner_microstep: 4612.85 | bwd_allreduce_microstep: 5.25 | step_microstep: 44.73
-[2025-01-25 15:23:22,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.02 | bwd: 4618.22 | bwd_inner: 4612.85 | bwd_allreduce: 5.30 | step: 44.74
- 41%|████      | 2351/5800 [6:36:52<6:36:49,  6.90s/it]                                                       {'loss': 0.0425, 'grad_norm': 0.8613190054893494, 'learning_rate': 2.695455026358481e-05, 'epoch': 20.27}
- 41%|████      | 2351/5800 [6:36:52<6:36:49,  6.90s/it]score1 tensor([[0.3242],
-        [0.3750],
-        [0.4902],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.4512, 0.4805, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:29,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 15:23:29,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.40 | bwd_microstep: 4623.77 | bwd_inner_microstep: 4618.75 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.55
-[2025-01-25 15:23:29,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.34 | bwd: 4623.79 | bwd_inner: 4618.75 | bwd_allreduce: 4.98 | step: 42.55
- 41%|████      | 2352/5800 [6:36:59<6:36:39,  6.90s/it]                                                       {'loss': 0.0337, 'grad_norm': 3.652289628982544, 'learning_rate': 2.694407799762184e-05, 'epoch': 20.28}
- 41%|████      | 2352/5800 [6:36:59<6:36:39,  6.90s/it]score1 tensor([[0.6094],
-        [0.5703],
-        [0.4043],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6172, 0.4238, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:36,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 15:23:36,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.31 | bwd_microstep: 4617.46 | bwd_inner_microstep: 4612.36 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.02
-[2025-01-25 15:23:36,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.28 | bwd: 4617.48 | bwd_inner: 4612.36 | bwd_allreduce: 5.05 | step: 42.03
- 41%|████      | 2353/5800 [6:37:06<6:36:39,  6.90s/it]                                                       {'loss': 0.0234, 'grad_norm': 8.605477333068848, 'learning_rate': 2.693360356637487e-05, 'epoch': 20.28}
- 41%|████      | 2353/5800 [6:37:06<6:36:39,  6.90s/it]score1 tensor([[0.4980],
-        [0.3613],
-        [0.5312],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4023, 0.5586, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:43,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 15:23:43,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4625.32 | bwd_inner_microstep: 4619.90 | bwd_allreduce_microstep: 5.31 | step_microstep: 46.99
-[2025-01-25 15:23:43,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4625.35 | bwd_inner: 4619.90 | bwd_allreduce: 5.36 | step: 47.00
- 41%|████      | 2354/5800 [6:37:13<6:36:48,  6.91s/it]                                                       {'loss': 0.0308, 'grad_norm': 7.887279033660889, 'learning_rate': 2.692312697310999e-05, 'epoch': 20.29}
- 41%|████      | 2354/5800 [6:37:13<6:36:48,  6.91s/it]score1 tensor([[0.4609],
-        [0.3574],
-        [0.5820],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4141, 0.6445, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:50,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:23:50,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.18 | bwd_microstep: 4616.52 | bwd_inner_microstep: 4611.37 | bwd_allreduce_microstep: 5.08 | step_microstep: 45.01
-[2025-01-25 15:23:50,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4616.55 | bwd_inner: 4611.37 | bwd_allreduce: 5.11 | step: 45.02
- 41%|████      | 2355/5800 [6:37:20<6:36:36,  6.91s/it]                                                       {'loss': 0.0342, 'grad_norm': 3.6436104774475098, 'learning_rate': 2.6912648221094e-05, 'epoch': 20.3}
- 41%|████      | 2355/5800 [6:37:20<6:36:36,  6.91s/it]score1 tensor([[0.3633],
-        [0.5312],
-        [0.4648],
-        [0.3496]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5000, 0.5117, 0.3340], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:23:57,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 15:23:57,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.90 | bwd_microstep: 4623.78 | bwd_inner_microstep: 4619.05 | bwd_allreduce_microstep: 4.64 | step_microstep: 45.63
-[2025-01-25 15:23:57,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.87 | bwd: 4623.80 | bwd_inner: 4619.05 | bwd_allreduce: 4.69 | step: 45.64
- 41%|████      | 2356/5800 [6:37:27<6:36:26,  6.91s/it]                                                       {'loss': 0.0239, 'grad_norm': 0.4344697594642639, 'learning_rate': 2.6902167313594346e-05, 'epoch': 20.31}
- 41%|████      | 2356/5800 [6:37:27<6:36:26,  6.91s/it]score1 tensor([[0.6016],
-        [0.5977],
-        [0.4316],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6172, 0.4590, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:04,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 15:24:04,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.00 | bwd_microstep: 4623.22 | bwd_inner_microstep: 4617.87 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.11
-[2025-01-25 15:24:04,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.97 | bwd: 4623.25 | bwd_inner: 4617.87 | bwd_allreduce: 5.28 | step: 45.13
- 41%|████      | 2357/5800 [6:37:34<6:36:16,  6.91s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.5555788278579712, 'learning_rate': 2.6891684253879168e-05, 'epoch': 20.32}
- 41%|████      | 2357/5800 [6:37:34<6:36:16,  6.91s/it]score1 tensor([[0.4297],
-        [0.5039],
-        [0.5312],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4844, 0.5078, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:11,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 15:24:11,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.48 | bwd_microstep: 4625.16 | bwd_inner_microstep: 4620.14 | bwd_allreduce_microstep: 4.95 | step_microstep: 41.26
-[2025-01-25 15:24:11,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4625.19 | bwd_inner: 4620.14 | bwd_allreduce: 4.98 | step: 41.26
- 41%|████      | 2358/5800 [6:37:41<6:36:09,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 4.458010673522949, 'learning_rate': 2.6881199045217248e-05, 'epoch': 20.33}
- 41%|████      | 2358/5800 [6:37:41<6:36:09,  6.91s/it]score1 tensor([[0.5977],
-        [0.5586],
-        [0.4551],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4883, 0.4668, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:18,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 15:24:18,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.51 | bwd_microstep: 4619.50 | bwd_inner_microstep: 4614.41 | bwd_allreduce_microstep: 5.01 | step_microstep: 42.31
-[2025-01-25 15:24:18,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.47 | bwd: 4619.53 | bwd_inner: 4614.42 | bwd_allreduce: 5.05 | step: 42.32
- 41%|████      | 2359/5800 [6:37:48<6:35:56,  6.90s/it]                                                       {'loss': 0.04, 'grad_norm': 4.726272106170654, 'learning_rate': 2.687071169087807e-05, 'epoch': 20.34}
- 41%|████      | 2359/5800 [6:37:48<6:35:56,  6.90s/it]score1 tensor([[0.4785],
-        [0.3672],
-        [0.6133],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.3613, 0.6562, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:25,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 15:24:25,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.66 | bwd_microstep: 4618.01 | bwd_inner_microstep: 4612.60 | bwd_allreduce_microstep: 5.29 | step_microstep: 45.00
-[2025-01-25 15:24:25,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4618.03 | bwd_inner: 4612.60 | bwd_allreduce: 5.35 | step: 45.01
- 41%|████      | 2360/5800 [6:37:55<6:35:49,  6.90s/it]                                                       {'loss': 0.041, 'grad_norm': 3.7765512466430664, 'learning_rate': 2.686022219413177e-05, 'epoch': 20.34}
- 41%|████      | 2360/5800 [6:37:55<6:35:49,  6.90s/it]score1 tensor([[0.4746],
-        [0.6016],
-        [0.3633],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6094, 0.3555, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:31,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:24:31,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.27 | bwd_microstep: 4616.06 | bwd_inner_microstep: 4611.02 | bwd_allreduce_microstep: 4.97 | step_microstep: 44.88
-[2025-01-25 15:24:31,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.21 | bwd: 4616.09 | bwd_inner: 4611.02 | bwd_allreduce: 5.00 | step: 44.89
- 41%|████      | 2361/5800 [6:38:01<6:35:34,  6.90s/it]                                                       {'loss': 0.0083, 'grad_norm': 3.7430121898651123, 'learning_rate': 2.684973055824916e-05, 'epoch': 20.35}
- 41%|████      | 2361/5800 [6:38:01<6:35:34,  6.90s/it]score1 tensor([[0.5352],
-        [0.4336],
-        [0.4492],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4043, 0.4492, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:38,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 15:24:38,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.70 | bwd_microstep: 4578.23 | bwd_inner_microstep: 4572.54 | bwd_allreduce_microstep: 5.58 | step_microstep: 43.91
-[2025-01-25 15:24:38,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.66 | bwd: 4578.25 | bwd_inner: 4572.54 | bwd_allreduce: 5.64 | step: 43.91
- 41%|████      | 2362/5800 [6:38:08<6:34:48,  6.89s/it]                                                       {'loss': 0.0132, 'grad_norm': 2.635118007659912, 'learning_rate': 2.68392367865017e-05, 'epoch': 20.36}
- 41%|████      | 2362/5800 [6:38:08<6:34:48,  6.89s/it]score1 tensor([[0.4785],
-        [0.5156],
-        [0.5391],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4844, 0.5469, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:45,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 15:24:45,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.11 | bwd_microstep: 4621.33 | bwd_inner_microstep: 4615.86 | bwd_allreduce_microstep: 5.33 | step_microstep: 45.60
-[2025-01-25 15:24:45,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4621.35 | bwd_inner: 4615.86 | bwd_allreduce: 5.42 | step: 45.60
- 41%|████      | 2363/5800 [6:38:15<6:35:00,  6.90s/it]                                                       {'loss': 0.0161, 'grad_norm': 4.21352481842041, 'learning_rate': 2.682874088216154e-05, 'epoch': 20.37}
- 41%|████      | 2363/5800 [6:38:15<6:35:00,  6.90s/it]score1 tensor([[0.5625],
-        [0.4238],
-        [0.5703],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4492, 0.6094, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:52,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 15:24:52,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.46 | bwd_microstep: 4628.48 | bwd_inner_microstep: 4623.59 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.43
-[2025-01-25 15:24:52,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.42 | bwd: 4628.51 | bwd_inner: 4623.59 | bwd_allreduce: 4.84 | step: 42.44
- 41%|████      | 2364/5800 [6:38:22<6:35:14,  6.90s/it]                                                       {'loss': 0.0249, 'grad_norm': 3.9989988803863525, 'learning_rate': 2.6818242848501487e-05, 'epoch': 20.38}
- 41%|████      | 2364/5800 [6:38:22<6:35:14,  6.90s/it]score1 tensor([[0.5352],
-        [0.4844],
-        [0.4805],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4922, 0.4863, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:24:59,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 15:24:59,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.26 | bwd_microstep: 4631.88 | bwd_inner_microstep: 4626.24 | bwd_allreduce_microstep: 5.49 | step_microstep: 47.96
-[2025-01-25 15:24:59,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.23 | bwd: 4631.91 | bwd_inner: 4626.24 | bwd_allreduce: 5.54 | step: 47.98
- 41%|████      | 2365/5800 [6:38:29<6:35:20,  6.91s/it]                                                       {'loss': 0.0405, 'grad_norm': 8.489326477050781, 'learning_rate': 2.6807742688795008e-05, 'epoch': 20.39}
- 41%|████      | 2365/5800 [6:38:29<6:35:20,  6.91s/it]score1 tensor([[0.5586],
-        [0.4219],
-        [0.5156],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.3789, 0.5156, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:06,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 15:25:06,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.85 | bwd_microstep: 4587.66 | bwd_inner_microstep: 4582.71 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.09
-[2025-01-25 15:25:06,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.81 | bwd: 4587.68 | bwd_inner: 4582.70 | bwd_allreduce: 4.91 | step: 43.10
- 41%|████      | 2366/5800 [6:38:36<6:34:35,  6.89s/it]                                                       {'loss': 0.0181, 'grad_norm': 2.030762195587158, 'learning_rate': 2.679724040631623e-05, 'epoch': 20.4}
- 41%|████      | 2366/5800 [6:38:36<6:34:35,  6.89s/it]score1 tensor([[0.5000],
-        [0.4824],
-        [0.5430],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4277, 0.6289, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:13,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:25:13,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.80 | bwd_microstep: 4617.80 | bwd_inner_microstep: 4612.38 | bwd_allreduce_microstep: 5.33 | step_microstep: 41.44
-[2025-01-25 15:25:13,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.76 | bwd: 4617.84 | bwd_inner: 4612.38 | bwd_allreduce: 5.38 | step: 41.44
- 41%|████      | 2367/5800 [6:38:43<6:34:42,  6.90s/it]                                                       {'loss': 0.0474, 'grad_norm': 0.6408646702766418, 'learning_rate': 2.6786736004339954e-05, 'epoch': 20.41}
- 41%|████      | 2367/5800 [6:38:43<6:34:42,  6.90s/it]score1 tensor([[0.6133],
-        [0.4238],
-        [0.4961],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4043, 0.3262, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0698, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:20,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:25:20,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.91 | bwd_microstep: 4621.60 | bwd_inner_microstep: 4616.63 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.50
-[2025-01-25 15:25:20,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.87 | bwd: 4621.62 | bwd_inner: 4616.63 | bwd_allreduce: 4.92 | step: 43.50
- 41%|████      | 2368/5800 [6:38:50<6:34:48,  6.90s/it]                                                       {'loss': 0.0698, 'grad_norm': 3.9228312969207764, 'learning_rate': 2.677622948614163e-05, 'epoch': 20.41}
- 41%|████      | 2368/5800 [6:38:50<6:34:48,  6.90s/it]score1 tensor([[0.6055],
-        [0.5195],
-        [0.4395],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5000, 0.4004, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:27,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 15:25:27,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.01 | bwd_microstep: 4619.37 | bwd_inner_microstep: 4613.81 | bwd_allreduce_microstep: 5.45 | step_microstep: 45.44
-[2025-01-25 15:25:27,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.98 | bwd: 4619.40 | bwd_inner: 4613.81 | bwd_allreduce: 5.52 | step: 45.45
- 41%|████      | 2369/5800 [6:38:57<6:34:39,  6.90s/it]                                                       {'loss': 0.0449, 'grad_norm': 8.473833084106445, 'learning_rate': 2.6765720854997376e-05, 'epoch': 20.42}
- 41%|████      | 2369/5800 [6:38:57<6:34:39,  6.90s/it]score1 tensor([[0.5508],
-        [0.4238],
-        [0.5273],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4023, 0.4980, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:34,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.37
-[2025-01-25 15:25:34,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4622.42 | bwd_inner_microstep: 4615.63 | bwd_allreduce_microstep: 6.68 | step_microstep: 46.85
-[2025-01-25 15:25:34,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4622.45 | bwd_inner: 4615.63 | bwd_allreduce: 6.74 | step: 46.86
- 41%|████      | 2370/5800 [6:39:04<6:34:39,  6.90s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.0987749099731445, 'learning_rate': 2.675521011418397e-05, 'epoch': 20.43}
- 41%|████      | 2370/5800 [6:39:04<6:34:39,  6.90s/it]score1 tensor([[0.4648],
-        [0.5391],
-        [0.4707],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5234, 0.4570, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:40,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 15:25:40,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.29 | bwd_microstep: 4629.09 | bwd_inner_microstep: 4624.21 | bwd_allreduce_microstep: 4.79 | step_microstep: 44.07
-[2025-01-25 15:25:40,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.22 | bwd: 4629.11 | bwd_inner: 4624.21 | bwd_allreduce: 4.83 | step: 44.08
- 41%|████      | 2371/5800 [6:39:10<6:34:44,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 4.059065341949463, 'learning_rate': 2.674469726697883e-05, 'epoch': 20.44}
- 41%|████      | 2371/5800 [6:39:10<6:34:44,  6.91s/it]score1 tensor([[0.5898],
-        [0.5000],
-        [0.4648],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5156, 0.4824, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:47,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 15:25:47,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4627.07 | bwd_inner_microstep: 4622.17 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.51
-[2025-01-25 15:25:47,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.44 | bwd: 4627.10 | bwd_inner: 4622.17 | bwd_allreduce: 4.86 | step: 43.52
- 41%|████      | 2372/5800 [6:39:17<6:34:37,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 4.339598655700684, 'learning_rate': 2.673418231666005e-05, 'epoch': 20.45}
- 41%|████      | 2372/5800 [6:39:17<6:34:37,  6.91s/it]score1 tensor([[0.4043],
-        [0.4180],
-        [0.4199],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4473, 0.4609, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:25:54,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 15:25:54,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.51 | bwd_microstep: 4628.50 | bwd_inner_microstep: 4623.75 | bwd_allreduce_microstep: 4.66 | step_microstep: 43.00
-[2025-01-25 15:25:54,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.46 | bwd: 4628.53 | bwd_inner: 4623.75 | bwd_allreduce: 4.71 | step: 43.02
- 41%|████      | 2373/5800 [6:39:24<6:34:32,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 7.738085746765137, 'learning_rate': 2.6723665266506378e-05, 'epoch': 20.46}
- 41%|████      | 2373/5800 [6:39:24<6:34:32,  6.91s/it]score1 tensor([[0.5625],
-        [0.5859],
-        [0.5742],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6133, 0.5625, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:01,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 15:26:01,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.33 | bwd_microstep: 4617.65 | bwd_inner_microstep: 4612.30 | bwd_allreduce_microstep: 5.25 | step_microstep: 45.90
-[2025-01-25 15:26:01,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.30 | bwd: 4617.67 | bwd_inner: 4612.30 | bwd_allreduce: 5.30 | step: 45.90
- 41%|████      | 2374/5800 [6:39:31<6:34:21,  6.91s/it]                                                       {'loss': 0.0249, 'grad_norm': 0.8493212461471558, 'learning_rate': 2.671314611979721e-05, 'epoch': 20.47}
- 41%|████      | 2374/5800 [6:39:31<6:34:21,  6.91s/it]score1 tensor([[0.4492],
-        [0.5586],
-        [0.5469],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5312, 0.5156, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:08,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 15:26:08,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.08 | bwd_microstep: 4619.40 | bwd_inner_microstep: 4614.15 | bwd_allreduce_microstep: 5.15 | step_microstep: 45.18
-[2025-01-25 15:26:08,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.04 | bwd: 4619.43 | bwd_inner: 4614.15 | bwd_allreduce: 5.20 | step: 45.20
- 41%|████      | 2375/5800 [6:39:38<6:34:08,  6.90s/it]                                                       {'loss': 0.0317, 'grad_norm': 0.9984537363052368, 'learning_rate': 2.6702624879812597e-05, 'epoch': 20.47}
- 41%|████      | 2375/5800 [6:39:38<6:34:08,  6.90s/it]score1 tensor([[0.5703],
-        [0.5312],
-        [0.5703],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5586, 0.5625, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:15,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 15:26:15,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.37 | bwd_microstep: 4615.44 | bwd_inner_microstep: 4610.34 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.96
-[2025-01-25 15:26:15,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.34 | bwd: 4615.47 | bwd_inner: 4610.34 | bwd_allreduce: 5.06 | step: 43.96
- 41%|████      | 2376/5800 [6:39:45<6:33:52,  6.90s/it]                                                       {'loss': 0.0132, 'grad_norm': 1.0850902795791626, 'learning_rate': 2.669210154983325e-05, 'epoch': 20.48}
- 41%|████      | 2376/5800 [6:39:45<6:33:52,  6.90s/it]score1 tensor([[0.5117],
-        [0.6602],
-        [0.4043],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.7070, 0.4277, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:22,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 15:26:22,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.48 | bwd_microstep: 4619.68 | bwd_inner_microstep: 4614.41 | bwd_allreduce_microstep: 5.17 | step_microstep: 42.08
-[2025-01-25 15:26:22,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.44 | bwd: 4619.71 | bwd_inner: 4614.41 | bwd_allreduce: 5.22 | step: 42.08
- 41%|████      | 2377/5800 [6:39:52<6:33:40,  6.90s/it]                                                       {'loss': 0.042, 'grad_norm': 8.262893676757812, 'learning_rate': 2.668157613314052e-05, 'epoch': 20.49}
- 41%|████      | 2377/5800 [6:39:52<6:33:40,  6.90s/it]score1 tensor([[0.3496],
-        [0.5547],
-        [0.2949],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5742, 0.1787, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0476, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:29,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:26:29,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.94 | bwd_microstep: 4615.98 | bwd_inner_microstep: 4610.40 | bwd_allreduce_microstep: 5.45 | step_microstep: 43.84
-[2025-01-25 15:26:29,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4616.01 | bwd_inner: 4610.40 | bwd_allreduce: 5.51 | step: 43.85
- 41%|████      | 2378/5800 [6:39:59<6:33:32,  6.90s/it]                                                       {'loss': 0.0476, 'grad_norm': 4.655627727508545, 'learning_rate': 2.6671048633016416e-05, 'epoch': 20.5}
- 41%|████      | 2378/5800 [6:39:59<6:33:32,  6.90s/it]score1 tensor([[0.5586],
-        [0.4336],
-        [0.5352],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4922, 0.5898, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:36,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 15:26:36,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.22 | bwd_microstep: 4624.31 | bwd_inner_microstep: 4618.71 | bwd_allreduce_microstep: 5.52 | step_microstep: 45.47
-[2025-01-25 15:26:36,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.18 | bwd: 4624.34 | bwd_inner: 4618.71 | bwd_allreduce: 5.56 | step: 45.48
- 41%|████      | 2379/5800 [6:40:06<6:33:33,  6.90s/it]                                                       {'loss': 0.0664, 'grad_norm': 3.9889822006225586, 'learning_rate': 2.6660519052743595e-05, 'epoch': 20.51}
- 41%|████      | 2379/5800 [6:40:06<6:33:33,  6.90s/it]score1 tensor([[0.5469],
-        [0.4512],
-        [0.5469],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4688, 0.5273, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:43,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:26:43,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.10 | bwd_microstep: 4573.87 | bwd_inner_microstep: 4568.76 | bwd_allreduce_microstep: 5.01 | step_microstep: 41.59
-[2025-01-25 15:26:43,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.06 | bwd: 4573.89 | bwd_inner: 4568.76 | bwd_allreduce: 5.06 | step: 41.60
- 41%|████      | 2380/5800 [6:40:13<6:32:34,  6.89s/it]                                                       {'loss': 0.02, 'grad_norm': 2.111675977706909, 'learning_rate': 2.6649987395605364e-05, 'epoch': 20.52}
- 41%|████      | 2380/5800 [6:40:13<6:32:34,  6.89s/it]score1 tensor([[0.5039],
-        [0.6172],
-        [0.3965],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.6016, 0.3672, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:49,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 15:26:49,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.07 | bwd_microstep: 4619.55 | bwd_inner_microstep: 4614.40 | bwd_allreduce_microstep: 5.07 | step_microstep: 40.52
-[2025-01-25 15:26:49,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.02 | bwd: 4619.57 | bwd_inner: 4614.40 | bwd_allreduce: 5.10 | step: 40.53
- 41%|████      | 2381/5800 [6:40:19<6:32:37,  6.89s/it]                                                       {'loss': 0.0269, 'grad_norm': 8.235405921936035, 'learning_rate': 2.6639453664885674e-05, 'epoch': 20.53}
- 41%|████      | 2381/5800 [6:40:19<6:32:37,  6.89s/it]score1 tensor([[0.5039],
-        [0.5000],
-        [0.5781],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4922, 0.5742, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:26:56,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:26:56,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.67 | bwd_microstep: 4613.49 | bwd_inner_microstep: 4608.66 | bwd_allreduce_microstep: 4.73 | step_microstep: 52.10
-[2025-01-25 15:26:56,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.64 | bwd: 4613.52 | bwd_inner: 4608.66 | bwd_allreduce: 4.78 | step: 52.10
- 41%|████      | 2382/5800 [6:40:26<6:32:40,  6.89s/it]                                                       {'loss': 0.021, 'grad_norm': 8.630407333374023, 'learning_rate': 2.6628917863869128e-05, 'epoch': 20.53}
- 41%|████      | 2382/5800 [6:40:26<6:32:40,  6.89s/it]score1 tensor([[0.4395],
-        [0.5195],
-        [0.4512],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5000, 0.4160, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:03,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 15:27:03,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.15 | bwd_microstep: 4616.92 | bwd_inner_microstep: 4611.25 | bwd_allreduce_microstep: 5.55 | step_microstep: 46.69
-[2025-01-25 15:27:03,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.12 | bwd: 4616.94 | bwd_inner: 4611.25 | bwd_allreduce: 5.61 | step: 46.70
- 41%|████      | 2383/5800 [6:40:33<6:32:37,  6.89s/it]                                                       {'loss': 0.0288, 'grad_norm': 4.134047031402588, 'learning_rate': 2.6618379995840968e-05, 'epoch': 20.54}
- 41%|████      | 2383/5800 [6:40:33<6:32:37,  6.89s/it]score1 tensor([[0.5898],
-        [0.5469],
-        [0.5117],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5430, 0.5391, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:10,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 15:27:10,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.75 | bwd_microstep: 4620.49 | bwd_inner_microstep: 4615.45 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.65
-[2025-01-25 15:27:10,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.70 | bwd: 4620.51 | bwd_inner: 4615.45 | bwd_allreduce: 4.98 | step: 42.66
- 41%|████      | 2384/5800 [6:40:40<6:32:41,  6.90s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.389138698577881, 'learning_rate': 2.6607840064087088e-05, 'epoch': 20.55}
- 41%|████      | 2384/5800 [6:40:40<6:32:41,  6.90s/it]score1 tensor([[0.3887],
-        [0.5547],
-        [0.5156],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.5586, 0.4941, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:17,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.84 | optimizer_step: 4.36
-[2025-01-25 15:27:17,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.94 | bwd_microstep: 4623.28 | bwd_inner_microstep: 4618.15 | bwd_allreduce_microstep: 5.00 | step_microstep: 48.09
-[2025-01-25 15:27:17,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.91 | bwd: 4623.30 | bwd_inner: 4618.15 | bwd_allreduce: 5.07 | step: 48.08
- 41%|████      | 2385/5800 [6:40:47<6:32:42,  6.90s/it]                                                       {'loss': 0.0215, 'grad_norm': 0.6952183842658997, 'learning_rate': 2.6597298071894024e-05, 'epoch': 20.56}
- 41%|████      | 2385/5800 [6:40:47<6:32:42,  6.90s/it]score1 tensor([[0.4258],
-        [0.6289],
-        [0.5625],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.6094, 0.5977, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:24,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 15:27:24,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.28 | bwd_microstep: 4571.09 | bwd_inner_microstep: 4565.86 | bwd_allreduce_microstep: 5.16 | step_microstep: 48.23
-[2025-01-25 15:27:24,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.23 | bwd: 4571.11 | bwd_inner: 4565.86 | bwd_allreduce: 5.19 | step: 48.24
- 41%|████      | 2386/5800 [6:40:54<6:31:46,  6.89s/it]                                                       {'loss': 0.0151, 'grad_norm': 1.785008430480957, 'learning_rate': 2.658675402254894e-05, 'epoch': 20.57}
- 41%|████      | 2386/5800 [6:40:54<6:31:46,  6.89s/it]score1 tensor([[0.4746],
-        [0.5703],
-        [0.4414],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5352, 0.4062, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:31,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 15:27:31,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.29 | bwd_microstep: 4626.32 | bwd_inner_microstep: 4621.07 | bwd_allreduce_microstep: 5.13 | step_microstep: 43.91
-[2025-01-25 15:27:31,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.26 | bwd: 4626.35 | bwd_inner: 4621.07 | bwd_allreduce: 5.19 | step: 43.92
- 41%|████      | 2387/5800 [6:41:01<6:32:00,  6.89s/it]                                                       {'loss': 0.0342, 'grad_norm': 4.099664211273193, 'learning_rate': 2.6576207919339668e-05, 'epoch': 20.58}
- 41%|████      | 2387/5800 [6:41:01<6:32:00,  6.89s/it]score1 tensor([[0.4785],
-        [0.3945],
-        [0.5664],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.2812, 0.5664, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:38,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 15:27:38,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.37 | bwd_microstep: 4584.25 | bwd_inner_microstep: 4578.50 | bwd_allreduce_microstep: 5.66 | step_microstep: 49.95
-[2025-01-25 15:27:38,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.34 | bwd: 4584.27 | bwd_inner: 4578.50 | bwd_allreduce: 5.70 | step: 49.96
- 41%|████      | 2388/5800 [6:41:08<6:31:30,  6.88s/it]                                                       {'loss': 0.0356, 'grad_norm': 1.6742258071899414, 'learning_rate': 2.6565659765554663e-05, 'epoch': 20.59}
- 41%|████      | 2388/5800 [6:41:08<6:31:30,  6.88s/it]score1 tensor([[0.5781],
-        [0.4277],
-        [0.4434],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4570, 0.4473, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:45,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 15:27:45,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.46 | bwd_microstep: 4626.78 | bwd_inner_microstep: 4621.06 | bwd_allreduce_microstep: 5.64 | step_microstep: 44.92
-[2025-01-25 15:27:45,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.42 | bwd: 4626.82 | bwd_inner: 4621.06 | bwd_allreduce: 5.68 | step: 44.93
- 41%|████      | 2389/5800 [6:41:15<6:31:44,  6.89s/it]                                                       {'loss': 0.0161, 'grad_norm': 1.1205068826675415, 'learning_rate': 2.6555109564483007e-05, 'epoch': 20.59}
- 41%|████      | 2389/5800 [6:41:15<6:31:44,  6.89s/it]score1 tensor([[0.5898],
-        [0.4629],
-        [0.4551],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5078, 0.4492, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:51,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 15:27:51,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.65 | bwd_microstep: 4617.42 | bwd_inner_microstep: 4612.18 | bwd_allreduce_microstep: 5.14 | step_microstep: 44.40
-[2025-01-25 15:27:51,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.60 | bwd: 4617.45 | bwd_inner: 4612.18 | bwd_allreduce: 5.20 | step: 44.41
- 41%|████      | 2390/5800 [6:41:21<6:31:40,  6.89s/it]                                                       {'loss': 0.0322, 'grad_norm': 4.5324907302856445, 'learning_rate': 2.654455731941446e-05, 'epoch': 20.6}
- 41%|████      | 2390/5800 [6:41:21<6:31:40,  6.89s/it]score1 tensor([[0.3496],
-        [0.4961],
-        [0.4434],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.4980, 0.4473, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:27:58,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 15:27:58,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.41 | bwd_microstep: 4571.91 | bwd_inner_microstep: 4566.91 | bwd_allreduce_microstep: 4.90 | step_microstep: 45.27
-[2025-01-25 15:27:58,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.37 | bwd: 4571.94 | bwd_inner: 4566.91 | bwd_allreduce: 4.96 | step: 45.28
- 41%|████      | 2391/5800 [6:41:28<6:30:49,  6.88s/it]                                                       {'loss': 0.002, 'grad_norm': 5.782144069671631, 'learning_rate': 2.6534003033639373e-05, 'epoch': 20.61}
- 41%|████      | 2391/5800 [6:41:28<6:30:49,  6.88s/it]score1 tensor([[0.5273],
-        [0.6016],
-        [0.4395],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6445, 0.4180, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:05,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 15:28:05,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.60 | bwd_microstep: 4616.30 | bwd_inner_microstep: 4610.70 | bwd_allreduce_microstep: 5.50 | step_microstep: 48.46
-[2025-01-25 15:28:05,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.57 | bwd: 4616.32 | bwd_inner: 4610.70 | bwd_allreduce: 5.55 | step: 48.47
- 41%|████      | 2392/5800 [6:41:35<6:31:01,  6.88s/it]                                                       {'loss': 0.0239, 'grad_norm': 4.605490684509277, 'learning_rate': 2.652344671044877e-05, 'epoch': 20.62}
- 41%|████      | 2392/5800 [6:41:35<6:31:01,  6.88s/it]score1 tensor([[0.5781],
-        [0.5039],
-        [0.5234],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5391, 0.5547, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:12,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 15:28:12,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.06 | bwd_microstep: 4618.29 | bwd_inner_microstep: 4612.68 | bwd_allreduce_microstep: 5.49 | step_microstep: 48.00
-[2025-01-25 15:28:12,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.01 | bwd: 4618.32 | bwd_inner: 4612.68 | bwd_allreduce: 5.55 | step: 48.01
- 41%|████▏     | 2393/5800 [6:41:42<6:31:18,  6.89s/it]                                                       {'loss': 0.0371, 'grad_norm': 4.55203914642334, 'learning_rate': 2.6512888353134296e-05, 'epoch': 20.63}
- 41%|████▏     | 2393/5800 [6:41:42<6:31:18,  6.89s/it]score1 tensor([[0.5234],
-        [0.5117],
-        [0.3613],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5195, 0.3105, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:19,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 15:28:19,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.08 | bwd_microstep: 4615.87 | bwd_inner_microstep: 4610.77 | bwd_allreduce_microstep: 5.00 | step_microstep: 46.69
-[2025-01-25 15:28:19,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.04 | bwd: 4615.90 | bwd_inner: 4610.77 | bwd_allreduce: 5.06 | step: 46.71
- 41%|████▏     | 2394/5800 [6:41:49<6:31:18,  6.89s/it]                                                       {'loss': 0.0303, 'grad_norm': 4.712932109832764, 'learning_rate': 2.6502327964988217e-05, 'epoch': 20.64}
- 41%|████▏     | 2394/5800 [6:41:49<6:31:18,  6.89s/it]score1 tensor([[0.5039],
-        [0.4043],
-        [0.4492],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3750, 0.4941, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:26,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:28:26,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.02 | bwd_microstep: 4616.19 | bwd_inner_microstep: 4610.41 | bwd_allreduce_microstep: 5.64 | step_microstep: 45.42
-[2025-01-25 15:28:26,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.97 | bwd: 4616.23 | bwd_inner: 4610.41 | bwd_allreduce: 5.72 | step: 45.43
- 41%|████▏     | 2395/5800 [6:41:56<6:31:14,  6.89s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.191990852355957, 'learning_rate': 2.6491765549303455e-05, 'epoch': 20.65}
- 41%|████▏     | 2395/5800 [6:41:56<6:31:14,  6.89s/it]score1 tensor([[0.5586],
-        [0.5156],
-        [0.5156],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4727, 0.5312, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:33,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 15:28:33,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.03 | bwd_microstep: 4624.35 | bwd_inner_microstep: 4618.98 | bwd_allreduce_microstep: 5.28 | step_microstep: 44.96
-[2025-01-25 15:28:33,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.05 | bwd: 4624.38 | bwd_inner: 4618.98 | bwd_allreduce: 5.33 | step: 44.96
- 41%|████▏     | 2396/5800 [6:42:03<6:31:20,  6.90s/it]                                                       {'loss': 0.0308, 'grad_norm': 0.48491981625556946, 'learning_rate': 2.6481201109373555e-05, 'epoch': 20.66}
- 41%|████▏     | 2396/5800 [6:42:03<6:31:20,  6.90s/it]score1 tensor([[0.5469],
-        [0.5000],
-        [0.4434],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5000, 0.4727, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:40,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:28:40,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.66 | bwd_microstep: 4563.77 | bwd_inner_microstep: 4557.86 | bwd_allreduce_microstep: 5.77 | step_microstep: 47.54
-[2025-01-25 15:28:40,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.63 | bwd: 4563.80 | bwd_inner: 4557.86 | bwd_allreduce: 5.85 | step: 47.55
- 41%|████▏     | 2397/5800 [6:42:10<6:30:18,  6.88s/it]                                                       {'loss': 0.0205, 'grad_norm': 2.3277645111083984, 'learning_rate': 2.6470634648492687e-05, 'epoch': 20.66}
- 41%|████▏     | 2397/5800 [6:42:10<6:30:18,  6.88s/it]score1 tensor([[0.4668],
-        [0.4629],
-        [0.4766],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4512, 0.4121, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:47,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 15:28:47,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.12 | bwd_microstep: 4615.06 | bwd_inner_microstep: 4609.94 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.69
-[2025-01-25 15:28:47,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.07 | bwd: 4615.09 | bwd_inner: 4609.94 | bwd_allreduce: 5.06 | step: 44.70
- 41%|████▏     | 2398/5800 [6:42:17<6:30:27,  6.89s/it]                                                       {'loss': 0.0317, 'grad_norm': 8.254866600036621, 'learning_rate': 2.6460066169955668e-05, 'epoch': 20.67}
- 41%|████▏     | 2398/5800 [6:42:17<6:30:27,  6.89s/it]score1 tensor([[0.5938],
-        [0.4375],
-        [0.5508],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.3730, 0.4941, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0596, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:28:53,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 15:28:53,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4616.85 | bwd_inner_microstep: 4611.54 | bwd_allreduce_microstep: 5.21 | step_microstep: 43.45
-[2025-01-25 15:28:53,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.68 | bwd: 4616.88 | bwd_inner: 4611.54 | bwd_allreduce: 5.25 | step: 43.46
- 41%|████▏     | 2399/5800 [6:42:23<6:30:33,  6.89s/it]                                                       {'loss': 0.0596, 'grad_norm': 8.580944061279297, 'learning_rate': 2.6449495677057915e-05, 'epoch': 20.68}
- 41%|████▏     | 2399/5800 [6:42:23<6:30:33,  6.89s/it]score1 tensor([[0.4902],
-        [0.5352],
-        [0.5117],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5625, 0.5195, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:00,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 15:29:00,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.66 | bwd_microstep: 4619.24 | bwd_inner_microstep: 4614.52 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.12
-[2025-01-25 15:29:00,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.63 | bwd: 4619.26 | bwd_inner: 4614.52 | bwd_allreduce: 4.67 | step: 42.12
- 41%|████▏     | 2400/5800 [6:42:30<6:30:35,  6.89s/it]                                                       {'loss': 0.0229, 'grad_norm': 3.999504327774048, 'learning_rate': 2.6438923173095504e-05, 'epoch': 20.69}
- 41%|████▏     | 2400/5800 [6:42:30<6:30:35,  6.89s/it]score1 tensor([[0.4902],
-        [0.6641],
-        [0.5234],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6602, 0.4844, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:07,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 15:29:07,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4619.77 | bwd_inner_microstep: 4615.04 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.67
-[2025-01-25 15:29:07,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.14 | bwd: 4619.80 | bwd_inner: 4615.05 | bwd_allreduce: 4.68 | step: 42.68
- 41%|████▏     | 2401/5800 [6:42:37<6:30:30,  6.89s/it]                                                       {'loss': 0.042, 'grad_norm': 8.769235610961914, 'learning_rate': 2.6428348661365125e-05, 'epoch': 20.7}
- 41%|████▏     | 2401/5800 [6:42:37<6:30:30,  6.89s/it]score1 tensor([[0.5703],
-        [0.6172],
-        [0.4766],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5781, 0.4551, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:14,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:29:14,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.14 | bwd_microstep: 4624.06 | bwd_inner_microstep: 4618.87 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.20
-[2025-01-25 15:29:14,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.10 | bwd: 4624.08 | bwd_inner: 4618.87 | bwd_allreduce: 5.15 | step: 44.21
- 41%|████▏     | 2402/5800 [6:42:44<6:30:27,  6.89s/it]                                                       {'loss': 0.0229, 'grad_norm': 8.583423614501953, 'learning_rate': 2.6417772145164096e-05, 'epoch': 20.71}
- 41%|████▏     | 2402/5800 [6:42:44<6:30:27,  6.89s/it]score1 tensor([[0.5938],
-        [0.5312],
-        [0.6289],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4688, 0.5625, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:21,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 15:29:21,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.14 | bwd_microstep: 4621.68 | bwd_inner_microstep: 4616.95 | bwd_allreduce_microstep: 4.65 | step_microstep: 43.33
-[2025-01-25 15:29:21,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.11 | bwd: 4621.70 | bwd_inner: 4616.95 | bwd_allreduce: 4.69 | step: 43.33
- 41%|████▏     | 2403/5800 [6:42:51<6:30:14,  6.89s/it]                                                       {'loss': 0.0425, 'grad_norm': 8.733064651489258, 'learning_rate': 2.640719362779035e-05, 'epoch': 20.72}
- 41%|████▏     | 2403/5800 [6:42:51<6:30:14,  6.89s/it]score1 tensor([[0.5391],
-        [0.4844],
-        [0.4980],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4648, 0.4980, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:28,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:29:28,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4571.47 | bwd_inner_microstep: 4566.52 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.48
-[2025-01-25 15:29:28,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.07 | bwd: 4571.50 | bwd_inner: 4566.52 | bwd_allreduce: 4.90 | step: 43.48
- 41%|████▏     | 2404/5800 [6:42:58<6:29:16,  6.88s/it]                                                       {'loss': 0.0146, 'grad_norm': 2.004951000213623, 'learning_rate': 2.6396613112542455e-05, 'epoch': 20.72}
- 41%|████▏     | 2404/5800 [6:42:58<6:29:16,  6.88s/it]score1 tensor([[0.4805],
-        [0.5234],
-        [0.4805],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4609, 0.5000, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:35,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 15:29:35,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.70 | bwd_microstep: 4616.45 | bwd_inner_microstep: 4611.76 | bwd_allreduce_microstep: 4.60 | step_microstep: 43.00
-[2025-01-25 15:29:35,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.66 | bwd: 4616.54 | bwd_inner: 4611.76 | bwd_allreduce: 4.64 | step: 43.00
- 41%|████▏     | 2405/5800 [6:43:05<6:29:26,  6.88s/it]                                                       {'loss': 0.0488, 'grad_norm': 4.42085599899292, 'learning_rate': 2.63860306027196e-05, 'epoch': 20.73}
- 41%|████▏     | 2405/5800 [6:43:05<6:29:26,  6.88s/it]score1 tensor([[0.5664],
-        [0.3809],
-        [0.4199],
-        [0.6914]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.3789, 0.4512, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0552, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:42,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 15:29:42,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.00 | bwd_microstep: 4616.79 | bwd_inner_microstep: 4611.49 | bwd_allreduce_microstep: 5.21 | step_microstep: 48.51
-[2025-01-25 15:29:42,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.96 | bwd: 4616.81 | bwd_inner: 4611.49 | bwd_allreduce: 5.26 | step: 48.51
- 41%|████▏     | 2406/5800 [6:43:12<6:29:37,  6.89s/it]                                                       {'loss': 0.0552, 'grad_norm': 1.6194485425949097, 'learning_rate': 2.63754461016216e-05, 'epoch': 20.74}
- 41%|████▏     | 2406/5800 [6:43:12<6:29:37,  6.89s/it]score1 tensor([[0.4688],
-        [0.6289],
-        [0.5898],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.6562, 0.6445, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:49,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 15:29:49,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4617.09 | bwd_inner_microstep: 4612.43 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.03
-[2025-01-25 15:29:49,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.79 | bwd: 4617.11 | bwd_inner: 4612.43 | bwd_allreduce: 4.60 | step: 42.04
- 42%|████▏     | 2407/5800 [6:43:19<6:29:35,  6.89s/it]                                                       {'loss': 0.0293, 'grad_norm': 5.130281448364258, 'learning_rate': 2.6364859612548884e-05, 'epoch': 20.75}
- 42%|████▏     | 2407/5800 [6:43:19<6:29:35,  6.89s/it]score1 tensor([[0.6055],
-        [0.4902],
-        [0.3906],
-        [0.3379]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5469, 0.4199, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:29:55,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 15:29:55,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.52 | bwd_microstep: 4619.68 | bwd_inner_microstep: 4615.05 | bwd_allreduce_microstep: 4.55 | step_microstep: 43.85
-[2025-01-25 15:29:55,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.48 | bwd: 4619.70 | bwd_inner: 4615.05 | bwd_allreduce: 4.58 | step: 43.86
- 42%|████▏     | 2408/5800 [6:43:25<6:29:29,  6.89s/it]                                                       {'loss': 0.042, 'grad_norm': 7.928564548492432, 'learning_rate': 2.6354271138802493e-05, 'epoch': 20.76}
- 42%|████▏     | 2408/5800 [6:43:25<6:29:29,  6.89s/it]score1 tensor([[0.5781],
-        [0.4336],
-        [0.4941],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4883, 0.6211, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0615, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:02,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 15:30:02,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.70 | bwd_microstep: 4616.23 | bwd_inner_microstep: 4610.91 | bwd_allreduce_microstep: 5.22 | step_microstep: 46.83
-[2025-01-25 15:30:02,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.67 | bwd: 4616.26 | bwd_inner: 4610.91 | bwd_allreduce: 5.27 | step: 46.84
- 42%|████▏     | 2409/5800 [6:43:32<6:29:26,  6.89s/it]                                                       {'loss': 0.0615, 'grad_norm': 4.453514575958252, 'learning_rate': 2.634368068368411e-05, 'epoch': 20.77}
- 42%|████▏     | 2409/5800 [6:43:32<6:29:26,  6.89s/it]score1 tensor([[0.5820],
-        [0.4590],
-        [0.4590],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4453, 0.4766, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:09,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:30:09,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.22 | bwd_microstep: 4641.51 | bwd_inner_microstep: 4636.25 | bwd_allreduce_microstep: 5.14 | step_microstep: 43.37
-[2025-01-25 15:30:09,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.18 | bwd: 4641.54 | bwd_inner: 4636.25 | bwd_allreduce: 5.20 | step: 43.37
- 42%|████▏     | 2410/5800 [6:43:39<6:29:54,  6.90s/it]                                                       {'loss': 0.0396, 'grad_norm': 4.119994163513184, 'learning_rate': 2.6333088250496012e-05, 'epoch': 20.78}
- 42%|████▏     | 2410/5800 [6:43:39<6:29:54,  6.90s/it]score1 tensor([[0.4160],
-        [0.4629],
-        [0.4824],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5078, 0.4961, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:16,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:30:16,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.05 | bwd_microstep: 4623.48 | bwd_inner_microstep: 4618.46 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.62
-[2025-01-25 15:30:16,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.99 | bwd: 4623.50 | bwd_inner: 4618.45 | bwd_allreduce: 4.97 | step: 44.63
- 42%|████▏     | 2411/5800 [6:43:46<6:29:47,  6.90s/it]                                                       {'loss': 0.02, 'grad_norm': 4.0472493171691895, 'learning_rate': 2.6322493842541113e-05, 'epoch': 20.78}
- 42%|████▏     | 2411/5800 [6:43:46<6:29:47,  6.90s/it]score1 tensor([[0.4609],
-        [0.4727],
-        [0.4707],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5430, 0.5352, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:23,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:30:23,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.80 | bwd_microstep: 4616.77 | bwd_inner_microstep: 4611.70 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.73
-[2025-01-25 15:30:23,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.76 | bwd: 4616.79 | bwd_inner: 4611.70 | bwd_allreduce: 5.01 | step: 44.74
- 42%|████▏     | 2412/5800 [6:43:53<6:29:31,  6.90s/it]                                                       {'loss': 0.041, 'grad_norm': 0.5105722546577454, 'learning_rate': 2.631189746312293e-05, 'epoch': 20.79}
- 42%|████▏     | 2412/5800 [6:43:53<6:29:31,  6.90s/it]score1 tensor([[0.4414],
-        [0.4258],
-        [0.5391],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4961, 0.5273, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:30,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 15:30:30,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.96 | bwd_microstep: 4618.47 | bwd_inner_microstep: 4612.96 | bwd_allreduce_microstep: 5.39 | step_microstep: 48.48
-[2025-01-25 15:30:30,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4618.50 | bwd_inner: 4612.96 | bwd_allreduce: 5.46 | step: 48.49
- 42%|████▏     | 2413/5800 [6:44:00<6:29:32,  6.90s/it]                                                       {'loss': 0.0376, 'grad_norm': 0.6878793835639954, 'learning_rate': 2.6301299115545596e-05, 'epoch': 20.8}
- 42%|████▏     | 2413/5800 [6:44:00<6:29:32,  6.90s/it]score1 tensor([[0.5508],
-        [0.5938],
-        [0.3906],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5547, 0.4062, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:37,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 15:30:37,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.05 | bwd_microstep: 4621.88 | bwd_inner_microstep: 4616.52 | bwd_allreduce_microstep: 5.24 | step_microstep: 44.64
-[2025-01-25 15:30:37,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.02 | bwd: 4621.92 | bwd_inner: 4616.52 | bwd_allreduce: 5.29 | step: 44.64
- 42%|████▏     | 2414/5800 [6:44:07<6:29:47,  6.91s/it]                                                       {'loss': 0.0239, 'grad_norm': 3.7164559364318848, 'learning_rate': 2.6290698803113862e-05, 'epoch': 20.81}
- 42%|████▏     | 2414/5800 [6:44:07<6:29:47,  6.91s/it]score1 tensor([[0.4395],
-        [0.3438],
-        [0.5625],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.3711, 0.5391, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:44,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 15:30:44,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.38 | bwd_microstep: 4613.94 | bwd_inner_microstep: 4608.91 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.94
-[2025-01-25 15:30:44,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.33 | bwd: 4613.96 | bwd_inner: 4608.91 | bwd_allreduce: 4.90 | step: 43.95
- 42%|████▏     | 2415/5800 [6:44:14<6:29:35,  6.91s/it]                                                       {'loss': 0.0312, 'grad_norm': 0.9756556153297424, 'learning_rate': 2.6280096529133085e-05, 'epoch': 20.82}
- 42%|████▏     | 2415/5800 [6:44:14<6:29:35,  6.91s/it]score1 tensor([[0.5352],
-        [0.6445],
-        [0.6758],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6055, 0.6016, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:51,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 15:30:51,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.06 | bwd_microstep: 4616.80 | bwd_inner_microstep: 4611.22 | bwd_allreduce_microstep: 5.47 | step_microstep: 47.57
-[2025-01-25 15:30:51,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.02 | bwd: 4616.83 | bwd_inner: 4611.22 | bwd_allreduce: 5.53 | step: 47.58
- 42%|████▏     | 2416/5800 [6:44:21<6:29:25,  6.90s/it]                                                       {'loss': 0.0376, 'grad_norm': 5.060088634490967, 'learning_rate': 2.626949229690924e-05, 'epoch': 20.83}
- 42%|████▏     | 2416/5800 [6:44:21<6:29:25,  6.90s/it]score1 tensor([[0.6094],
-        [0.6562],
-        [0.4121],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6055, 0.4199, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:30:58,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 15:30:58,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.54 | bwd_microstep: 4562.91 | bwd_inner_microstep: 4557.55 | bwd_allreduce_microstep: 5.25 | step_microstep: 43.52
-[2025-01-25 15:30:58,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.49 | bwd: 4562.94 | bwd_inner: 4557.55 | bwd_allreduce: 5.31 | step: 43.53
- 42%|████▏     | 2417/5800 [6:44:28<6:28:18,  6.89s/it]                                                       {'loss': 0.0234, 'grad_norm': 2.7174363136291504, 'learning_rate': 2.6258886109748912e-05, 'epoch': 20.84}
- 42%|████▏     | 2417/5800 [6:44:28<6:28:18,  6.89s/it]score1 tensor([[0.5977],
-        [0.4082],
-        [0.4746],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.3750, 0.3750, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:04,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:31:04,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.71 | bwd_microstep: 4623.61 | bwd_inner_microstep: 4618.62 | bwd_allreduce_microstep: 4.90 | step_microstep: 42.70
-[2025-01-25 15:31:04,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.68 | bwd: 4623.63 | bwd_inner: 4618.62 | bwd_allreduce: 4.94 | step: 42.71
- 42%|████▏     | 2418/5800 [6:44:34<6:28:30,  6.89s/it]                                                       {'loss': 0.0479, 'grad_norm': 3.8869309425354004, 'learning_rate': 2.6248277970959296e-05, 'epoch': 20.84}
- 42%|████▏     | 2418/5800 [6:44:34<6:28:30,  6.89s/it]score1 tensor([[0.5039],
-        [0.4844],
-        [0.5078],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5039, 0.4375, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:11,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 15:31:11,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4623.31 | bwd_inner_microstep: 4618.49 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.96
-[2025-01-25 15:31:11,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.35 | bwd: 4623.33 | bwd_inner: 4618.49 | bwd_allreduce: 4.78 | step: 42.97
- 42%|████▏     | 2419/5800 [6:44:41<6:28:29,  6.89s/it]                                                       {'loss': 0.042, 'grad_norm': 4.114749908447266, 'learning_rate': 2.62376678838482e-05, 'epoch': 20.85}
- 42%|████▏     | 2419/5800 [6:44:41<6:28:29,  6.89s/it]score1 tensor([[0.6523],
-        [0.5859],
-        [0.5039],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.6211, 0.4609, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:18,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:31:18,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.50 | bwd_microstep: 4614.12 | bwd_inner_microstep: 4608.69 | bwd_allreduce_microstep: 5.29 | step_microstep: 46.88
-[2025-01-25 15:31:18,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.47 | bwd: 4614.15 | bwd_inner: 4608.69 | bwd_allreduce: 5.37 | step: 46.89
- 42%|████▏     | 2420/5800 [6:44:48<6:28:23,  6.89s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.78296422958374, 'learning_rate': 2.6227055851724014e-05, 'epoch': 20.86}
- 42%|████▏     | 2420/5800 [6:44:48<6:28:23,  6.89s/it]score1 tensor([[0.4883],
-        [0.3691],
-        [0.5234],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.3809, 0.4629, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:25,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 15:31:25,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.22 | bwd_microstep: 4611.94 | bwd_inner_microstep: 4607.26 | bwd_allreduce_microstep: 4.59 | step_microstep: 40.51
-[2025-01-25 15:31:25,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.19 | bwd: 4611.96 | bwd_inner: 4607.26 | bwd_allreduce: 4.64 | step: 40.52
- 42%|████▏     | 2421/5800 [6:44:55<6:28:05,  6.89s/it]                                                       {'loss': 0.0347, 'grad_norm': 4.571741580963135, 'learning_rate': 2.6216441877895766e-05, 'epoch': 20.87}
- 42%|████▏     | 2421/5800 [6:44:55<6:28:05,  6.89s/it]score1 tensor([[0.5742],
-        [0.6055],
-        [0.4863],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.6484, 0.4414, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:32,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 15:31:32,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.37 | bwd_microstep: 4623.32 | bwd_inner_microstep: 4618.68 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.08
-[2025-01-25 15:31:32,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.33 | bwd: 4623.35 | bwd_inner: 4618.68 | bwd_allreduce: 4.61 | step: 41.08
- 42%|████▏     | 2422/5800 [6:45:02<6:28:08,  6.89s/it]                                                       {'loss': 0.0532, 'grad_norm': 0.80301433801651, 'learning_rate': 2.6205825965673075e-05, 'epoch': 20.88}
- 42%|████▏     | 2422/5800 [6:45:02<6:28:08,  6.89s/it]score1 tensor([[0.4219],
-        [0.5391],
-        [0.5820],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5547, 0.6523, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:39,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 15:31:39,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.61 | bwd_microstep: 4623.68 | bwd_inner_microstep: 4618.15 | bwd_allreduce_microstep: 5.44 | step_microstep: 43.80
-[2025-01-25 15:31:39,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.58 | bwd: 4623.70 | bwd_inner: 4618.15 | bwd_allreduce: 5.48 | step: 43.81
- 42%|████▏     | 2423/5800 [6:45:09<6:28:06,  6.90s/it]                                                       {'loss': 0.0269, 'grad_norm': 1.0174139738082886, 'learning_rate': 2.6195208118366168e-05, 'epoch': 20.89}
- 42%|████▏     | 2423/5800 [6:45:09<6:28:06,  6.90s/it]score1 tensor([[0.4766],
-        [0.4121],
-        [0.4727],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.3750, 0.4961, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:46,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 15:31:46,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.73 | bwd_microstep: 4611.79 | bwd_inner_microstep: 4607.01 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.34
-[2025-01-25 15:31:46,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.70 | bwd: 4611.81 | bwd_inner: 4607.01 | bwd_allreduce: 4.74 | step: 42.34
- 42%|████▏     | 2424/5800 [6:45:16<6:27:51,  6.89s/it]                                                       {'loss': 0.02, 'grad_norm': 4.284241676330566, 'learning_rate': 2.6184588339285878e-05, 'epoch': 20.9}
- 42%|████▏     | 2424/5800 [6:45:16<6:27:51,  6.89s/it]score1 tensor([[0.4863],
-        [0.5078],
-        [0.4980],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5273, 0.5156, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:31:53,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 15:31:53,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.96 | bwd_microstep: 4611.12 | bwd_inner_microstep: 4605.20 | bwd_allreduce_microstep: 5.75 | step_microstep: 47.79
-[2025-01-25 15:31:53,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.92 | bwd: 4611.15 | bwd_inner: 4605.20 | bwd_allreduce: 5.86 | step: 47.79
- 42%|████▏     | 2425/5800 [6:45:23<6:27:50,  6.89s/it]                                                       {'loss': 0.043, 'grad_norm': 8.420663833618164, 'learning_rate': 2.6173966631743627e-05, 'epoch': 20.91}
- 42%|████▏     | 2425/5800 [6:45:23<6:27:50,  6.89s/it]score1 tensor([[0.3945],
-        [0.4395],
-        [0.4316],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4707, 0.4082, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:00,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:32:00,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.16 | bwd_microstep: 4621.10 | bwd_inner_microstep: 4616.15 | bwd_allreduce_microstep: 4.87 | step_microstep: 42.06
-[2025-01-25 15:32:00,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.12 | bwd: 4621.13 | bwd_inner: 4616.15 | bwd_allreduce: 4.91 | step: 42.08
- 42%|████▏     | 2426/5800 [6:45:30<6:27:46,  6.90s/it]                                                       {'loss': 0.0244, 'grad_norm': 0.5072581768035889, 'learning_rate': 2.6163342999051457e-05, 'epoch': 20.91}
- 42%|████▏     | 2426/5800 [6:45:30<6:27:46,  6.90s/it]score1 tensor([[0.4375],
-        [0.5234],
-        [0.4727],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5820, 0.4668, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:06,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:32:06,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.20 | bwd_microstep: 4569.43 | bwd_inner_microstep: 4564.36 | bwd_allreduce_microstep: 4.97 | step_microstep: 41.47
-[2025-01-25 15:32:06,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.16 | bwd: 4569.45 | bwd_inner: 4564.36 | bwd_allreduce: 5.03 | step: 41.48
- 42%|████▏     | 2427/5800 [6:45:36<6:26:49,  6.88s/it]                                                       {'loss': 0.0166, 'grad_norm': 2.051966428756714, 'learning_rate': 2.6152717444521994e-05, 'epoch': 20.92}
- 42%|████▏     | 2427/5800 [6:45:36<6:26:49,  6.88s/it]score1 tensor([[0.4688],
-        [0.4473],
-        [0.4902],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4355, 0.5117, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:13,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 15:32:13,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.84 | bwd_microstep: 4614.78 | bwd_inner_microstep: 4609.88 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.63
-[2025-01-25 15:32:13,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.80 | bwd: 4614.80 | bwd_inner: 4609.88 | bwd_allreduce: 4.83 | step: 43.64
- 42%|████▏     | 2428/5800 [6:45:43<6:26:59,  6.89s/it]                                                       {'loss': 0.0371, 'grad_norm': 4.048180103302002, 'learning_rate': 2.6142089971468472e-05, 'epoch': 20.93}
- 42%|████▏     | 2428/5800 [6:45:43<6:26:59,  6.89s/it]score1 tensor([[0.4980],
-        [0.4023],
-        [0.4727],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4004, 0.4961, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:20,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 15:32:20,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.17 | bwd_microstep: 4626.60 | bwd_inner_microstep: 4621.27 | bwd_allreduce_microstep: 5.21 | step_microstep: 48.46
-[2025-01-25 15:32:20,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.14 | bwd: 4626.64 | bwd_inner: 4621.27 | bwd_allreduce: 5.27 | step: 48.47
- 42%|████▏     | 2429/5800 [6:45:50<6:27:20,  6.89s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.6282865405082703, 'learning_rate': 2.613146058320472e-05, 'epoch': 20.94}
- 42%|████▏     | 2429/5800 [6:45:50<6:27:20,  6.89s/it]score1 tensor([[0.5352],
-        [0.5195],
-        [0.4648],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5273, 0.5156, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:27,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:32:27,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4626.43 | bwd_inner_microstep: 4621.33 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.78
-[2025-01-25 15:32:27,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.95 | bwd: 4626.45 | bwd_inner: 4621.33 | bwd_allreduce: 5.05 | step: 42.79
- 42%|████▏     | 2430/5800 [6:45:57<6:27:22,  6.90s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.408186912536621, 'learning_rate': 2.6120829283045172e-05, 'epoch': 20.95}
- 42%|████▏     | 2430/5800 [6:45:57<6:27:22,  6.90s/it]score1 tensor([[0.3691],
-        [0.4160],
-        [0.4648],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.4297, 0.4473, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:34,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:32:34,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.65 | bwd_microstep: 4641.01 | bwd_inner_microstep: 4635.86 | bwd_allreduce_microstep: 5.05 | step_microstep: 41.99
-[2025-01-25 15:32:34,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.61 | bwd: 4641.03 | bwd_inner: 4635.86 | bwd_allreduce: 5.10 | step: 41.99
- 42%|████▏     | 2431/5800 [6:46:04<6:27:42,  6.90s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.5596683025360107, 'learning_rate': 2.6110196074304846e-05, 'epoch': 20.96}
- 42%|████▏     | 2431/5800 [6:46:04<6:27:42,  6.90s/it]score1 tensor([[0.6172],
-        [0.5664],
-        [0.5742],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.6016, 0.5781, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:41,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:32:41,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4633.26 | bwd_inner_microstep: 4628.14 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.09
-[2025-01-25 15:32:41,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4633.29 | bwd_inner: 4628.14 | bwd_allreduce: 5.06 | step: 45.10
- 42%|████▏     | 2432/5800 [6:46:11<6:27:51,  6.91s/it]                                                       {'loss': 0.0234, 'grad_norm': 4.011789798736572, 'learning_rate': 2.6099560960299366e-05, 'epoch': 20.97}
- 42%|████▏     | 2432/5800 [6:46:11<6:27:51,  6.91s/it]score1 tensor([[0.5938],
-        [0.5117],
-        [0.5039],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5742, 0.5117, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:48,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:32:48,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.30 | bwd_microstep: 4634.13 | bwd_inner_microstep: 4629.30 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.47
-[2025-01-25 15:32:48,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.27 | bwd: 4634.16 | bwd_inner: 4629.30 | bwd_allreduce: 4.78 | step: 42.48
- 42%|████▏     | 2433/5800 [6:46:18<6:27:50,  6.91s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.845397412776947, 'learning_rate': 2.6088923944344937e-05, 'epoch': 20.97}
- 42%|████▏     | 2433/5800 [6:46:18<6:27:50,  6.91s/it]score1 tensor([[0.7109],
-        [0.5273],
-        [0.4746],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.5195, 0.4551, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:32:55,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 15:32:55,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.61 | bwd_microstep: 4632.56 | bwd_inner_microstep: 4627.50 | bwd_allreduce_microstep: 4.98 | step_microstep: 42.00
-[2025-01-25 15:32:55,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.58 | bwd: 4632.59 | bwd_inner: 4627.50 | bwd_allreduce: 5.02 | step: 42.01
- 42%|████▏     | 2434/5800 [6:46:25<6:27:45,  6.91s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.64605712890625, 'learning_rate': 2.6078285029758378e-05, 'epoch': 20.98}
- 42%|████▏     | 2434/5800 [6:46:25<6:27:45,  6.91s/it]score1 tensor([[0.5156],
-        [0.4082],
-        [0.5234],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4043, 0.4941, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:33:02,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 15:33:02,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.54 | bwd_microstep: 4645.32 | bwd_inner_microstep: 4640.32 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.73
-[2025-01-25 15:33:02,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.51 | bwd: 4645.35 | bwd_inner: 4640.32 | bwd_allreduce: 4.94 | step: 43.74
- 42%|████▏     | 2435/5800 [6:46:32<6:27:59,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 8.254843711853027, 'learning_rate': 2.6067644219857072e-05, 'epoch': 20.99}
- 42%|████▏     | 2435/5800 [6:46:32<6:27:59,  6.92s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:33:06,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.37
-[2025-01-25 15:33:06,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 570.58 | bwd_microstep: 1222.29 | bwd_inner_microstep: 1216.51 | bwd_allreduce_microstep: 5.65 | step_microstep: 47.74
-[2025-01-25 15:33:06,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 570.56 | bwd: 1222.31 | bwd_inner: 1216.51 | bwd_allreduce: 5.72 | step: 47.74
- 42%|████▏     | 2436/5800 [6:46:36<5:48:13,  6.21s/it]                                                       {'loss': 0.0352, 'grad_norm': 9.722203254699707, 'learning_rate': 2.6057001517959015e-05, 'epoch': 21.0}
- 42%|████▏     | 2436/5800 [6:46:36<5:48:13,  6.21s/it][2025-01-25 15:33:11,618] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:33:21,766] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:33:32,028] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:33:41,948] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.6680],
-        [0.5352],
-        [0.5195],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4941, 0.4805, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:33:57,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 15:33:57,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.64 | bwd_microstep: 4597.42 | bwd_inner_microstep: 4592.55 | bwd_allreduce_microstep: 4.79 | step_microstep: 44.14
-[2025-01-25 15:33:57,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.61 | bwd: 4597.45 | bwd_inner: 4592.55 | bwd_allreduce: 4.84 | step: 44.14
- 42%|████▏     | 2437/5800 [6:47:27<18:11:02, 19.47s/it]                                                        {'loss': 0.0391, 'grad_norm': 8.83217716217041, 'learning_rate': 2.604635692738279e-05, 'epoch': 21.01}
- 42%|████▏     | 2437/5800 [6:47:27<18:11:02, 19.47s/it]score1 tensor([[0.6406],
-        [0.4961],
-        [0.6641],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4805, 0.6328, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:04,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 15:34:04,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.53 | bwd_microstep: 4579.10 | bwd_inner_microstep: 4575.09 | bwd_allreduce_microstep: 3.95 | step_microstep: 40.81
-[2025-01-25 15:34:04,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.49 | bwd: 4579.12 | bwd_inner: 4575.09 | bwd_allreduce: 3.98 | step: 40.82
- 42%|████▏     | 2438/5800 [6:47:34<14:38:30, 15.68s/it]                                                        {'loss': 0.0474, 'grad_norm': 8.914750099182129, 'learning_rate': 2.603571045144756e-05, 'epoch': 21.02}
- 42%|████▏     | 2438/5800 [6:47:34<14:38:30, 15.68s/it]score1 tensor([[0.6562],
-        [0.4570],
-        [0.3574],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.4395, 0.3809, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:10,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:34:10,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.68 | bwd_microstep: 4598.31 | bwd_inner_microstep: 4593.24 | bwd_allreduce_microstep: 4.96 | step_microstep: 53.71
-[2025-01-25 15:34:10,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.58 | bwd: 4598.34 | bwd_inner: 4593.23 | bwd_allreduce: 5.02 | step: 53.74
- 42%|████▏     | 2439/5800 [6:47:40<12:10:04, 13.03s/it]                                                        {'loss': 0.0229, 'grad_norm': 4.535844326019287, 'learning_rate': 2.6025062093473086e-05, 'epoch': 21.03}
- 42%|████▏     | 2439/5800 [6:47:40<12:10:04, 13.03s/it]score1 tensor([[0.4336],
-        [0.5195],
-        [0.4180],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5273, 0.4062, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:17,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 15:34:17,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.36 | bwd_microstep: 4593.53 | bwd_inner_microstep: 4588.56 | bwd_allreduce_microstep: 4.88 | step_microstep: 42.98
-[2025-01-25 15:34:17,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.31 | bwd: 4593.55 | bwd_inner: 4588.56 | bwd_allreduce: 4.93 | step: 42.98
- 42%|████▏     | 2440/5800 [6:47:47<10:26:03, 11.18s/it]                                                        {'loss': 0.021, 'grad_norm': 0.6919851899147034, 'learning_rate': 2.6014411856779704e-05, 'epoch': 21.03}
- 42%|████▏     | 2440/5800 [6:47:47<10:26:03, 11.18s/it]score1 tensor([[0.6289],
-        [0.3359],
-        [0.5430],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.3555, 0.4941, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:24,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 15:34:24,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.92 | bwd_microstep: 4552.52 | bwd_inner_microstep: 4547.64 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.80
-[2025-01-25 15:34:24,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.89 | bwd: 4552.54 | bwd_inner: 4547.64 | bwd_allreduce: 4.83 | step: 44.81
- 42%|████▏     | 2441/5800 [6:47:54<9:12:42,  9.87s/it]                                                        {'loss': 0.02, 'grad_norm': 2.540605306625366, 'learning_rate': 2.600375974468835e-05, 'epoch': 21.04}
- 42%|████▏     | 2441/5800 [6:47:54<9:12:42,  9.87s/it]score1 tensor([[0.4961],
-        [0.4102],
-        [0.4844],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4141, 0.4629, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:31,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 15:34:31,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.13 | bwd_microstep: 4613.23 | bwd_inner_microstep: 4607.63 | bwd_allreduce_microstep: 5.51 | step_microstep: 43.77
-[2025-01-25 15:34:31,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.10 | bwd: 4613.26 | bwd_inner: 4607.63 | bwd_allreduce: 5.56 | step: 43.78
- 42%|████▏     | 2442/5800 [6:48:01<8:22:21,  8.98s/it]                                                       {'loss': 0.019, 'grad_norm': 4.040040969848633, 'learning_rate': 2.599310576052053e-05, 'epoch': 21.05}
- 42%|████▏     | 2442/5800 [6:48:01<8:22:21,  8.98s/it]score1 tensor([[0.5469],
-        [0.5820],
-        [0.4180],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5781, 0.4492, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:38,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 15:34:38,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.68 | bwd_microstep: 4619.85 | bwd_inner_microstep: 4614.69 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.91
-[2025-01-25 15:34:38,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.64 | bwd: 4619.87 | bwd_inner: 4614.69 | bwd_allreduce: 5.11 | step: 44.92
- 42%|████▏     | 2443/5800 [6:48:08<7:47:12,  8.35s/it]                                                       {'loss': 0.0176, 'grad_norm': 3.9603936672210693, 'learning_rate': 2.5982449907598342e-05, 'epoch': 21.06}
- 42%|████▏     | 2443/5800 [6:48:08<7:47:12,  8.35s/it]score1 tensor([[0.6680],
-        [0.5352],
-        [0.6289],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5469, 0.6562, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:45,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:34:45,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.15 | bwd_microstep: 4619.19 | bwd_inner_microstep: 4614.29 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.20
-[2025-01-25 15:34:45,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.11 | bwd: 4619.23 | bwd_inner: 4614.29 | bwd_allreduce: 4.85 | step: 42.21
- 42%|████▏     | 2444/5800 [6:48:15<7:22:27,  7.91s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.2867045402526855, 'learning_rate': 2.5971792189244473e-05, 'epoch': 21.07}
- 42%|████▏     | 2444/5800 [6:48:15<7:22:27,  7.91s/it]score1 tensor([[0.5156],
-        [0.6328],
-        [0.5469],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6445, 0.5430, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:52,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 15:34:52,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.46 | bwd_microstep: 4548.20 | bwd_inner_microstep: 4543.31 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.35
-[2025-01-25 15:34:52,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.42 | bwd: 4548.22 | bwd_inner: 4543.31 | bwd_allreduce: 4.84 | step: 42.35
- 42%|█���██▏     | 2445/5800 [6:48:22<7:03:57,  7.58s/it]                                                       {'loss': 0.0195, 'grad_norm': 2.2458455562591553, 'learning_rate': 2.5961132608782178e-05, 'epoch': 21.08}
- 42%|████▏     | 2445/5800 [6:48:22<7:03:57,  7.58s/it]score1 tensor([[0.5312],
-        [0.5508],
-        [0.4629],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5195, 0.4473, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:34:58,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:34:58,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.63 | bwd_microstep: 4611.91 | bwd_inner_microstep: 4606.93 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.19
-[2025-01-25 15:34:58,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.60 | bwd: 4611.94 | bwd_inner: 4606.93 | bwd_allreduce: 4.93 | step: 42.19
- 42%|████▏     | 2446/5800 [6:48:28<6:52:10,  7.37s/it]                                                       {'loss': 0.0176, 'grad_norm': 8.284466743469238, 'learning_rate': 2.5950471169535304e-05, 'epoch': 21.09}
- 42%|████▏     | 2446/5800 [6:48:28<6:52:10,  7.37s/it]score1 tensor([[0.4746],
-        [0.6562],
-        [0.4629],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.6875, 0.4297, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:05,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 15:35:05,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.02 | bwd_microstep: 4610.19 | bwd_inner_microstep: 4605.09 | bwd_allreduce_microstep: 5.02 | step_microstep: 42.64
-[2025-01-25 15:35:05,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.99 | bwd: 4610.21 | bwd_inner: 4605.09 | bwd_allreduce: 5.06 | step: 42.65
- 42%|████▏     | 2447/5800 [6:48:35<6:43:48,  7.23s/it]                                                       {'loss': 0.0225, 'grad_norm': 3.8127832412719727, 'learning_rate': 2.5939807874828263e-05, 'epoch': 21.09}
- 42%|████▏     | 2447/5800 [6:48:35<6:43:48,  7.23s/it]score1 tensor([[0.5508],
-        [0.6289],
-        [0.4805],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.6133, 0.4922, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:12,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 15:35:12,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.91 | bwd_microstep: 4609.20 | bwd_inner_microstep: 4604.36 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.07
-[2025-01-25 15:35:12,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.88 | bwd: 4609.22 | bwd_inner: 4604.36 | bwd_allreduce: 4.79 | step: 42.08
- 42%|████▏     | 2448/5800 [6:48:42<6:37:54,  7.12s/it]                                                       {'loss': 0.0166, 'grad_norm': 3.718329429626465, 'learning_rate': 2.5929142727986065e-05, 'epoch': 21.1}
- 42%|████▏     | 2448/5800 [6:48:42<6:37:54,  7.12s/it]score1 tensor([[0.4375],
-        [0.6484],
-        [0.4863],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6055, 0.4609, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:19,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:35:19,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4616.10 | bwd_inner_microstep: 4611.17 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.79
-[2025-01-25 15:35:19,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.57 | bwd: 4616.12 | bwd_inner: 4611.17 | bwd_allreduce: 4.88 | step: 42.80
- 42%|████▏     | 2449/5800 [6:48:49<6:34:01,  7.05s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.311616897583008, 'learning_rate': 2.5918475732334286e-05, 'epoch': 21.11}
- 42%|████▏     | 2449/5800 [6:48:49<6:34:01,  7.05s/it]score1 tensor([[0.4551],
-        [0.3984],
-        [0.6094],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4648, 0.6406, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:26,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 15:35:26,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.57 | bwd_microstep: 4613.10 | bwd_inner_microstep: 4607.94 | bwd_allreduce_microstep: 5.03 | step_microstep: 42.17
-[2025-01-25 15:35:26,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.54 | bwd: 4613.13 | bwd_inner: 4607.94 | bwd_allreduce: 5.11 | step: 42.18
- 42%|████▏     | 2450/5800 [6:48:56<6:31:10,  7.01s/it]                                                       {'loss': 0.0435, 'grad_norm': 0.539354681968689, 'learning_rate': 2.5907806891199077e-05, 'epoch': 21.12}
- 42%|████▏     | 2450/5800 [6:48:56<6:31:10,  7.01s/it]score1 tensor([[0.4883],
-        [0.5117],
-        [0.6016],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4883, 0.5938, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:33,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 15:35:33,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.89 | bwd_microstep: 4616.84 | bwd_inner_microstep: 4611.94 | bwd_allreduce_microstep: 4.76 | step_microstep: 41.86
-[2025-01-25 15:35:33,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.84 | bwd: 4616.86 | bwd_inner: 4611.94 | bwd_allreduce: 4.85 | step: 41.87
- 42%|████▏     | 2451/5800 [6:49:03<6:29:14,  6.97s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.704510688781738, 'learning_rate': 2.589713620790717e-05, 'epoch': 21.13}
- 42%|████▏     | 2451/5800 [6:49:03<6:29:14,  6.97s/it]score1 tensor([[0.5938],
-        [0.4199],
-        [0.5000],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4473, 0.4492, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:40,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:35:40,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.51 | bwd_microstep: 4621.70 | bwd_inner_microstep: 4616.96 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.90
-[2025-01-25 15:35:40,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.47 | bwd: 4621.72 | bwd_inner: 4616.96 | bwd_allreduce: 4.70 | step: 41.91
- 42%|████▏     | 2452/5800 [6:49:10<6:27:51,  6.95s/it]                                                       {'loss': 0.0249, 'grad_norm': 4.5589470863342285, 'learning_rate': 2.5886463685785873e-05, 'epoch': 21.14}
- 42%|████▏     | 2452/5800 [6:49:10<6:27:51,  6.95s/it]score1 tensor([[0.4277],
-        [0.4258],
-        [0.4180],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4121, 0.3750, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:47,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:35:47,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.42 | bwd_microstep: 4620.81 | bwd_inner_microstep: 4615.99 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.56
-[2025-01-25 15:35:47,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.37 | bwd: 4620.83 | bwd_inner: 4615.99 | bwd_allreduce: 4.77 | step: 41.58
- 42%|████▏     | 2453/5800 [6:49:17<6:26:47,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 7.854931831359863, 'learning_rate': 2.5875789328163055e-05, 'epoch': 21.15}
- 42%|████▏     | 2453/5800 [6:49:17<6:26:47,  6.93s/it]score1 tensor([[0.4766],
-        [0.4258],
-        [0.4902],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3945, 0.5195, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:35:54,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:35:54,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.27 | bwd_microstep: 4620.65 | bwd_inner_microstep: 4615.75 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.83
-[2025-01-25 15:35:54,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.23 | bwd: 4620.67 | bwd_inner: 4615.75 | bwd_allreduce: 4.84 | step: 41.84
- 42%|████▏     | 2454/5800 [6:49:24<6:26:03,  6.92s/it]                                                       {'loss': 0.0474, 'grad_norm': 0.4105145037174225, 'learning_rate': 2.5865113138367172e-05, 'epoch': 21.16}
- 42%|████▏     | 2454/5800 [6:49:24<6:26:03,  6.92s/it]score1 tensor([[0.4277],
-        [0.4004],
-        [0.4082],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4336, 0.4141, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:01,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 15:36:01,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.74 | bwd_microstep: 4615.49 | bwd_inner_microstep: 4609.76 | bwd_allreduce_microstep: 5.60 | step_microstep: 42.50
-[2025-01-25 15:36:01,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.71 | bwd: 4615.52 | bwd_inner: 4609.76 | bwd_allreduce: 5.67 | step: 42.51
- 42%|████▏     | 2455/5800 [6:49:31<6:25:27,  6.91s/it]                                                       {'loss': 0.02, 'grad_norm': 7.727317810058594, 'learning_rate': 2.5854435119727247e-05, 'epoch': 21.16}
- 42%|████▏     | 2455/5800 [6:49:31<6:25:27,  6.91s/it]score1 tensor([[0.5508],
-        [0.4648],
-        [0.4277],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5195, 0.4434, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:07,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 15:36:07,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.30 | bwd_microstep: 4614.13 | bwd_inner_microstep: 4608.26 | bwd_allreduce_microstep: 5.70 | step_microstep: 46.09
-[2025-01-25 15:36:07,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.26 | bwd: 4614.16 | bwd_inner: 4608.26 | bwd_allreduce: 5.82 | step: 46.12
- 42%|████▏     | 2456/5800 [6:49:37<6:25:04,  6.91s/it]                                                       {'loss': 0.0459, 'grad_norm': 8.20026683807373, 'learning_rate': 2.584375527557286e-05, 'epoch': 21.17}
- 42%|████▏     | 2456/5800 [6:49:37<6:25:04,  6.91s/it]score1 tensor([[0.6250],
-        [0.4160],
-        [0.4883],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.3750, 0.4961, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:14,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:36:14,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4616.08 | bwd_inner_microstep: 4611.52 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.16
-[2025-01-25 15:36:14,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.54 | bwd: 4616.10 | bwd_inner: 4611.52 | bwd_allreduce: 4.49 | step: 42.17
- 42%|████▏     | 2457/5800 [6:49:44<6:24:50,  6.91s/it]                                                       {'loss': 0.0361, 'grad_norm': 0.7382791638374329, 'learning_rate': 2.5833073609234195e-05, 'epoch': 21.18}
- 42%|████▏     | 2457/5800 [6:49:44<6:24:50,  6.91s/it]score1 tensor([[0.6211],
-        [0.4023],
-        [0.3828],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4199, 0.3750, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:21,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 15:36:21,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.96 | bwd_microstep: 4624.39 | bwd_inner_microstep: 4619.02 | bwd_allreduce_microstep: 5.27 | step_microstep: 45.04
-[2025-01-25 15:36:21,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.93 | bwd: 4624.41 | bwd_inner: 4619.02 | bwd_allreduce: 5.32 | step: 45.06
- 42%|████▏     | 2458/5800 [6:49:51<6:24:32,  6.90s/it]                                                       {'loss': 0.022, 'grad_norm': 4.810057163238525, 'learning_rate': 2.5822390124041956e-05, 'epoch': 21.19}
- 42%|████▏     | 2458/5800 [6:49:51<6:24:32,  6.90s/it]score1 tensor([[0.6211],
-        [0.5039],
-        [0.4434],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5039, 0.4980, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:28,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 15:36:28,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.67 | bwd_microstep: 4576.04 | bwd_inner_microstep: 4571.30 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.40
-[2025-01-25 15:36:28,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.62 | bwd: 4576.06 | bwd_inner: 4571.30 | bwd_allreduce: 4.70 | step: 41.41
- 42%|████▏     | 2459/5800 [6:49:58<6:23:28,  6.89s/it]                                                       {'loss': 0.02, 'grad_norm': 2.5582807064056396, 'learning_rate': 2.581170482332745e-05, 'epoch': 21.2}
- 42%|████▏     | 2459/5800 [6:49:58<6:23:28,  6.89s/it]score1 tensor([[0.5078],
-        [0.3750],
-        [0.5195],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3398, 0.5664, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:35,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 15:36:35,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.06 | bwd_microstep: 4623.45 | bwd_inner_microstep: 4618.73 | bwd_allreduce_microstep: 4.65 | step_microstep: 41.74
-[2025-01-25 15:36:35,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.02 | bwd: 4623.48 | bwd_inner: 4618.73 | bwd_allreduce: 4.68 | step: 41.74
- 42%|████▏     | 2460/5800 [6:50:05<6:23:34,  6.89s/it]                                                       {'loss': 0.0283, 'grad_norm': 0.5888695120811462, 'learning_rate': 2.5801017710422537e-05, 'epoch': 21.21}
- 42%|████▏     | 2460/5800 [6:50:05<6:23:34,  6.89s/it]score1 tensor([[0.5547],
-        [0.4551],
-        [0.5391],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4844, 0.5469, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:42,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:36:42,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.26 | bwd_microstep: 4615.62 | bwd_inner_microstep: 4610.97 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.05
-[2025-01-25 15:36:42,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.21 | bwd: 4615.64 | bwd_inner: 4610.97 | bwd_allreduce: 4.61 | step: 41.06
- 42%|████▏     | 2461/5800 [6:50:12<6:23:30,  6.89s/it]                                                       {'loss': 0.0347, 'grad_norm': 8.427763938903809, 'learning_rate': 2.5790328788659647e-05, 'epoch': 21.22}
- 42%|████▏     | 2461/5800 [6:50:12<6:23:30,  6.89s/it]score1 tensor([[0.4414],
-        [0.5352],
-        [0.5234],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5508, 0.5195, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:49,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 15:36:49,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.01 | bwd_microstep: 4617.68 | bwd_inner_microstep: 4612.73 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.61
-[2025-01-25 15:36:49,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.97 | bwd: 4617.70 | bwd_inner: 4612.73 | bwd_allreduce: 4.90 | step: 43.61
- 42%|████▏     | 2462/5800 [6:50:19<6:23:33,  6.89s/it]                                                       {'loss': 0.0146, 'grad_norm': 3.95072078704834, 'learning_rate': 2.577963806137177e-05, 'epoch': 21.22}
- 42%|████▏     | 2462/5800 [6:50:19<6:23:33,  6.89s/it]score1 tensor([[0.4219],
-        [0.4258],
-        [0.5273],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4082, 0.5391, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:36:56,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 15:36:56,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4564.16 | bwd_inner_microstep: 4559.45 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.11
-[2025-01-25 15:36:56,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4564.19 | bwd_inner: 4559.45 | bwd_allreduce: 4.66 | step: 43.11
- 42%|████▏     | 2463/5800 [6:50:26<6:22:35,  6.88s/it]                                                       {'loss': 0.0181, 'grad_norm': 2.3959946632385254, 'learning_rate': 2.576894553189246e-05, 'epoch': 21.23}
- 42%|████▏     | 2463/5800 [6:50:26<6:22:35,  6.88s/it]score1 tensor([[0.6133],
-        [0.4863],
-        [0.4023],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5234, 0.4941, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:03,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 15:37:03,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.90 | bwd_microstep: 4616.78 | bwd_inner_microstep: 4612.10 | bwd_allreduce_microstep: 4.60 | step_microstep: 45.55
-[2025-01-25 15:37:03,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.87 | bwd: 4616.81 | bwd_inner: 4612.10 | bwd_allreduce: 4.64 | step: 45.56
- 42%|████▏     | 2464/5800 [6:50:32<6:22:47,  6.88s/it]                                                       {'loss': 0.043, 'grad_norm': 8.294062614440918, 'learning_rate': 2.5758251203555834e-05, 'epoch': 21.24}
- 42%|████▏     | 2464/5800 [6:50:32<6:22:47,  6.88s/it]score1 tensor([[0.5352],
-        [0.5703],
-        [0.5039],
-        [0.3438]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5781, 0.4707, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:09,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:37:09,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4619.40 | bwd_inner_microstep: 4614.27 | bwd_allreduce_microstep: 5.02 | step_microstep: 42.65
-[2025-01-25 15:37:09,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.56 | bwd: 4619.43 | bwd_inner: 4614.27 | bwd_allreduce: 5.07 | step: 42.66
- 42%|████▎     | 2465/5800 [6:50:39<6:22:49,  6.89s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.0133795738220215, 'learning_rate': 2.574755507969657e-05, 'epoch': 21.25}
- 42%|████▎     | 2465/5800 [6:50:39<6:22:49,  6.89s/it]score1 tensor([[0.4102],
-        [0.4844],
-        [0.6016],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4570, 0.6719, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:16,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 15:37:16,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.16 | bwd_microstep: 4575.58 | bwd_inner_microstep: 4570.89 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.86
-[2025-01-25 15:37:16,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.13 | bwd: 4575.61 | bwd_inner: 4570.89 | bwd_allreduce: 4.65 | step: 42.86
- 43%|████▎     | 2466/5800 [6:50:46<6:22:09,  6.88s/it]                                                       {'loss': 0.0322, 'grad_norm': 1.6770790815353394, 'learning_rate': 2.57368571636499e-05, 'epoch': 21.26}
- 43%|████▎     | 2466/5800 [6:50:46<6:22:09,  6.88s/it]score1 tensor([[0.4922],
-        [0.5898],
-        [0.3887],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5117, 0.2812, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:23,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:37:23,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.06 | bwd_microstep: 4621.28 | bwd_inner_microstep: 4616.76 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.28
-[2025-01-25 15:37:23,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.03 | bwd: 4621.30 | bwd_inner: 4616.76 | bwd_allreduce: 4.48 | step: 42.28
- 43%|████▎     | 2467/5800 [6:50:53<6:22:17,  6.88s/it]                                                       {'loss': 0.0625, 'grad_norm': 8.151591300964355, 'learning_rate': 2.572615745875162e-05, 'epoch': 21.27}
- 43%|████▎     | 2467/5800 [6:50:53<6:22:17,  6.88s/it]score1 tensor([[0.5586],
-        [0.5547],
-        [0.3652],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5664, 0.4043, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:30,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 15:37:30,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.10 | bwd_microstep: 4621.50 | bwd_inner_microstep: 4615.91 | bwd_allreduce_microstep: 5.50 | step_microstep: 43.62
-[2025-01-25 15:37:30,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.06 | bwd: 4621.52 | bwd_inner: 4615.91 | bwd_allreduce: 5.54 | step: 43.62
- 43%|████▎     | 2468/5800 [6:51:00<6:22:28,  6.89s/it]                                                       {'loss': 0.0371, 'grad_norm': 3.95207142829895, 'learning_rate': 2.5715455968338092e-05, 'epoch': 21.28}
- 43%|████▎     | 2468/5800 [6:51:00<6:22:28,  6.89s/it]score1 tensor([[0.5469],
-        [0.6523],
-        [0.4785],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6445, 0.4805, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:37,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:37:37,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4624.15 | bwd_inner_microstep: 4619.08 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.66
-[2025-01-25 15:37:37,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4624.17 | bwd_inner: 4619.08 | bwd_allreduce: 5.01 | step: 44.66
- 43%|████▎     | 2469/5800 [6:51:07<6:22:45,  6.89s/it]                                                       {'loss': 0.0122, 'grad_norm': 4.6965012550354, 'learning_rate': 2.570475269574622e-05, 'epoch': 21.28}
- 43%|████▎     | 2469/5800 [6:51:07<6:22:45,  6.89s/it]score1 tensor([[0.5156],
-        [0.4531],
-        [0.4160],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4180, 0.4277, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:44,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 15:37:44,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.83 | bwd_microstep: 4626.07 | bwd_inner_microstep: 4620.86 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.56
-[2025-01-25 15:37:44,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.79 | bwd: 4626.10 | bwd_inner: 4620.86 | bwd_allreduce: 5.14 | step: 43.57
- 43%|████▎     | 2470/5800 [6:51:14<6:22:49,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 4.237902641296387, 'learning_rate': 2.5694047644313474e-05, 'epoch': 21.29}
- 43%|████▎     | 2470/5800 [6:51:14<6:22:49,  6.90s/it]score1 tensor([[0.4492],
-        [0.5469],
-        [0.6250],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4453, 0.6289, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:51,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 15:37:51,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.73 | bwd_microstep: 4622.65 | bwd_inner_microstep: 4615.77 | bwd_allreduce_microstep: 6.78 | step_microstep: 42.74
-[2025-01-25 15:37:51,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.69 | bwd: 4622.67 | bwd_inner: 4615.77 | bwd_allreduce: 6.83 | step: 42.74
- 43%|████▎     | 2471/5800 [6:51:21<6:22:52,  6.90s/it]                                                       {'loss': 0.0288, 'grad_norm': 3.963146686553955, 'learning_rate': 2.5683340817377868e-05, 'epoch': 21.3}
- 43%|████▎     | 2471/5800 [6:51:21<6:22:52,  6.90s/it]score1 tensor([[0.4551],
-        [0.5117],
-        [0.4766],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4824, 0.4668, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:37:58,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 15:37:58,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.74 | bwd_microstep: 4619.37 | bwd_inner_microstep: 4613.19 | bwd_allreduce_microstep: 6.07 | step_microstep: 42.99
-[2025-01-25 15:37:58,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4619.39 | bwd_inner: 4613.19 | bwd_allreduce: 6.12 | step: 43.00
- 43%|████▎     | 2472/5800 [6:51:28<6:22:45,  6.90s/it]                                                       {'loss': 0.0156, 'grad_norm': 8.350332260131836, 'learning_rate': 2.567263221827798e-05, 'epoch': 21.31}
- 43%|████▎     | 2472/5800 [6:51:28<6:22:45,  6.90s/it]score1 tensor([[0.3809],
-        [0.4902],
-        [0.4219],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4961, 0.5664, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:05,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 15:38:05,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.32 | bwd_microstep: 4620.13 | bwd_inner_microstep: 4615.20 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.04
-[2025-01-25 15:38:05,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.28 | bwd: 4620.15 | bwd_inner: 4615.20 | bwd_allreduce: 4.89 | step: 43.05
- 43%|████▎     | 2473/5800 [6:51:35<6:22:29,  6.90s/it]                                                       {'loss': 0.0479, 'grad_norm': 3.9095137119293213, 'learning_rate': 2.566192185035294e-05, 'epoch': 21.32}
- 43%|████▎     | 2473/5800 [6:51:35<6:22:29,  6.90s/it]score1 tensor([[0.4688],
-        [0.5859],
-        [0.4297],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6016, 0.4570, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0454, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:11,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:38:11,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.41 | bwd_microstep: 4620.07 | bwd_inner_microstep: 4614.79 | bwd_allreduce_microstep: 5.19 | step_microstep: 41.31
-[2025-01-25 15:38:11,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.38 | bwd: 4620.10 | bwd_inner: 4614.79 | bwd_allreduce: 5.24 | step: 41.32
- 43%|████▎     | 2474/5800 [6:51:41<6:22:28,  6.90s/it]                                                       {'loss': 0.0454, 'grad_norm': 7.989767074584961, 'learning_rate': 2.5651209716942426e-05, 'epoch': 21.33}
- 43%|████▎     | 2474/5800 [6:51:41<6:22:28,  6.90s/it]score1 tensor([[0.3867],
-        [0.5195],
-        [0.4746],
-        [0.3633]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5469, 0.4629, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:18,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 15:38:18,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.10 | bwd_microstep: 4620.17 | bwd_inner_microstep: 4615.36 | bwd_allreduce_microstep: 4.71 | step_microstep: 45.25
-[2025-01-25 15:38:18,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.06 | bwd: 4620.19 | bwd_inner: 4615.36 | bwd_allreduce: 4.76 | step: 45.26
- 43%|████▎     | 2475/5800 [6:51:48<6:22:17,  6.90s/it]                                                       {'loss': 0.0278, 'grad_norm': 3.7650060653686523, 'learning_rate': 2.5640495821386666e-05, 'epoch': 21.34}
- 43%|████▎     | 2475/5800 [6:51:48<6:22:17,  6.90s/it]score1 tensor([[0.6484],
-        [0.6055],
-        [0.5312],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.6484, 0.5469, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:25,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 15:38:25,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.63 | bwd_microstep: 4622.02 | bwd_inner_microstep: 4616.86 | bwd_allreduce_microstep: 5.05 | step_microstep: 41.09
-[2025-01-25 15:38:25,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.59 | bwd: 4622.05 | bwd_inner: 4616.86 | bwd_allreduce: 5.10 | step: 41.10
- 43%|████▎     | 2476/5800 [6:51:55<6:22:06,  6.90s/it]                                                       {'loss': 0.0176, 'grad_norm': 8.741866111755371, 'learning_rate': 2.5629780167026432e-05, 'epoch': 21.34}
- 43%|████▎     | 2476/5800 [6:51:55<6:22:06,  6.90s/it]score1 tensor([[0.6602],
-        [0.4551],
-        [0.5352],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5156, 0.5352, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:32,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:38:32,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.83 | bwd_microstep: 4578.33 | bwd_inner_microstep: 4572.92 | bwd_allreduce_microstep: 5.31 | step_microstep: 45.27
-[2025-01-25 15:38:32,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.80 | bwd: 4578.36 | bwd_inner: 4572.92 | bwd_allreduce: 5.37 | step: 45.27
- 43%|████▎     | 2477/5800 [6:52:02<6:21:17,  6.88s/it]                                                       {'loss': 0.021, 'grad_norm': 1.940466046333313, 'learning_rate': 2.561906275720306e-05, 'epoch': 21.35}
- 43%|████▎     | 2477/5800 [6:52:02<6:21:17,  6.88s/it]score1 tensor([[0.4609],
-        [0.3770],
-        [0.5547],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4160, 0.6055, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:39,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 15:38:39,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.13 | bwd_microstep: 4625.32 | bwd_inner_microstep: 4620.43 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.22
-[2025-01-25 15:38:39,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.10 | bwd: 4625.35 | bwd_inner: 4620.43 | bwd_allreduce: 4.84 | step: 43.22
- 43%|████▎     | 2478/5800 [6:52:09<6:21:36,  6.89s/it]                                                       {'loss': 0.0386, 'grad_norm': 3.983058452606201, 'learning_rate': 2.560834359525842e-05, 'epoch': 21.36}
- 43%|████▎     | 2478/5800 [6:52:09<6:21:36,  6.89s/it]score1 tensor([[0.3789],
-        [0.5508],
-        [0.6562],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5391, 0.6133, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:46,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 15:38:46,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.79 | bwd_microstep: 4579.60 | bwd_inner_microstep: 4574.75 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.12
-[2025-01-25 15:38:46,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.76 | bwd: 4579.63 | bwd_inner: 4574.75 | bwd_allreduce: 4.80 | step: 43.13
- 43%|████▎     | 2479/5800 [6:52:16<6:21:01,  6.88s/it]                                                       {'loss': 0.0146, 'grad_norm': 6.359863758087158, 'learning_rate': 2.5597622684534935e-05, 'epoch': 21.37}
- 43%|████▎     | 2479/5800 [6:52:16<6:21:01,  6.88s/it]score1 tensor([[0.5703],
-        [0.4844],
-        [0.6094],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4629, 0.5625, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:38:53,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 15:38:53,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.64 | bwd_microstep: 4620.63 | bwd_inner_microstep: 4615.92 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.40
-[2025-01-25 15:38:53,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.61 | bwd: 4620.65 | bwd_inner: 4615.92 | bwd_allreduce: 4.67 | step: 42.41
- 43%|████▎     | 2480/5800 [6:52:23<6:21:11,  6.89s/it]                                                       {'loss': 0.0293, 'grad_norm': 8.39624309539795, 'learning_rate': 2.558690002837557e-05, 'epoch': 21.38}
- 43%|████▎     | 2480/5800 [6:52:23<6:21:11,  6.89s/it]score1 tensor([[0.4785],
-        [0.5859],
-        [0.4473],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.5312, 0.4277, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0439, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:00,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 15:39:00,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.65 | bwd_microstep: 4624.65 | bwd_inner_microstep: 4619.73 | bwd_allreduce_microstep: 4.84 | step_microstep: 41.65
-[2025-01-25 15:39:00,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4624.67 | bwd_inner: 4619.73 | bwd_allreduce: 4.87 | step: 41.65
- 43%|████▎     | 2481/5800 [6:52:30<6:21:20,  6.89s/it]                                                       {'loss': 0.0439, 'grad_norm': 8.092316627502441, 'learning_rate': 2.557617563012384e-05, 'epoch': 21.39}
- 43%|████▎     | 2481/5800 [6:52:30<6:21:20,  6.89s/it]score1 tensor([[0.5273],
-        [0.4492],
-        [0.4668],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4258, 0.4551, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:07,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:39:07,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.23 | bwd_microstep: 4624.48 | bwd_inner_microstep: 4619.75 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.40
-[2025-01-25 15:39:07,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.18 | bwd: 4624.50 | bwd_inner: 4619.75 | bwd_allreduce: 4.69 | step: 42.43
- 43%|████▎     | 2482/5800 [6:52:37<6:21:15,  6.89s/it]                                                       {'loss': 0.019, 'grad_norm': 3.6265063285827637, 'learning_rate': 2.5565449493123783e-05, 'epoch': 21.4}
- 43%|████▎     | 2482/5800 [6:52:37<6:21:15,  6.89s/it]score1 tensor([[0.4375],
-        [0.6328],
-        [0.5820],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.6875, 0.5273, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:13,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:39:13,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.20 | bwd_microstep: 4625.29 | bwd_inner_microstep: 4620.72 | bwd_allreduce_microstep: 4.49 | step_microstep: 45.64
-[2025-01-25 15:39:13,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.16 | bwd: 4625.31 | bwd_inner: 4620.72 | bwd_allreduce: 4.53 | step: 45.65
- 43%|████▎     | 2483/5800 [6:52:43<6:21:15,  6.90s/it]                                                       {'loss': 0.0391, 'grad_norm': 3.992043972015381, 'learning_rate': 2.5554721620720013e-05, 'epoch': 21.41}
- 43%|████▎     | 2483/5800 [6:52:43<6:21:15,  6.90s/it]score1 tensor([[0.5273],
-        [0.4336],
-        [0.6758],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4277, 0.6367, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:20,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 15:39:20,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.72 | bwd_microstep: 4617.74 | bwd_inner_microstep: 4613.27 | bwd_allreduce_microstep: 4.39 | step_microstep: 41.66
-[2025-01-25 15:39:20,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.67 | bwd: 4617.77 | bwd_inner: 4613.27 | bwd_allreduce: 4.43 | step: 41.67
- 43%|████▎     | 2484/5800 [6:52:50<6:21:03,  6.89s/it]                                                       {'loss': 0.0347, 'grad_norm': 4.629451751708984, 'learning_rate': 2.5543992016257652e-05, 'epoch': 21.41}
- 43%|████▎     | 2484/5800 [6:52:50<6:21:03,  6.89s/it]score1 tensor([[0.3555],
-        [0.4004],
-        [0.3867],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.3750, 0.4219, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:27,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 15:39:27,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.87 | bwd_microstep: 4615.32 | bwd_inner_microstep: 4610.61 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.16
-[2025-01-25 15:39:27,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.83 | bwd: 4615.34 | bwd_inner: 4610.61 | bwd_allreduce: 4.67 | step: 42.18
- 43%|████▎     | 2485/5800 [6:52:57<6:20:54,  6.89s/it]                                                       {'loss': 0.0239, 'grad_norm': 0.6785792112350464, 'learning_rate': 2.553326068308239e-05, 'epoch': 21.42}
- 43%|████▎     | 2485/5800 [6:52:57<6:20:54,  6.89s/it]score1 tensor([[0.6172],
-        [0.3965],
-        [0.5195],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.3457, 0.5391, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:34,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:39:34,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.33 | bwd_microstep: 4618.78 | bwd_inner_microstep: 4614.17 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.77
-[2025-01-25 15:39:34,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.29 | bwd: 4618.80 | bwd_inner: 4614.17 | bwd_allreduce: 4.55 | step: 42.77
- 43%|████▎     | 2486/5800 [6:53:04<6:20:52,  6.90s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.5781426429748535, 'learning_rate': 2.5522527624540434e-05, 'epoch': 21.43}
- 43%|████▎     | 2486/5800 [6:53:04<6:20:52,  6.90s/it]score1 tensor([[0.5469],
-        [0.5430],
-        [0.5352],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5547, 0.4941, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:41,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:39:41,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.96 | bwd_microstep: 4623.60 | bwd_inner_microstep: 4618.71 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.08
-[2025-01-25 15:39:41,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.92 | bwd: 4623.62 | bwd_inner: 4618.71 | bwd_allreduce: 4.85 | step: 42.09
- 43%|████▎     | 2487/5800 [6:53:11<6:20:45,  6.90s/it]                                                       {'loss': 0.0225, 'grad_norm': 0.5016502141952515, 'learning_rate': 2.5511792843978543e-05, 'epoch': 21.44}
- 43%|████▎     | 2487/5800 [6:53:11<6:20:45,  6.90s/it]score1 tensor([[0.4473],
-        [0.6172],
-        [0.4199],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.6484, 0.4238, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:48,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:39:48,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.59 | bwd_microstep: 4623.69 | bwd_inner_microstep: 4618.97 | bwd_allreduce_microstep: 4.63 | step_microstep: 46.38
-[2025-01-25 15:39:48,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.55 | bwd: 4623.71 | bwd_inner: 4618.97 | bwd_allreduce: 4.68 | step: 46.39
- 43%|████▎     | 2488/5800 [6:53:18<6:20:54,  6.90s/it]                                                       {'loss': 0.0137, 'grad_norm': 8.085935592651367, 'learning_rate': 2.5501056344743997e-05, 'epoch': 21.45}
- 43%|████▎     | 2488/5800 [6:53:18<6:20:54,  6.90s/it]score1 tensor([[0.5312],
-        [0.5195],
-        [0.5312],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5391, 0.5352, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:39:55,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:39:55,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.83 | bwd_microstep: 4625.67 | bwd_inner_microstep: 4620.64 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.55
-[2025-01-25 15:39:55,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.79 | bwd: 4625.69 | bwd_inner: 4620.64 | bwd_allreduce: 4.99 | step: 42.56
- 43%|████▎     | 2489/5800 [6:53:25<6:20:45,  6.90s/it]                                                       {'loss': 0.0156, 'grad_norm': 8.461738586425781, 'learning_rate': 2.5490318130184635e-05, 'epoch': 21.46}
- 43%|████▎     | 2489/5800 [6:53:25<6:20:45,  6.90s/it]score1 tensor([[0.3418],
-        [0.4375],
-        [0.5508],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.4316, 0.5664, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:02,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 15:40:02,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.76 | bwd_microstep: 4614.29 | bwd_inner_microstep: 4609.16 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.57
-[2025-01-25 15:40:02,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.73 | bwd: 4614.31 | bwd_inner: 4609.16 | bwd_allreduce: 5.07 | step: 43.58
- 43%|████▎     | 2490/5800 [6:53:32<6:20:24,  6.90s/it]                                                       {'loss': 0.0239, 'grad_norm': 0.8229027390480042, 'learning_rate': 2.5479578203648824e-05, 'epoch': 21.47}
- 43%|████▎     | 2490/5800 [6:53:32<6:20:24,  6.90s/it]score1 tensor([[0.4492],
-        [0.6016],
-        [0.2910],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6094, 0.1787, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0505, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:09,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 15:40:09,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.96 | bwd_microstep: 4621.95 | bwd_inner_microstep: 4617.12 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.68
-[2025-01-25 15:40:09,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.92 | bwd: 4621.98 | bwd_inner: 4617.12 | bwd_allreduce: 4.78 | step: 42.69
- 43%|████▎     | 2491/5800 [6:53:39<6:20:18,  6.90s/it]                                                       {'loss': 0.0505, 'grad_norm': 4.4578351974487305, 'learning_rate': 2.546883656848544e-05, 'epoch': 21.47}
- 43%|████▎     | 2491/5800 [6:53:39<6:20:18,  6.90s/it]score1 tensor([[0.4961],
-        [0.5078],
-        [0.5547],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5117, 0.5469, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:16,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 15:40:16,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4638.04 | bwd_inner_microstep: 4633.22 | bwd_allreduce_microstep: 4.72 | step_microstep: 40.89
-[2025-01-25 15:40:16,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4638.07 | bwd_inner: 4633.22 | bwd_allreduce: 4.77 | step: 40.90
- 43%|████▎     | 2492/5800 [6:53:46<6:20:24,  6.90s/it]                                                       {'loss': 0.02, 'grad_norm': 0.5999893546104431, 'learning_rate': 2.5458093228043926e-05, 'epoch': 21.48}
- 43%|████▎     | 2492/5800 [6:53:46<6:20:24,  6.90s/it]score1 tensor([[0.5430],
-        [0.6289],
-        [0.6328],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6211, 0.6133, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:22,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 15:40:22,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.77 | bwd_microstep: 4615.44 | bwd_inner_microstep: 4610.70 | bwd_allreduce_microstep: 4.67 | step_microstep: 41.28
-[2025-01-25 15:40:22,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.71 | bwd: 4615.46 | bwd_inner: 4610.70 | bwd_allreduce: 4.70 | step: 41.29
- 43%|████▎     | 2493/5800 [6:53:52<6:20:09,  6.90s/it]                                                       {'loss': 0.0127, 'grad_norm': 1.1245135068893433, 'learning_rate': 2.544734818567423e-05, 'epoch': 21.49}
- 43%|████▎     | 2493/5800 [6:53:52<6:20:09,  6.90s/it]score1 tensor([[0.4570],
-        [0.4531],
-        [0.5742],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4941, 0.5586, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:29,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 15:40:29,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.34 | bwd_microstep: 4622.45 | bwd_inner_microstep: 4617.83 | bwd_allreduce_microstep: 4.55 | step_microstep: 41.62
-[2025-01-25 15:40:29,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.31 | bwd: 4622.48 | bwd_inner: 4617.83 | bwd_allreduce: 4.58 | step: 41.62
- 43%|████▎     | 2494/5800 [6:53:59<6:20:03,  6.90s/it]                                                       {'loss': 0.0435, 'grad_norm': 0.7801372408866882, 'learning_rate': 2.5436601444726862e-05, 'epoch': 21.5}
- 43%|████▎     | 2494/5800 [6:53:59<6:20:03,  6.90s/it]score1 tensor([[0.5234],
-        [0.4668],
-        [0.5547],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4512, 0.5391, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:36,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 15:40:36,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.83 | bwd_microstep: 4616.14 | bwd_inner_microstep: 4611.01 | bwd_allreduce_microstep: 5.02 | step_microstep: 41.30
-[2025-01-25 15:40:36,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.79 | bwd: 4616.17 | bwd_inner: 4611.01 | bwd_allreduce: 5.08 | step: 41.31
- 43%|████▎     | 2495/5800 [6:54:06<6:19:50,  6.90s/it]                                                       {'loss': 0.0303, 'grad_norm': 4.276058673858643, 'learning_rate': 2.5425853008552822e-05, 'epoch': 21.51}
- 43%|████▎     | 2495/5800 [6:54:06<6:19:50,  6.90s/it]score1 tensor([[0.5000],
-        [0.5508],
-        [0.4551],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5664, 0.4414, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:43,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 15:40:43,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.20 | bwd_microstep: 4616.02 | bwd_inner_microstep: 4611.32 | bwd_allreduce_microstep: 4.61 | step_microstep: 41.71
-[2025-01-25 15:40:43,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.17 | bwd: 4616.05 | bwd_inner: 4611.33 | bwd_allreduce: 4.65 | step: 41.72
- 43%|████▎     | 2496/5800 [6:54:13<6:19:40,  6.89s/it]                                                       {'loss': 0.0186, 'grad_norm': 4.110388278961182, 'learning_rate': 2.541510288050367e-05, 'epoch': 21.52}
- 43%|████▎     | 2496/5800 [6:54:13<6:19:40,  6.89s/it]score1 tensor([[0.4316],
-        [0.6094],
-        [0.5078],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5547, 0.4883, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:50,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 15:40:50,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4618.61 | bwd_inner_microstep: 4613.82 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.30
-[2025-01-25 15:40:50,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.60 | bwd: 4618.63 | bwd_inner: 4613.82 | bwd_allreduce: 4.74 | step: 43.31
- 43%|████▎     | 2497/5800 [6:54:20<6:19:33,  6.89s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.289310932159424, 'learning_rate': 2.5404351063931478e-05, 'epoch': 21.53}
- 43%|████▎     | 2497/5800 [6:54:20<6:19:33,  6.89s/it]score1 tensor([[0.6641],
-        [0.5273],
-        [0.4531],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.5156, 0.4434, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:40:57,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 15:40:57,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.32 | bwd_microstep: 4628.97 | bwd_inner_microstep: 4622.24 | bwd_allreduce_microstep: 6.62 | step_microstep: 43.37
-[2025-01-25 15:40:57,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.28 | bwd: 4628.99 | bwd_inner: 4622.24 | bwd_allreduce: 6.68 | step: 43.38
- 43%|████▎     | 2498/5800 [6:54:27<6:19:34,  6.90s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.615544319152832, 'learning_rate': 2.539359756218885e-05, 'epoch': 21.53}
- 43%|████▎     | 2498/5800 [6:54:27<6:19:34,  6.90s/it]score1 tensor([[0.4160],
-        [0.3477],
-        [0.5078],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.3438, 0.4297, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:04,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 15:41:04,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4616.66 | bwd_inner_microstep: 4611.70 | bwd_allreduce_microstep: 4.85 | step_microstep: 49.52
-[2025-01-25 15:41:04,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.69 | bwd: 4616.68 | bwd_inner: 4611.70 | bwd_allreduce: 4.90 | step: 49.53
- 43%|████▎     | 2499/5800 [6:54:34<6:19:52,  6.90s/it]                                                       {'loss': 0.0342, 'grad_norm': 3.414027452468872, 'learning_rate': 2.5382842378628917e-05, 'epoch': 21.54}
- 43%|████▎     | 2499/5800 [6:54:34<6:19:52,  6.90s/it]score1 tensor([[0.5508],
-        [0.5117],
-        [0.4160],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5312, 0.4238, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:11,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 15:41:11,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4570.56 | bwd_inner_microstep: 4565.42 | bwd_allreduce_microstep: 4.99 | step_microstep: 45.03
-[2025-01-25 15:41:11,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.81 | bwd: 4570.58 | bwd_inner: 4565.42 | bwd_allreduce: 5.09 | step: 45.04
- 43%|████▎     | 2500/5800 [6:54:41<6:18:46,  6.89s/it]                                                       {'loss': 0.0093, 'grad_norm': 5.877349853515625, 'learning_rate': 2.5372085516605333e-05, 'epoch': 21.55}
- 43%|████▎     | 2500/5800 [6:54:41<6:18:46,  6.89s/it]score1 tensor([[0.3750],
-        [0.4922],
-        [0.5625],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.5781, 0.6406, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:18,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 15:41:18,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.69 | bwd_microstep: 4621.11 | bwd_inner_microstep: 4616.26 | bwd_allreduce_microstep: 4.76 | step_microstep: 45.06
-[2025-01-25 15:41:18,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.65 | bwd: 4621.13 | bwd_inner: 4616.26 | bwd_allreduce: 4.80 | step: 45.07
- 43%|████▎     | 2501/5800 [6:54:48<6:18:55,  6.89s/it]                                                       {'loss': 0.0479, 'grad_norm': 4.435573101043701, 'learning_rate': 2.5361326979472266e-05, 'epoch': 21.56}
- 43%|████▎     | 2501/5800 [6:54:48<6:18:55,  6.89s/it]score1 tensor([[0.4629],
-        [0.4102],
-        [0.4551],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4570, 0.4492, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:25,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 15:41:25,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.01 | bwd_microstep: 4616.24 | bwd_inner_microstep: 4611.49 | bwd_allreduce_microstep: 4.68 | step_microstep: 41.79
-[2025-01-25 15:41:25,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.96 | bwd: 4616.26 | bwd_inner: 4611.49 | bwd_allreduce: 4.71 | step: 41.80
- 43%|████▎     | 2502/5800 [6:54:54<6:18:53,  6.89s/it]                                                       {'loss': 0.0366, 'grad_norm': 3.723655939102173, 'learning_rate': 2.5350566770584423e-05, 'epoch': 21.57}
- 43%|████▎     | 2502/5800 [6:54:54<6:18:53,  6.89s/it]score1 tensor([[0.6133],
-        [0.4707],
-        [0.4102],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4805, 0.4180, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:31,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.82 | optimizer_step: 4.36
-[2025-01-25 15:41:31,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.70 | bwd_microstep: 4625.85 | bwd_inner_microstep: 4619.48 | bwd_allreduce_microstep: 6.23 | step_microstep: 50.06
-[2025-01-25 15:41:31,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.67 | bwd: 4625.88 | bwd_inner: 4619.48 | bwd_allreduce: 6.31 | step: 50.07
- 43%|████▎     | 2503/5800 [6:55:01<6:19:02,  6.90s/it]                                                       {'loss': 0.0093, 'grad_norm': 8.141084671020508, 'learning_rate': 2.5339804893297015e-05, 'epoch': 21.58}
- 43%|████▎     | 2503/5800 [6:55:01<6:19:02,  6.90s/it]score1 tensor([[0.5547],
-        [0.4727],
-        [0.4668],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4980, 0.4492, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:38,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 15:41:38,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.60 | bwd_microstep: 4616.24 | bwd_inner_microstep: 4611.49 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.34
-[2025-01-25 15:41:38,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.56 | bwd: 4616.27 | bwd_inner: 4611.49 | bwd_allreduce: 4.71 | step: 42.35
- 43%|████▎     | 2504/5800 [6:55:08<6:18:48,  6.90s/it]                                                       {'loss': 0.0312, 'grad_norm': 4.312799453735352, 'learning_rate': 2.5329041350965794e-05, 'epoch': 21.59}
- 43%|████▎     | 2504/5800 [6:55:08<6:18:48,  6.90s/it]score1 tensor([[0.4902],
-        [0.4688],
-        [0.4707],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4844, 0.4961, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:45,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 15:41:45,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.06 | bwd_microstep: 4621.99 | bwd_inner_microstep: 4617.13 | bwd_allreduce_microstep: 4.78 | step_microstep: 42.30
-[2025-01-25 15:41:45,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.03 | bwd: 4622.03 | bwd_inner: 4617.13 | bwd_allreduce: 4.81 | step: 42.31
- 43%|████▎     | 2505/5800 [6:55:15<6:18:39,  6.90s/it]                                                       {'loss': 0.022, 'grad_norm': 7.984435081481934, 'learning_rate': 2.5318276146947005e-05, 'epoch': 21.59}
- 43%|████▎     | 2505/5800 [6:55:15<6:18:39,  6.90s/it]score1 tensor([[0.4590],
-        [0.4609],
-        [0.5273],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4453, 0.5430, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:52,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 15:41:52,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.55 | bwd_microstep: 4614.75 | bwd_inner_microstep: 4609.96 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.63
-[2025-01-25 15:41:52,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.50 | bwd: 4614.77 | bwd_inner: 4609.96 | bwd_allreduce: 4.75 | step: 43.64
- 43%|████▎     | 2506/5800 [6:55:22<6:18:29,  6.89s/it]                                                       {'loss': 0.0176, 'grad_norm': 3.6581790447235107, 'learning_rate': 2.5307509284597442e-05, 'epoch': 21.6}
- 43%|████▎     | 2506/5800 [6:55:22<6:18:29,  6.89s/it]score1 tensor([[0.5508],
-        [0.6367],
-        [0.6016],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.7031, 0.5508, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:41:59,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 15:41:59,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.80 | bwd_microstep: 4616.39 | bwd_inner_microstep: 4611.52 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.79
-[2025-01-25 15:41:59,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.76 | bwd: 4616.41 | bwd_inner: 4611.52 | bwd_allreduce: 4.82 | step: 42.80
- 43%|████▎     | 2507/5800 [6:55:29<6:18:29,  6.90s/it]                                                       {'loss': 0.0332, 'grad_norm': 4.318027496337891, 'learning_rate': 2.5296740767274388e-05, 'epoch': 21.61}
- 43%|████▎     | 2507/5800 [6:55:29<6:18:29,  6.90s/it]score1 tensor([[0.5195],
-        [0.5664],
-        [0.6250],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5547, 0.6172, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:06,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 15:42:06,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.41 | bwd_microstep: 4564.87 | bwd_inner_microstep: 4560.18 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.55
-[2025-01-25 15:42:06,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.37 | bwd: 4564.89 | bwd_inner: 4560.18 | bwd_allreduce: 4.65 | step: 42.55
- 43%|████▎     | 2508/5800 [6:55:36<6:17:27,  6.88s/it]                                                       {'loss': 0.0166, 'grad_norm': 6.995848178863525, 'learning_rate': 2.5285970598335654e-05, 'epoch': 21.62}
- 43%|████▎     | 2508/5800 [6:55:36<6:17:27,  6.88s/it]score1 tensor([[0.4375],
-        [0.5898],
-        [0.4512],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5430, 0.4180, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:13,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 15:42:13,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.39 | bwd_microstep: 4613.72 | bwd_inner_microstep: 4609.07 | bwd_allreduce_microstep: 4.56 | step_microstep: 43.69
-[2025-01-25 15:42:13,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.34 | bwd: 4613.74 | bwd_inner: 4609.07 | bwd_allreduce: 4.60 | step: 43.70
- 43%|████▎     | 2509/5800 [6:55:43<6:17:29,  6.88s/it]                                                       {'loss': 0.0352, 'grad_norm': 0.5818108320236206, 'learning_rate': 2.5275198781139567e-05, 'epoch': 21.63}
- 43%|████▎     | 2509/5800 [6:55:43<6:17:29,  6.88s/it]score1 tensor([[0.4570],
-        [0.4648],
-        [0.4473],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4180, 0.4473, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:20,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 15:42:20,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4575.56 | bwd_inner_microstep: 4570.03 | bwd_allreduce_microstep: 5.41 | step_microstep: 42.38
-[2025-01-25 15:42:20,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4575.59 | bwd_inner: 4570.03 | bwd_allreduce: 5.48 | step: 42.38
- 43%|████▎     | 2510/5800 [6:55:50<6:16:50,  6.87s/it]                                                       {'loss': 0.02, 'grad_norm': 2.386457681655884, 'learning_rate': 2.5264425319044968e-05, 'epoch': 21.64}
- 43%|████▎     | 2510/5800 [6:55:50<6:16:50,  6.87s/it]score1 tensor([[0.5430],
-        [0.5977],
-        [0.6719],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5742, 0.6953, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:26,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 15:42:26,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.08 | bwd_microstep: 4633.20 | bwd_inner_microstep: 4628.43 | bwd_allreduce_microstep: 4.67 | step_microstep: 43.23
-[2025-01-25 15:42:26,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.05 | bwd: 4633.24 | bwd_inner: 4628.43 | bwd_allreduce: 4.72 | step: 43.23
- 43%|████▎     | 2511/5800 [6:55:56<6:17:20,  6.88s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.014283180236816, 'learning_rate': 2.5253650215411214e-05, 'epoch': 21.65}
- 43%|████▎     | 2511/5800 [6:55:56<6:17:20,  6.88s/it]score1 tensor([[0.5039],
-        [0.3574],
-        [0.4883],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.3223, 0.5117, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:33,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 15:42:33,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.86 | bwd_microstep: 4620.32 | bwd_inner_microstep: 4615.19 | bwd_allreduce_microstep: 5.04 | step_microstep: 44.27
-[2025-01-25 15:42:33,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.82 | bwd: 4620.34 | bwd_inner: 4615.19 | bwd_allreduce: 5.08 | step: 44.28
- 43%|████▎     | 2512/5800 [6:56:03<6:17:30,  6.89s/it]                                                       {'loss': 0.0322, 'grad_norm': 0.562836229801178, 'learning_rate': 2.5242873473598165e-05, 'epoch': 21.66}
- 43%|████▎     | 2512/5800 [6:56:03<6:17:30,  6.89s/it]score1 tensor([[0.5820],
-        [0.6172],
-        [0.5352],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6055, 0.6484, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:40,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 15:42:40,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.88 | bwd_microstep: 4620.23 | bwd_inner_microstep: 4615.30 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.16
-[2025-01-25 15:42:40,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.85 | bwd: 4620.26 | bwd_inner: 4615.30 | bwd_allreduce: 4.89 | step: 44.17
- 43%|████▎     | 2513/5800 [6:56:10<6:17:43,  6.89s/it]                                                       {'loss': 0.0405, 'grad_norm': 0.4507313370704651, 'learning_rate': 2.523209509696619e-05, 'epoch': 21.66}
- 43%|████▎     | 2513/5800 [6:56:10<6:17:43,  6.89s/it]score1 tensor([[0.5039],
-        [0.4102],
-        [0.4805],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.3887, 0.5000, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:47,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 15:42:47,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.10 | bwd_microstep: 4619.32 | bwd_inner_microstep: 4614.50 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.73
-[2025-01-25 15:42:47,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.07 | bwd: 4619.34 | bwd_inner: 4614.50 | bwd_allreduce: 4.78 | step: 42.73
- 43%|████▎     | 2514/5800 [6:56:17<6:17:35,  6.89s/it]                                                       {'loss': 0.0181, 'grad_norm': 4.389468193054199, 'learning_rate': 2.5221315088876175e-05, 'epoch': 21.67}
- 43%|████▎     | 2514/5800 [6:56:17<6:17:35,  6.89s/it]score1 tensor([[0.4629],
-        [0.5742],
-        [0.4414],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6172, 0.4512, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:42:54,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 15:42:54,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.91 | bwd_microstep: 4621.92 | bwd_inner_microstep: 4616.41 | bwd_allreduce_microstep: 5.40 | step_microstep: 47.86
-[2025-01-25 15:42:54,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.87 | bwd: 4621.95 | bwd_inner: 4616.41 | bwd_allreduce: 5.45 | step: 47.87
- 43%|████▎     | 2515/5800 [6:56:24<6:17:45,  6.90s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.1436448097229, 'learning_rate': 2.5210533452689524e-05, 'epoch': 21.68}
- 43%|████▎     | 2515/5800 [6:56:24<6:17:45,  6.90s/it]score1 tensor([[0.4316],
-        [0.3887],
-        [0.5781],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.3789, 0.6172, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:01,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 15:43:01,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.79 | bwd_microstep: 4625.93 | bwd_inner_microstep: 4620.31 | bwd_allreduce_microstep: 5.53 | step_microstep: 45.44
-[2025-01-25 15:43:01,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.72 | bwd: 4625.95 | bwd_inner: 4620.31 | bwd_allreduce: 5.57 | step: 45.45
- 43%|████▎     | 2516/5800 [6:56:31<6:17:51,  6.90s/it]                                                       {'loss': 0.0322, 'grad_norm': 1.1581002473831177, 'learning_rate': 2.519975019176813e-05, 'epoch': 21.69}
- 43%|████▎     | 2516/5800 [6:56:31<6:17:51,  6.90s/it]score1 tensor([[0.5156],
-        [0.4883],
-        [0.5078],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5391, 0.5312, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:08,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 15:43:08,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.89 | bwd_microstep: 4628.33 | bwd_inner_microstep: 4622.75 | bwd_allreduce_microstep: 5.46 | step_microstep: 43.77
-[2025-01-25 15:43:08,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.82 | bwd: 4628.35 | bwd_inner: 4622.75 | bwd_allreduce: 5.52 | step: 43.79
- 43%|████▎     | 2517/5800 [6:56:38<6:17:58,  6.91s/it]                                                       {'loss': 0.0273, 'grad_norm': 0.5001680254936218, 'learning_rate': 2.5188965309474404e-05, 'epoch': 21.7}
- 43%|████▎     | 2517/5800 [6:56:38<6:17:58,  6.91s/it]score1 tensor([[0.4980],
-        [0.3320],
-        [0.3906],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.3691, 0.3652, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:15,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 15:43:15,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.26 | bwd_microstep: 4621.15 | bwd_inner_microstep: 4615.82 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.06
-[2025-01-25 15:43:15,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.22 | bwd: 4621.18 | bwd_inner: 4615.82 | bwd_allreduce: 5.28 | step: 45.07
- 43%|████▎     | 2518/5800 [6:56:45<6:17:43,  6.91s/it]                                                       {'loss': 0.0298, 'grad_norm': 0.4753868877887726, 'learning_rate': 2.5178178809171258e-05, 'epoch': 21.71}
- 43%|████▎     | 2518/5800 [6:56:45<6:17:43,  6.91s/it]score1 tensor([[0.5312],
-        [0.4727],
-        [0.5469],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5117, 0.4863, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:22,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 15:43:22,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4628.88 | bwd_inner_microstep: 4623.31 | bwd_allreduce_microstep: 5.43 | step_microstep: 48.38
-[2025-01-25 15:43:22,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.57 | bwd: 4628.90 | bwd_inner: 4623.31 | bwd_allreduce: 5.51 | step: 48.39
- 43%|████▎     | 2519/5800 [6:56:52<6:17:48,  6.91s/it]                                                       {'loss': 0.0298, 'grad_norm': 4.426032066345215, 'learning_rate': 2.516739069422211e-05, 'epoch': 21.72}
- 43%|████▎     | 2519/5800 [6:56:52<6:17:48,  6.91s/it]score1 tensor([[0.4531],
-        [0.5000],
-        [0.4414],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5273, 0.4492, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:29,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 15:43:29,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.22 | bwd_microstep: 4618.55 | bwd_inner_microstep: 4613.39 | bwd_allreduce_microstep: 5.07 | step_microstep: 48.40
-[2025-01-25 15:43:29,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.18 | bwd: 4618.58 | bwd_inner: 4613.39 | bwd_allreduce: 5.12 | step: 48.42
- 43%|████▎     | 2520/5800 [6:56:59<6:17:45,  6.91s/it]                                                       {'loss': 0.0244, 'grad_norm': 7.987000942230225, 'learning_rate': 2.515660096799088e-05, 'epoch': 21.72}
- 43%|████▎     | 2520/5800 [6:56:59<6:17:45,  6.91s/it]score1 tensor([[0.5117],
-        [0.6250],
-        [0.5742],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6875, 0.5820, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:36,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 15:43:36,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.20 | bwd_microstep: 4621.16 | bwd_inner_microstep: 4616.07 | bwd_allreduce_microstep: 4.98 | step_microstep: 41.45
-[2025-01-25 15:43:36,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.14 | bwd: 4621.18 | bwd_inner: 4616.07 | bwd_allreduce: 5.04 | step: 41.45
- 43%|████▎     | 2521/5800 [6:57:06<6:17:25,  6.91s/it]                                                       {'loss': 0.0234, 'grad_norm': 4.701102256774902, 'learning_rate': 2.514580963384199e-05, 'epoch': 21.73}
- 43%|████▎     | 2521/5800 [6:57:06<6:17:25,  6.91s/it]score1 tensor([[0.5000],
-        [0.6211],
-        [0.5156],
-        [0.3340]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6445, 0.4961, 0.3086], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:42,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 15:43:42,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.16 | bwd_microstep: 4621.45 | bwd_inner_microstep: 4616.17 | bwd_allreduce_microstep: 5.16 | step_microstep: 42.47
-[2025-01-25 15:43:42,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.12 | bwd: 4621.48 | bwd_inner: 4616.17 | bwd_allreduce: 5.23 | step: 42.48
- 43%|████▎     | 2522/5800 [6:57:12<6:17:21,  6.91s/it]                                                       {'loss': 0.021, 'grad_norm': 3.6376495361328125, 'learning_rate': 2.513501669514037e-05, 'epoch': 21.74}
- 43%|████▎     | 2522/5800 [6:57:12<6:17:21,  6.91s/it]score1 tensor([[0.4316],
-        [0.4629],
-        [0.5898],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4668, 0.6094, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:49,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:43:49,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.12 | bwd_microstep: 4632.88 | bwd_inner_microstep: 4627.07 | bwd_allreduce_microstep: 5.72 | step_microstep: 50.00
-[2025-01-25 15:43:49,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.08 | bwd: 4632.90 | bwd_inner: 4627.07 | bwd_allreduce: 5.76 | step: 50.01
- 44%|████▎     | 2523/5800 [6:57:19<6:17:45,  6.92s/it]                                                       {'loss': 0.0078, 'grad_norm': 4.063116550445557, 'learning_rate': 2.5124222155251445e-05, 'epoch': 21.75}
- 44%|████▎     | 2523/5800 [6:57:19<6:17:45,  6.92s/it]score1 tensor([[0.5156],
-        [0.4629],
-        [0.3535],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5000, 0.3477, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:43:56,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 15:43:56,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.62 | bwd_microstep: 4626.93 | bwd_inner_microstep: 4621.85 | bwd_allreduce_microstep: 4.99 | step_microstep: 45.68
-[2025-01-25 15:43:56,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.57 | bwd: 4626.97 | bwd_inner: 4621.85 | bwd_allreduce: 5.04 | step: 45.67
- 44%|████▎     | 2524/5800 [6:57:26<6:17:37,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.5973484516143799, 'learning_rate': 2.511342601754114e-05, 'epoch': 21.76}
- 44%|████▎     | 2524/5800 [6:57:26<6:17:37,  6.92s/it]score1 tensor([[0.4297],
-        [0.5430],
-        [0.4551],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.5430, 0.5039, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:44:03,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 15:44:03,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.32 | bwd_microstep: 4582.04 | bwd_inner_microstep: 4576.97 | bwd_allreduce_microstep: 5.00 | step_microstep: 41.08
-[2025-01-25 15:44:03,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.29 | bwd: 4582.06 | bwd_inner: 4576.97 | bwd_allreduce: 5.04 | step: 41.09
- 44%|████▎     | 2525/5800 [6:57:33<6:16:40,  6.90s/it]                                                       {'loss': 0.0229, 'grad_norm': 1.9822956323623657, 'learning_rate': 2.5102628285375885e-05, 'epoch': 21.77}
- 44%|████▎     | 2525/5800 [6:57:33<6:16:40,  6.90s/it]score1 tensor([[0.3477],
-        [0.5859],
-        [0.4648],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.5625, 0.3262, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0474, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:44:10,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:44:10,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.45 | bwd_microstep: 4647.02 | bwd_inner_microstep: 4641.89 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.29
-[2025-01-25 15:44:10,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.42 | bwd: 4647.05 | bwd_inner: 4641.89 | bwd_allreduce: 5.08 | step: 46.30
- 44%|████▎     | 2526/5800 [6:57:40<6:17:02,  6.91s/it]                                                       {'loss': 0.0474, 'grad_norm': 4.6866044998168945, 'learning_rate': 2.5091828962122582e-05, 'epoch': 21.78}
- 44%|████▎     | 2526/5800 [6:57:40<6:17:02,  6.91s/it]score1 tensor([[0.4395],
-        [0.3730],
-        [0.5000],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.3652, 0.4980, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:44:17,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 15:44:17,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.68 | bwd_microstep: 4648.42 | bwd_inner_microstep: 4643.26 | bwd_allreduce_microstep: 5.06 | step_microstep: 46.01
-[2025-01-25 15:44:17,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.61 | bwd: 4648.44 | bwd_inner: 4643.26 | bwd_allreduce: 5.10 | step: 46.02
- 44%|████▎     | 2527/5800 [6:57:47<6:17:22,  6.92s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.4512483477592468, 'learning_rate': 2.5081028051148663e-05, 'epoch': 21.78}
- 44%|████▎     | 2527/5800 [6:57:47<6:17:22,  6.92s/it]score1 tensor([[0.4004],
-        [0.3926],
-        [0.4160],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.4023, 0.3809, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:44:24,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 15:44:24,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.61 | bwd_microstep: 4643.81 | bwd_inner_microstep: 4638.98 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.57
-[2025-01-25 15:44:24,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.56 | bwd: 4643.83 | bwd_inner: 4638.97 | bwd_allreduce: 4.79 | step: 42.57
- 44%|████▎     | 2528/5800 [6:57:54<6:17:21,  6.92s/it]                                                       {'loss': 0.02, 'grad_norm': 0.5680421590805054, 'learning_rate': 2.507022555582203e-05, 'epoch': 21.79}
- 44%|████▎     | 2528/5800 [6:57:54<6:17:21,  6.92s/it]score1 tensor([[0.5703],
-        [0.5703],
-        [0.6250],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5664, 0.5508, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:44:31,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 15:44:31,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.66 | bwd_microstep: 4648.32 | bwd_inner_microstep: 4643.26 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.60
-[2025-01-25 15:44:31,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.63 | bwd: 4648.35 | bwd_inner: 4643.26 | bwd_allreduce: 5.01 | step: 42.61
- 44%|████▎     | 2529/5800 [6:58:01<6:17:33,  6.93s/it]                                                       {'loss': 0.0298, 'grad_norm': 0.7580841183662415, 'learning_rate': 2.5059421479511094e-05, 'epoch': 21.8}
- 44%|████▎     | 2529/5800 [6:58:01<6:17:33,  6.93s/it]score1 tensor([[0.5781],
-        [0.4570],
-        [0.4785],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4512, 0.4922, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:44:38,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 15:44:38,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.48 | bwd_microstep: 4648.52 | bwd_inner_microstep: 4643.12 | bwd_allreduce_microstep: 5.28 | step_microstep: 49.57
-[2025-01-25 15:44:38,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.45 | bwd: 4648.55 | bwd_inner: 4643.12 | bwd_allreduce: 5.35 | step: 49.58
- 44%|████▎     | 2530/5800 [6:58:08<6:17:35,  6.93s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.537919998168945, 'learning_rate': 2.5048615825584755e-05, 'epoch': 21.81}
- 44%|████▎     | 2530/5800 [6:58:08<6:17:35,  6.93s/it]evaluate!
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1504, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6328]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1504, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3809]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6250]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6211]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6680]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3691]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1934, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3887]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3730]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6733076618380458
-PLCC_score: 0.6670188660131287
-KRCC_score: 0.4830640593647834
-SRCC_level: 0.6733076618380458
-PLCC_level: 0.6670188660131287
-KRCC_level: 0.4830640593647834
-New best SRCC_score: 0.6733076618380458. Saving model...
-[INFO|trainer.py:3705] 2025-01-25 15:55:06,936 >> Saving model checkpoint to /DATA/env/wjr/newtrain/stage2/mos3
-[INFO|configuration_utils.py:410] 2025-01-25 15:55:06,944 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/config.json
-[INFO|configuration_utils.py:868] 2025-01-25 15:55:06,945 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/generation_config.json
-[INFO|modeling_utils.py:2844] 2025-01-25 15:56:36,604 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /DATA/env/wjr/newtrain/stage2/mos3/model.safetensors.index.json.
-[INFO|tokenization_utils_base.py:2641] 2025-01-25 15:56:36,607 >> tokenizer config file saved in /DATA/env/wjr/newtrain/stage2/mos3/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2650] 2025-01-25 15:56:36,608 >> Special tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/special_tokens_map.json
-[INFO|tokenization_utils_base.py:2701] 2025-01-25 15:56:36,608 >> added tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/added_tokens.json
-01/25/2025 15:56:48 - INFO - __main__ - Saved LoRA weights to /DATA/env/wjr/newtrain/stage2/mos3/lora_weights.pth
-score1 tensor([[0.4961],
-        [0.5469],
-        [0.6367],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5625, 0.6641, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:56:54,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 15:56:54,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2123.40 | bwd_microstep: 4562.91 | bwd_inner_microstep: 4557.82 | bwd_allreduce_microstep: 4.95 | step_microstep: 45.10
-[2025-01-25 15:56:54,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2123.36 | bwd: 4562.93 | bwd_inner: 4557.82 | bwd_allreduce: 5.00 | step: 45.11
- 44%|████▎     | 2531/5800 [7:10:24<205:03:54, 225.83s/it]                                                          {'loss': 0.0146, 'grad_norm': 8.751948356628418, 'learning_rate': 2.50378085974124e-05, 'epoch': 21.82}
- 44%|████▎     | 2531/5800 [7:10:24<205:03:54, 225.83s/it]score1 tensor([[0.3828],
-        [0.5430],
-        [0.4590],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5469, 0.4941, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:01,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 15:57:01,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.80 | bwd_microstep: 4562.16 | bwd_inner_microstep: 4557.30 | bwd_allreduce_microstep: 4.78 | step_microstep: 47.14
-[2025-01-25 15:57:01,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.76 | bwd: 4562.18 | bwd_inner: 4557.30 | bwd_allreduce: 4.83 | step: 47.16
- 44%|████▎     | 2532/5800 [7:10:31<145:21:32, 160.13s/it]                                                          {'loss': 0.0186, 'grad_norm': 7.937232971191406, 'learning_rate': 2.5026999798363908e-05, 'epoch': 21.83}
- 44%|████▎     | 2532/5800 [7:10:31<145:21:32, 160.13s/it]score1 tensor([[0.4688],
-        [0.6680],
-        [0.4844],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6445, 0.4414, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:08,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 15:57:08,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2126.49 | bwd_microstep: 4557.89 | bwd_inner_microstep: 4552.87 | bwd_allreduce_microstep: 4.92 | step_microstep: 49.61
-[2025-01-25 15:57:08,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2126.46 | bwd: 4557.92 | bwd_inner: 4552.87 | bwd_allreduce: 4.98 | step: 49.62
- 44%|████▎     | 2533/5800 [7:10:38<103:34:28, 114.13s/it]                                                          {'loss': 0.0288, 'grad_norm': 4.813058376312256, 'learning_rate': 2.501618943180965e-05, 'epoch': 21.84}
- 44%|████▎     | 2533/5800 [7:10:38<103:34:28, 114.13s/it]score1 tensor([[0.5508],
-        [0.6250],
-        [0.4785],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6094, 0.5391, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:15,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 15:57:15,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.46 | bwd_microstep: 4539.77 | bwd_inner_microstep: 4534.52 | bwd_allreduce_microstep: 5.10 | step_microstep: 47.37
-[2025-01-25 15:57:15,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.42 | bwd: 4539.79 | bwd_inner: 4534.52 | bwd_allreduce: 5.19 | step: 47.38
- 44%|████▎     | 2534/5800 [7:10:45<74:20:05, 81.94s/it]                                                          {'loss': 0.0229, 'grad_norm': 2.5800411701202393, 'learning_rate': 2.5005377501120497e-05, 'epoch': 21.84}
- 44%|████▎     | 2534/5800 [7:10:45<74:20:05, 81.94s/it]score1 tensor([[0.6016],
-        [0.4902],
-        [0.5586],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5000, 0.5547, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:22,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.37
-[2025-01-25 15:57:22,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.65 | bwd_microstep: 4542.37 | bwd_inner_microstep: 4537.17 | bwd_allreduce_microstep: 5.11 | step_microstep: 60.85
-[2025-01-25 15:57:22,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.62 | bwd: 4542.40 | bwd_inner: 4537.17 | bwd_allreduce: 5.16 | step: 60.89
- 44%|████▎     | 2535/5800 [7:10:52<53:52:40, 59.41s/it]                                                        {'loss': 0.0122, 'grad_norm': 2.4764976501464844, 'learning_rate': 2.499456400966778e-05, 'epoch': 21.85}
- 44%|████▎     | 2535/5800 [7:10:52<53:52:40, 59.41s/it]score1 tensor([[0.4551],
-        [0.4238],
-        [0.5195],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4199, 0.5156, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:29,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.38 | optimizer_step: 4.57
-[2025-01-25 15:57:29,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.58 | bwd_microstep: 4592.42 | bwd_inner_microstep: 4586.71 | bwd_allreduce_microstep: 5.56 | step_microstep: 80.23
-[2025-01-25 15:57:29,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.54 | bwd: 4592.44 | bwd_inner: 4586.71 | bwd_allreduce: 5.63 | step: 80.23
- 44%|████▎     | 2536/5800 [7:10:59<39:35:10, 43.66s/it]                                                        {'loss': 0.0195, 'grad_norm': 4.128132343292236, 'learning_rate': 2.4983748960823347e-05, 'epoch': 21.86}
- 44%|████▎     | 2536/5800 [7:10:59<39:35:10, 43.66s/it]score1 tensor([[0.6133],
-        [0.4688],
-        [0.4004],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4766, 0.3984, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:36,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 15:57:36,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.19 | bwd_microstep: 4599.54 | bwd_inner_microstep: 4594.27 | bwd_allreduce_microstep: 5.19 | step_microstep: 46.95
-[2025-01-25 15:57:36,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.16 | bwd: 4599.56 | bwd_inner: 4594.26 | bwd_allreduce: 5.23 | step: 46.96
- 44%|████▎     | 2537/5800 [7:11:05<29:34:07, 32.62s/it]                                                        {'loss': 0.0171, 'grad_norm': 0.5017601251602173, 'learning_rate': 2.497293235795951e-05, 'epoch': 21.87}
- 44%|████▎     | 2537/5800 [7:11:05<29:34:07, 32.62s/it]score1 tensor([[0.4395],
-        [0.4805],
-        [0.5781],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4785, 0.4941, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 15:57:42,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.24 | bwd_microstep: 4583.60 | bwd_inner_microstep: 4578.75 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.72
-[2025-01-25 15:57:42,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.20 | bwd: 4583.63 | bwd_inner: 4578.75 | bwd_allreduce: 4.81 | step: 43.73
- 44%|████▍     | 2538/5800 [7:11:12<22:33:08, 24.89s/it]                                                        {'loss': 0.0366, 'grad_norm': 4.51328706741333, 'learning_rate': 2.496211420444908e-05, 'epoch': 21.88}
- 44%|████▍     | 2538/5800 [7:11:12<22:33:08, 24.89s/it]score1 tensor([[0.4141],
-        [0.4453],
-        [0.4199],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4395, 0.4551, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:49,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 15:57:49,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.02 | bwd_microstep: 4595.49 | bwd_inner_microstep: 4587.18 | bwd_allreduce_microstep: 8.20 | step_microstep: 70.22
-[2025-01-25 15:57:49,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.99 | bwd: 4595.51 | bwd_inner: 4587.18 | bwd_allreduce: 8.26 | step: 70.24
- 44%|████▍     | 2539/5800 [7:11:19<17:39:15, 19.49s/it]                                                        {'loss': 0.0171, 'grad_norm': 4.132168769836426, 'learning_rate': 2.4951294503665336e-05, 'epoch': 21.89}
- 44%|████▍     | 2539/5800 [7:11:19<17:39:15, 19.49s/it]score1 tensor([[0.3887],
-        [0.3613],
-        [0.5508],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.3340, 0.5625, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:57:56,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:57:56,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.68 | bwd_microstep: 4585.35 | bwd_inner_microstep: 4580.58 | bwd_allreduce_microstep: 4.67 | step_microstep: 56.85
-[2025-01-25 15:57:56,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.63 | bwd: 4585.37 | bwd_inner: 4580.58 | bwd_allreduce: 4.71 | step: 56.85
- 44%|████▍     | 2540/5800 [7:11:26<14:13:16, 15.70s/it]                                                        {'loss': 0.0312, 'grad_norm': 4.372840881347656, 'learning_rate': 2.494047325898205e-05, 'epoch': 21.9}
- 44%|████▍     | 2540/5800 [7:11:26<14:13:16, 15.70s/it]score1 tensor([[0.3418],
-        [0.5039],
-        [0.4805],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.4785, 0.5117, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:03,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 15:58:03,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.82 | bwd_microstep: 4600.80 | bwd_inner_microstep: 4592.76 | bwd_allreduce_microstep: 7.84 | step_microstep: 63.52
-[2025-01-25 15:58:03,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.77 | bwd: 4600.86 | bwd_inner: 4592.76 | bwd_allreduce: 7.95 | step: 63.49
- 44%|████▍     | 2541/5800 [7:11:33<11:49:24, 13.06s/it]                                                        {'loss': 0.0273, 'grad_norm': 3.993030309677124, 'learning_rate': 2.4929650473773482e-05, 'epoch': 21.91}
- 44%|████▍     | 2541/5800 [7:11:33<11:49:24, 13.06s/it]score1 tensor([[0.6211],
-        [0.6016],
-        [0.4980],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6016, 0.4980, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:10,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:58:10,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.36 | bwd_microstep: 4483.16 | bwd_inner_microstep: 4478.33 | bwd_allreduce_microstep: 4.73 | step_microstep: 44.20
-[2025-01-25 15:58:10,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.32 | bwd: 4483.18 | bwd_inner: 4478.33 | bwd_allreduce: 4.78 | step: 44.21
- 44%|████▍     | 2542/5800 [7:11:40<10:06:21, 11.17s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.0944154262542725, 'learning_rate': 2.491882615141436e-05, 'epoch': 21.91}
- 44%|████▍     | 2542/5800 [7:11:40<10:06:21, 11.17s/it]score1 tensor([[0.5312],
-        [0.5195],
-        [0.4102],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.6055, 0.4160, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:17,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.27 | optimizer_step: 4.36
-[2025-01-25 15:58:17,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.58 | bwd_microstep: 4601.23 | bwd_inner_microstep: 4591.58 | bwd_allreduce_microstep: 9.43 | step_microstep: 54.23
-[2025-01-25 15:58:17,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.54 | bwd: 4601.26 | bwd_inner: 4591.58 | bwd_allreduce: 9.59 | step: 54.25
- 44%|████▍     | 2543/5800 [7:11:47<8:56:41,  9.89s/it]                                                        {'loss': 0.0513, 'grad_norm': 8.436439514160156, 'learning_rate': 2.4908000295279893e-05, 'epoch': 21.92}
- 44%|████▍     | 2543/5800 [7:11:47<8:56:41,  9.89s/it]score1 tensor([[0.5430],
-        [0.5391],
-        [0.5859],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5430, 0.6445, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:24,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:58:24,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.95 | bwd_microstep: 4613.39 | bwd_inner_microstep: 4608.49 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.02
-[2025-01-25 15:58:24,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.93 | bwd: 4613.42 | bwd_inner: 4608.49 | bwd_allreduce: 4.86 | step: 45.03
- 44%|████▍     | 2544/5800 [7:11:54<8:07:34,  8.98s/it]                                                       {'loss': 0.0293, 'grad_norm': 4.217962265014648, 'learning_rate': 2.4897172908745782e-05, 'epoch': 21.93}
- 44%|████▍     | 2544/5800 [7:11:54<8:07:34,  8.98s/it]score1 tensor([[0.6680],
-        [0.3906],
-        [0.4590],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.4023, 0.4668, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:30,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 15:58:30,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.87 | bwd_microstep: 4625.52 | bwd_inner_microstep: 4620.43 | bwd_allreduce_microstep: 5.01 | step_microstep: 45.16
-[2025-01-25 15:58:30,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.84 | bwd: 4625.54 | bwd_inner: 4620.43 | bwd_allreduce: 5.05 | step: 45.17
- 44%|████▍     | 2545/5800 [7:12:00<7:33:32,  8.36s/it]                                                       {'loss': 0.0195, 'grad_norm': 4.33928918838501, 'learning_rate': 2.4886343995188175e-05, 'epoch': 21.94}
- 44%|████▍     | 2545/5800 [7:12:00<7:33:32,  8.36s/it]score1 tensor([[0.4688],
-        [0.4668],
-        [0.5938],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4336, 0.5703, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:37,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 15:58:37,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.01 | bwd_microstep: 4581.68 | bwd_inner_microstep: 4576.89 | bwd_allreduce_microstep: 4.71 | step_microstep: 48.94
-[2025-01-25 15:58:37,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.98 | bwd: 4581.70 | bwd_inner: 4576.89 | bwd_allreduce: 4.75 | step: 48.94
- 44%|████▍     | 2546/5800 [7:12:07<7:08:59,  7.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 2.3099544048309326, 'learning_rate': 2.4875513557983725e-05, 'epoch': 21.95}
- 44%|████▍     | 2546/5800 [7:12:07<7:08:59,  7.91s/it]score1 tensor([[0.5156],
-        [0.4922],
-        [0.5938],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4648, 0.5781, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:44,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 15:58:44,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.04 | bwd_microstep: 4623.16 | bwd_inner_microstep: 4618.32 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.21
-[2025-01-25 15:58:44,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.01 | bwd: 4623.18 | bwd_inner: 4618.32 | bwd_allreduce: 4.79 | step: 45.21
- 44%|████▍     | 2547/5800 [7:12:14<6:52:18,  7.60s/it]                                                       {'loss': 0.0259, 'grad_norm': 4.343642711639404, 'learning_rate': 2.486468160050955e-05, 'epoch': 21.96}
- 44%|████▍     | 2547/5800 [7:12:14<6:52:18,  7.60s/it]score1 tensor([[0.5625],
-        [0.4746],
-        [0.3867],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4473, 0.4004, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:51,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 15:58:51,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.51 | bwd_microstep: 4623.08 | bwd_inner_microstep: 4617.57 | bwd_allreduce_microstep: 5.42 | step_microstep: 44.97
-[2025-01-25 15:58:51,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.48 | bwd: 4623.11 | bwd_inner: 4617.57 | bwd_allreduce: 5.46 | step: 44.97
- 44%|████▍     | 2548/5800 [7:12:21<6:40:38,  7.39s/it]                                                       {'loss': 0.02, 'grad_norm': 0.9086819887161255, 'learning_rate': 2.4853848126143244e-05, 'epoch': 21.97}
- 44%|████▍     | 2548/5800 [7:12:21<6:40:38,  7.39s/it]score1 tensor([[0.5039],
-        [0.4512],
-        [0.5039],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4844, 0.4922, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:58:58,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.36 | optimizer_step: 4.37
-[2025-01-25 15:58:58,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4623.53 | bwd_inner_microstep: 4617.16 | bwd_allreduce_microstep: 6.29 | step_microstep: 72.63
-[2025-01-25 15:58:58,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4623.55 | bwd_inner: 4617.16 | bwd_allreduce: 6.33 | step: 72.69
- 44%|████▍     | 2549/5800 [7:12:28<6:33:24,  7.26s/it]                                                       {'loss': 0.0239, 'grad_norm': 4.1487298011779785, 'learning_rate': 2.4843013138262873e-05, 'epoch': 21.97}
- 44%|████▍     | 2549/5800 [7:12:28<6:33:24,  7.26s/it]score1 tensor([[0.4688],
-        [0.6211],
-        [0.3496],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5508, 0.3613, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:59:05,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 15:59:05,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.40 | bwd_microstep: 4623.29 | bwd_inner_microstep: 4618.18 | bwd_allreduce_microstep: 4.97 | step_microstep: 45.75
-[2025-01-25 15:59:05,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.33 | bwd: 4623.31 | bwd_inner: 4618.18 | bwd_allreduce: 5.05 | step: 45.72
- 44%|████▍     | 2550/5800 [7:12:35<6:27:42,  7.16s/it]                                                       {'loss': 0.0249, 'grad_norm': 1.2448853254318237, 'learning_rate': 2.4832176640246974e-05, 'epoch': 21.98}
- 44%|████▍     | 2550/5800 [7:12:35<6:27:42,  7.16s/it]score1 tensor([[0.5547],
-        [0.5234],
-        [0.4863],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5195, 0.4941, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:59:12,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.23 | optimizer_step: 4.63
-[2025-01-25 15:59:12,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.61 | bwd_microstep: 4622.70 | bwd_inner_microstep: 4617.70 | bwd_allreduce_microstep: 4.89 | step_microstep: 59.65
-[2025-01-25 15:59:12,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.57 | bwd: 4622.73 | bwd_inner: 4617.70 | bwd_allreduce: 4.95 | step: 59.66
- 44%|████▍     | 2551/5800 [7:12:42<6:23:47,  7.09s/it]                                                       {'loss': 0.0166, 'grad_norm': 0.6324518918991089, 'learning_rate': 2.482133863547455e-05, 'epoch': 21.99}
- 44%|████▍     | 2551/5800 [7:12:42<6:23:47,  7.09s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 15:59:17,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 15:59:17,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 579.58 | bwd_microstep: 1220.38 | bwd_inner_microstep: 1213.62 | bwd_allreduce_microstep: 6.55 | step_microstep: 63.10
-[2025-01-25 15:59:17,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 579.47 | bwd: 1220.43 | bwd_inner: 1213.62 | bwd_allreduce: 6.67 | step: 63.07
- 44%|████▍     | 2552/5800 [7:12:47<5:57:03,  6.60s/it]                                                       {'loss': 0.0273, 'grad_norm': 9.141136169433594, 'learning_rate': 2.4810499127325077e-05, 'epoch': 22.0}
- 44%|████▍     | 2552/5800 [7:12:47<5:57:03,  6.60s/it][2025-01-25 15:59:22,178] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:59:32,272] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:59:42,103] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 15:59:52,097] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5820],
-        [0.4980],
-        [0.4531],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4844, 0.4805, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:07,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.31 | optimizer_step: 4.36
-[2025-01-25 16:00:07,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.49 | bwd_microstep: 4601.79 | bwd_inner_microstep: 4597.26 | bwd_allreduce_microstep: 4.45 | step_microstep: 77.52
-[2025-01-25 16:00:07,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.38 | bwd: 4601.82 | bwd_inner: 4597.26 | bwd_allreduce: 4.49 | step: 77.52
- 44%|████▍     | 2553/5800 [7:13:37<17:42:02, 19.62s/it]                                                        {'loss': 0.02, 'grad_norm': 4.174647808074951, 'learning_rate': 2.479965811917851e-05, 'epoch': 22.01}
- 44%|████▍     | 2553/5800 [7:13:37<17:42:02, 19.62s/it]score1 tensor([[0.5156],
-        [0.5312],
-        [0.5469],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5469, 0.5391, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:14,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:00:14,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.16 | bwd_microstep: 4589.81 | bwd_inner_microstep: 4581.80 | bwd_allreduce_microstep: 7.79 | step_microstep: 59.83
-[2025-01-25 16:00:14,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.09 | bwd: 4589.87 | bwd_inner: 4581.80 | bwd_allreduce: 7.91 | step: 59.81
- 44%|████▍     | 2554/5800 [7:13:44<14:15:06, 15.81s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.4604041576385498, 'learning_rate': 2.4788815614415257e-05, 'epoch': 22.02}
- 44%|████▍     | 2554/5800 [7:13:44<14:15:06, 15.81s/it]score1 tensor([[0.4434],
-        [0.4746],
-        [0.4414],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4980, 0.4180, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:21,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 16:00:21,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.08 | bwd_microstep: 4588.62 | bwd_inner_microstep: 4584.00 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.67
-[2025-01-25 16:00:21,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.05 | bwd: 4588.64 | bwd_inner: 4584.00 | bwd_allreduce: 4.58 | step: 41.68
- 44%|████▍     | 2555/5800 [7:13:51<11:49:22, 13.12s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.5152963399887085, 'learning_rate': 2.477797161641621e-05, 'epoch': 22.03}
- 44%|████▍     | 2555/5800 [7:13:51<11:49:22, 13.12s/it]score1 tensor([[0.4570],
-        [0.4609],
-        [0.4355],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4941, 0.4785, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:28,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.37
-[2025-01-25 16:00:28,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.13 | bwd_microstep: 4591.08 | bwd_inner_microstep: 4586.42 | bwd_allreduce_microstep: 4.58 | step_microstep: 43.46
-[2025-01-25 16:00:28,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.10 | bwd: 4591.10 | bwd_inner: 4586.42 | bwd_allreduce: 4.62 | step: 43.47
- 44%|████▍     | 2556/5800 [7:13:58<10:07:32, 11.24s/it]                                                        {'loss': 0.0288, 'grad_norm': 0.5245023965835571, 'learning_rate': 2.47671261285627e-05, 'epoch': 22.03}
- 44%|████▍     | 2556/5800 [7:13:58<10:07:32, 11.24s/it]score1 tensor([[0.4863],
-        [0.6289],
-        [0.6016],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.6602, 0.5391, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:35,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:00:35,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.30 | bwd_microstep: 4579.46 | bwd_inner_microstep: 4574.76 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.84
-[2025-01-25 16:00:35,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.26 | bwd: 4579.48 | bwd_inner: 4574.76 | bwd_allreduce: 4.66 | step: 42.84
- 44%|████▍     | 2557/5800 [7:14:05<8:56:03,  9.92s/it]                                                        {'loss': 0.0405, 'grad_norm': 4.180708408355713, 'learning_rate': 2.4756279154236553e-05, 'epoch': 22.04}
- 44%|████▍     | 2557/5800 [7:14:05<8:56:03,  9.92s/it]score1 tensor([[0.5859],
-        [0.4590],
-        [0.4805],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4512, 0.4727, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:42,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 16:00:42,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.72 | bwd_microstep: 4577.96 | bwd_inner_microstep: 4573.01 | bwd_allreduce_microstep: 4.87 | step_microstep: 41.93
-[2025-01-25 16:00:42,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.69 | bwd: 4577.98 | bwd_inner: 4573.01 | bwd_allreduce: 4.91 | step: 41.94
- 44%|█��██▍     | 2558/5800 [7:14:12<8:06:04,  9.00s/it]                                                       {'loss': 0.0117, 'grad_norm': 3.89127254486084, 'learning_rate': 2.4745430696820034e-05, 'epoch': 22.05}
- 44%|████▍     | 2558/5800 [7:14:12<8:06:04,  9.00s/it]score1 tensor([[0.4590],
-        [0.4023],
-        [0.5156],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.3398, 0.5000, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:48,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:00:48,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.00 | bwd_microstep: 4584.34 | bwd_inner_microstep: 4579.47 | bwd_allreduce_microstep: 4.80 | step_microstep: 41.46
-[2025-01-25 16:00:48,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.96 | bwd: 4584.36 | bwd_inner: 4579.47 | bwd_allreduce: 4.83 | step: 41.48
- 44%|████▍     | 2559/5800 [7:14:18<7:30:57,  8.35s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.268336296081543, 'learning_rate': 2.4734580759695882e-05, 'epoch': 22.06}
- 44%|████▍     | 2559/5800 [7:14:18<7:30:57,  8.35s/it]score1 tensor([[0.6250],
-        [0.4121],
-        [0.3984],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.3984, 0.3691, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:00:55,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:00:55,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.05 | bwd_microstep: 4584.58 | bwd_inner_microstep: 4579.52 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.15
-[2025-01-25 16:00:55,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.01 | bwd: 4584.60 | bwd_inner: 4579.52 | bwd_allreduce: 5.01 | step: 43.15
- 44%|████▍     | 2560/5800 [7:14:25<7:06:39,  7.90s/it]                                                       {'loss': 0.0264, 'grad_norm': 3.3660683631896973, 'learning_rate': 2.4723729346247297e-05, 'epoch': 22.07}
- 44%|████▍     | 2560/5800 [7:14:25<7:06:39,  7.90s/it]score1 tensor([[0.6367],
-        [0.3984],
-        [0.5781],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.3945, 0.6367, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:02,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:01:02,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.58 | bwd_microstep: 4602.06 | bwd_inner_microstep: 4596.94 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.89
-[2025-01-25 16:01:02,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.55 | bwd: 4602.08 | bwd_inner: 4596.94 | bwd_allreduce: 5.06 | step: 42.90
- 44%|████▍     | 2561/5800 [7:14:32<6:49:49,  7.59s/it]                                                       {'loss': 0.0234, 'grad_norm': 4.86919641494751, 'learning_rate': 2.4712876459857937e-05, 'epoch': 22.08}
- 44%|████▍     | 2561/5800 [7:14:32<6:49:49,  7.59s/it]score1 tensor([[0.4980],
-        [0.5430],
-        [0.5078],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5664, 0.5352, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:09,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 16:01:09,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.78 | bwd_microstep: 4607.37 | bwd_inner_microstep: 4602.52 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.90
-[2025-01-25 16:01:09,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.74 | bwd: 4607.40 | bwd_inner: 4602.52 | bwd_allreduce: 4.81 | step: 44.91
- 44%|████▍     | 2562/5800 [7:14:39<6:38:10,  7.38s/it]                                                       {'loss': 0.0181, 'grad_norm': 0.5762536525726318, 'learning_rate': 2.4702022103911927e-05, 'epoch': 22.09}
- 44%|████▍     | 2562/5800 [7:14:39<6:38:10,  7.38s/it]score1 tensor([[0.4395],
-        [0.4199],
-        [0.5391],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4316, 0.5664, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:16,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.92 | optimizer_step: 4.36
-[2025-01-25 16:01:16,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.84 | bwd_microstep: 4618.52 | bwd_inner_microstep: 4612.15 | bwd_allreduce_microstep: 6.30 | step_microstep: 49.73
-[2025-01-25 16:01:16,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.80 | bwd: 4618.55 | bwd_inner: 4612.15 | bwd_allreduce: 6.33 | step: 49.72
- 44%|████▍     | 2563/5800 [7:14:46<6:30:31,  7.24s/it]                                                       {'loss': 0.0288, 'grad_norm': 4.16266393661499, 'learning_rate': 2.4691166281793833e-05, 'epoch': 22.09}
- 44%|████▍     | 2563/5800 [7:14:46<6:30:31,  7.24s/it]score1 tensor([[0.5625],
-        [0.5352],
-        [0.5352],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.6211, 0.6211, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0713, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:23,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.42 | optimizer_step: 4.37
-[2025-01-25 16:01:23,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.73 | bwd_microstep: 4623.61 | bwd_inner_microstep: 4615.91 | bwd_allreduce_microstep: 7.46 | step_microstep: 70.59
-[2025-01-25 16:01:23,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.68 | bwd: 4623.71 | bwd_inner: 4615.91 | bwd_allreduce: 7.59 | step: 70.59
- 44%|████▍     | 2564/5800 [7:14:53<6:25:58,  7.16s/it]                                                       {'loss': 0.0713, 'grad_norm': 8.468365669250488, 'learning_rate': 2.4680308996888695e-05, 'epoch': 22.1}
- 44%|████▍     | 2564/5800 [7:14:53<6:25:58,  7.16s/it]score1 tensor([[0.4473],
-        [0.5156],
-        [0.4121],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5117, 0.4512, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:30,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:01:30,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.67 | bwd_microstep: 4617.33 | bwd_inner_microstep: 4612.23 | bwd_allreduce_microstep: 4.98 | step_microstep: 42.89
-[2025-01-25 16:01:30,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.63 | bwd: 4617.35 | bwd_inner: 4612.23 | bwd_allreduce: 5.04 | step: 42.90
- 44%|████▍     | 2565/5800 [7:15:00<6:21:47,  7.08s/it]                                                       {'loss': 0.0312, 'grad_norm': 3.8277721405029297, 'learning_rate': 2.4669450252582e-05, 'epoch': 22.11}
- 44%|████▍     | 2565/5800 [7:15:00<6:21:47,  7.08s/it]score1 tensor([[0.4805],
-        [0.4336],
-        [0.4121],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4512, 0.4043, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:37,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 16:01:37,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.13 | bwd_microstep: 4622.42 | bwd_inner_microstep: 4617.21 | bwd_allreduce_microstep: 5.09 | step_microstep: 51.03
-[2025-01-25 16:01:37,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.08 | bwd: 4622.44 | bwd_inner: 4617.21 | bwd_allreduce: 5.15 | step: 51.03
- 44%|████▍     | 2566/5800 [7:15:07<6:18:52,  7.03s/it]                                                       {'loss': 0.0156, 'grad_norm': 3.986008882522583, 'learning_rate': 2.4658590052259697e-05, 'epoch': 22.12}
- 44%|████▍     | 2566/5800 [7:15:07<6:18:52,  7.03s/it]score1 tensor([[0.4219],
-        [0.4746],
-        [0.4746],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4453, 0.4707, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:44,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:01:44,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.05 | bwd_microstep: 4619.45 | bwd_inner_microstep: 4614.43 | bwd_allreduce_microstep: 4.91 | step_microstep: 42.17
-[2025-01-25 16:01:44,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.96 | bwd: 4619.48 | bwd_inner: 4614.43 | bwd_allreduce: 4.97 | step: 42.17
- 44%|████▍     | 2567/5800 [7:15:14<6:16:48,  6.99s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.4906696081161499, 'learning_rate': 2.4647728399308185e-05, 'epoch': 22.13}
- 44%|████▍     | 2567/5800 [7:15:14<6:16:48,  6.99s/it]score1 tensor([[0.4785],
-        [0.5547],
-        [0.6289],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5820, 0.6094, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:51,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 16:01:51,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.56 | bwd_microstep: 4614.24 | bwd_inner_microstep: 4609.39 | bwd_allreduce_microstep: 4.77 | step_microstep: 44.32
-[2025-01-25 16:01:51,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.53 | bwd: 4614.26 | bwd_inner: 4609.39 | bwd_allreduce: 4.81 | step: 44.33
- 44%|████▍     | 2568/5800 [7:15:21<6:15:08,  6.96s/it]                                                       {'loss': 0.022, 'grad_norm': 0.434877872467041, 'learning_rate': 2.4636865297114308e-05, 'epoch': 22.14}
- 44%|████▍     | 2568/5800 [7:15:21<6:15:08,  6.96s/it]score1 tensor([[0.5469],
-        [0.5000],
-        [0.5273],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5039, 0.5352, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:01:57,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 16:01:57,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.15 | bwd_microstep: 4629.81 | bwd_inner_microstep: 4625.00 | bwd_allreduce_microstep: 4.73 | step_microstep: 44.72
-[2025-01-25 16:01:57,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.11 | bwd: 4629.83 | bwd_inner: 4625.00 | bwd_allreduce: 4.77 | step: 44.73
- 44%|████▍     | 2569/5800 [7:15:27<6:14:13,  6.95s/it]                                                       {'loss': 0.0137, 'grad_norm': 3.9367802143096924, 'learning_rate': 2.462600074906538e-05, 'epoch': 22.15}
- 44%|████▍     | 2569/5800 [7:15:27<6:14:13,  6.95s/it]score1 tensor([[0.4336],
-        [0.5430],
-        [0.5039],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5352, 0.5156, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:04,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 16:02:04,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.41 | bwd_microstep: 4622.23 | bwd_inner_microstep: 4617.13 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.01
-[2025-01-25 16:02:04,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.36 | bwd: 4622.25 | bwd_inner: 4617.13 | bwd_allreduce: 5.05 | step: 43.02
- 44%|████▍     | 2570/5800 [7:15:34<6:13:27,  6.94s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.4687458872795105, 'learning_rate': 2.461513475854914e-05, 'epoch': 22.16}
- 44%|████▍     | 2570/5800 [7:15:34<6:13:27,  6.94s/it]score1 tensor([[0.5156],
-        [0.6758],
-        [0.5781],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6445, 0.6133, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:11,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.36
-[2025-01-25 16:02:11,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.04 | bwd_microstep: 4624.27 | bwd_inner_microstep: 4618.81 | bwd_allreduce_microstep: 5.29 | step_microstep: 48.02
-[2025-01-25 16:02:11,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.01 | bwd: 4624.30 | bwd_inner: 4618.81 | bwd_allreduce: 5.38 | step: 48.03
- 44%|████▍     | 2571/5800 [7:15:41<6:13:01,  6.93s/it]                                                       {'loss': 0.0391, 'grad_norm': 4.55328893661499, 'learning_rate': 2.460426732895381e-05, 'epoch': 22.16}
- 44%|████▍     | 2571/5800 [7:15:41<6:13:01,  6.93s/it]score1 tensor([[0.4785],
-        [0.4414],
-        [0.5625],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3789, 0.5898, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:18,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.81 | optimizer_step: 4.36
-[2025-01-25 16:02:18,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4634.55 | bwd_inner_microstep: 4629.39 | bwd_allreduce_microstep: 4.97 | step_microstep: 57.07
-[2025-01-25 16:02:18,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4634.60 | bwd_inner: 4629.39 | bwd_allreduce: 5.06 | step: 57.05
- 44%|████▍     | 2572/5800 [7:15:48<6:13:01,  6.93s/it]                                                       {'loss': 0.0327, 'grad_norm': 3.8472328186035156, 'learning_rate': 2.4593398463668036e-05, 'epoch': 22.17}
- 44%|████▍     | 2572/5800 [7:15:48<6:13:01,  6.93s/it]score1 tensor([[0.4473],
-        [0.5508],
-        [0.4512],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5391, 0.4004, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:25,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:02:25,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.99 | bwd_microstep: 4629.80 | bwd_inner_microstep: 4624.29 | bwd_allreduce_microstep: 5.41 | step_microstep: 43.93
-[2025-01-25 16:02:25,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.91 | bwd: 4629.82 | bwd_inner: 4624.29 | bwd_allreduce: 5.46 | step: 43.94
- 44%|████▍     | 2573/5800 [7:15:55<6:13:02,  6.94s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.741837501525879, 'learning_rate': 2.4582528166080915e-05, 'epoch': 22.18}
- 44%|████▍     | 2573/5800 [7:15:55<6:13:02,  6.94s/it]score1 tensor([[0.6016],
-        [0.5508],
-        [0.5469],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5039, 0.5469, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:32,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 16:02:32,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.64 | bwd_microstep: 4586.68 | bwd_inner_microstep: 4579.32 | bwd_allreduce_microstep: 7.20 | step_microstep: 45.85
-[2025-01-25 16:02:32,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.60 | bwd: 4586.73 | bwd_inner: 4579.32 | bwd_allreduce: 7.28 | step: 45.87
- 44%|████▍     | 2574/5800 [7:16:02<6:11:58,  6.92s/it]                                                       {'loss': 0.0239, 'grad_norm': 1.9971541166305542, 'learning_rate': 2.4571656439581995e-05, 'epoch': 22.19}
- 44%|████▍     | 2574/5800 [7:16:02<6:11:58,  6.92s/it]score1 tensor([[0.5391],
-        [0.4844],
-        [0.5859],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4668, 0.5625, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:39,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 16:02:39,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.74 | bwd_microstep: 4642.04 | bwd_inner_microstep: 4637.10 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.72
-[2025-01-25 16:02:39,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.70 | bwd: 4642.06 | bwd_inner: 4637.10 | bwd_allreduce: 4.89 | step: 45.72
- 44%|████▍     | 2575/5800 [7:16:09<6:12:08,  6.92s/it]                                                       {'loss': 0.019, 'grad_norm': 8.705592155456543, 'learning_rate': 2.456078328756128e-05, 'epoch': 22.2}
- 44%|████▍     | 2575/5800 [7:16:09<6:12:08,  6.92s/it]score1 tensor([[0.5586],
-        [0.5977],
-        [0.4980],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6445, 0.4688, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:46,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.36
-[2025-01-25 16:02:46,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.70 | bwd_microstep: 4630.32 | bwd_inner_microstep: 4624.83 | bwd_allreduce_microstep: 5.37 | step_microstep: 45.20
-[2025-01-25 16:02:46,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.67 | bwd: 4630.34 | bwd_inner: 4624.83 | bwd_allreduce: 5.43 | step: 45.22
- 44%|████▍     | 2576/5800 [7:16:16<6:11:57,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 0.7420123815536499, 'learning_rate': 2.4549908713409196e-05, 'epoch': 22.21}
- 44%|████▍     | 2576/5800 [7:16:16<6:11:57,  6.92s/it]score1 tensor([[0.4219],
-        [0.6094],
-        [0.4609],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6289, 0.4570, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:02:53,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 16:02:53,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.88 | bwd_microstep: 4630.80 | bwd_inner_microstep: 4625.34 | bwd_allreduce_microstep: 5.35 | step_microstep: 48.76
-[2025-01-25 16:02:53,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.85 | bwd: 4630.83 | bwd_inner: 4625.34 | bwd_allreduce: 5.40 | step: 48.77
- 44%|████▍     | 2577/5800 [7:16:23<6:11:51,  6.92s/it]                                                       {'loss': 0.0225, 'grad_norm': 0.4511263072490692, 'learning_rate': 2.4539032720516628e-05, 'epoch': 22.22}
- 44%|████▍     | 2577/5800 [7:16:23<6:11:51,  6.92s/it]score1 tensor([[0.3770],
-        [0.5117],
-        [0.5547],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4863, 0.5547, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:00,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 16:03:00,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.42 | bwd_microstep: 4591.89 | bwd_inner_microstep: 4585.49 | bwd_allreduce_microstep: 6.20 | step_microstep: 56.46
-[2025-01-25 16:03:00,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.38 | bwd: 4591.99 | bwd_inner: 4585.49 | bwd_allreduce: 6.29 | step: 56.47
- 44%|████▍     | 2578/5800 [7:16:30<6:11:26,  6.92s/it]                                                       {'loss': 0.0142, 'grad_norm': 2.3081281185150146, 'learning_rate': 2.45281553122749e-05, 'epoch': 22.22}
- 44%|████▍     | 2578/5800 [7:16:30<6:11:26,  6.92s/it]score1 tensor([[0.5430],
-        [0.4609],
-        [0.4082],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4863, 0.4121, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:07,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:03:07,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.58 | bwd_microstep: 4633.01 | bwd_inner_microstep: 4623.69 | bwd_allreduce_microstep: 9.22 | step_microstep: 56.43
-[2025-01-25 16:03:07,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.54 | bwd: 4633.05 | bwd_inner: 4623.69 | bwd_allreduce: 9.27 | step: 56.44
- 44%|████▍     | 2579/5800 [7:16:37<6:11:44,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 7.936980247497559, 'learning_rate': 2.4517276492075772e-05, 'epoch': 22.23}
- 44%|████▍     | 2579/5800 [7:16:37<6:11:44,  6.92s/it]score1 tensor([[0.4121],
-        [0.5859],
-        [0.4395],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6016, 0.4297, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:14,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 16:03:14,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.51 | bwd_microstep: 4627.89 | bwd_inner_microstep: 4622.88 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.38
-[2025-01-25 16:03:14,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.47 | bwd: 4627.91 | bwd_inner: 4622.88 | bwd_allreduce: 4.96 | step: 42.39
- 44%|████▍     | 2580/5800 [7:16:44<6:11:33,  6.92s/it]                                                       {'loss': 0.0469, 'grad_norm': 4.342408180236816, 'learning_rate': 2.4506396263311453e-05, 'epoch': 22.24}
- 44%|████▍     | 2580/5800 [7:16:44<6:11:33,  6.92s/it]score1 tensor([[0.4629],
-        [0.5000],
-        [0.3223],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5586, 0.3418, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:21,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 16:03:21,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.78 | bwd_microstep: 4631.24 | bwd_inner_microstep: 4625.84 | bwd_allreduce_microstep: 5.28 | step_microstep: 45.48
-[2025-01-25 16:03:21,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.75 | bwd: 4631.26 | bwd_inner: 4625.84 | bwd_allreduce: 5.34 | step: 45.49
- 44%|████▍     | 2581/5800 [7:16:51<6:11:37,  6.93s/it]                                                       {'loss': 0.0308, 'grad_norm': 7.799673557281494, 'learning_rate': 2.4495514629374592e-05, 'epoch': 22.25}
- 44%|████▍     | 2581/5800 [7:16:51<6:11:37,  6.93s/it]score1 tensor([[0.4414],
-        [0.5156],
-        [0.3828],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5430, 0.4004, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:27,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 16:03:27,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2177.36 | bwd_microstep: 4636.17 | bwd_inner_microstep: 4631.22 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.48
-[2025-01-25 16:03:27,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2177.33 | bwd: 4636.19 | bwd_inner: 4631.22 | bwd_allreduce: 4.89 | step: 43.49
- 45%|████▍     | 2582/5800 [7:16:57<6:11:44,  6.93s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.7466018795967102, 'learning_rate': 2.4484631593658258e-05, 'epoch': 22.26}
- 45%|████▍     | 2582/5800 [7:16:57<6:11:44,  6.93s/it]score1 tensor([[0.5000],
-        [0.4570],
-        [0.4844],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4707, 0.5352, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:34,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 16:03:34,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4634.37 | bwd_inner_microstep: 4625.83 | bwd_allreduce_microstep: 8.18 | step_microstep: 46.81
-[2025-01-25 16:03:34,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.95 | bwd: 4634.43 | bwd_inner: 4625.83 | bwd_allreduce: 8.41 | step: 46.80
- 45%|████▍     | 2583/5800 [7:17:04<6:11:32,  6.93s/it]                                                       {'loss': 0.0269, 'grad_norm': 7.887701511383057, 'learning_rate': 2.447374715955598e-05, 'epoch': 22.27}
- 45%|████▍     | 2583/5800 [7:17:04<6:11:32,  6.93s/it]score1 tensor([[0.3984],
-        [0.3672],
-        [0.5000],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.3652, 0.5547, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:41,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 16:03:41,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.61 | bwd_microstep: 4630.06 | bwd_inner_microstep: 4625.27 | bwd_allreduce_microstep: 4.71 | step_microstep: 44.26
-[2025-01-25 16:03:41,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.52 | bwd: 4630.08 | bwd_inner: 4625.27 | bwd_allreduce: 4.75 | step: 44.27
- 45%|████▍     | 2584/5800 [7:17:11<6:11:37,  6.93s/it]                                                       {'loss': 0.0273, 'grad_norm': 0.43575212359428406, 'learning_rate': 2.4462861330461714e-05, 'epoch': 22.28}
- 45%|████▍     | 2584/5800 [7:17:11<6:11:37,  6.93s/it]score1 tensor([[0.5078],
-        [0.5625],
-        [0.5195],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5625, 0.5312, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:48,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 16:03:48,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.70 | bwd_microstep: 4583.18 | bwd_inner_microstep: 4577.97 | bwd_allreduce_microstep: 5.10 | step_microstep: 42.19
-[2025-01-25 16:03:48,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4583.21 | bwd_inner: 4577.97 | bwd_allreduce: 5.15 | step: 42.19
- 45%|████▍     | 2585/5800 [7:17:18<6:10:32,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 6.54498291015625, 'learning_rate': 2.4451974109769844e-05, 'epoch': 22.28}
- 45%|████▍     | 2585/5800 [7:17:18<6:10:32,  6.92s/it]score1 tensor([[0.5195],
-        [0.6094],
-        [0.5391],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.6172, 0.5547, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:03:55,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 16:03:55,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.62 | bwd_microstep: 4638.95 | bwd_inner_microstep: 4633.88 | bwd_allreduce_microstep: 4.99 | step_microstep: 51.54
-[2025-01-25 16:03:55,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.58 | bwd: 4638.97 | bwd_inner: 4633.88 | bwd_allreduce: 5.03 | step: 51.55
- 45%|████▍     | 2586/5800 [7:17:25<6:10:59,  6.93s/it]                                                       {'loss': 0.0078, 'grad_norm': 4.484655380249023, 'learning_rate': 2.44410855008752e-05, 'epoch': 22.29}
- 45%|████▍     | 2586/5800 [7:17:25<6:10:59,  6.93s/it]score1 tensor([[0.5508],
-        [0.6367],
-        [0.4180],
-        [0.3066]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6250, 0.4121, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:02,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 16:04:02,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.02 | bwd_microstep: 4630.47 | bwd_inner_microstep: 4625.14 | bwd_allreduce_microstep: 5.23 | step_microstep: 44.64
-[2025-01-25 16:04:02,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4630.50 | bwd_inner: 4625.14 | bwd_allreduce: 5.28 | step: 44.65
- 45%|████▍     | 2587/5800 [7:17:32<6:10:49,  6.92s/it]                                                       {'loss': 0.0073, 'grad_norm': 4.8629326820373535, 'learning_rate': 2.4430195507173044e-05, 'epoch': 22.3}
- 45%|████▍     | 2587/5800 [7:17:32<6:10:49,  6.92s/it]score1 tensor([[0.4824],
-        [0.5195],
-        [0.4707],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5664, 0.4668, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:09,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.41 | optimizer_step: 4.36
-[2025-01-25 16:04:09,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.67 | bwd_microstep: 4642.46 | bwd_inner_microstep: 4637.36 | bwd_allreduce_microstep: 5.00 | step_microstep: 56.45
-[2025-01-25 16:04:09,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.64 | bwd: 4642.48 | bwd_inner: 4637.35 | bwd_allreduce: 5.06 | step: 56.46
- 45%|████▍     | 2588/5800 [7:17:39<6:11:29,  6.94s/it]                                                       {'loss': 0.0283, 'grad_norm': 4.057955741882324, 'learning_rate': 2.441930413205905e-05, 'epoch': 22.31}
- 45%|████▍     | 2588/5800 [7:17:39<6:11:29,  6.94s/it]score1 tensor([[0.5312],
-        [0.4062],
-        [0.3555],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4199, 0.3457, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:16,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:04:16,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.34 | bwd_microstep: 4632.87 | bwd_inner_microstep: 4628.01 | bwd_allreduce_microstep: 4.78 | step_microstep: 42.26
-[2025-01-25 16:04:16,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.29 | bwd: 4632.90 | bwd_inner: 4628.01 | bwd_allreduce: 4.82 | step: 42.27
- 45%|████▍     | 2589/5800 [7:17:46<6:11:04,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.5337867140769958, 'learning_rate': 2.440841137892935e-05, 'epoch': 22.32}
- 45%|████▍     | 2589/5800 [7:17:46<6:11:04,  6.93s/it]score1 tensor([[0.4609],
-        [0.5234],
-        [0.5430],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.5000, 0.5312, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:23,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:04:23,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.23 | bwd_microstep: 4631.85 | bwd_inner_microstep: 4626.46 | bwd_allreduce_microstep: 5.29 | step_microstep: 51.89
-[2025-01-25 16:04:23,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.20 | bwd: 4631.88 | bwd_inner: 4626.46 | bwd_allreduce: 5.34 | step: 51.90
- 45%|████▍     | 2590/5800 [7:17:53<6:10:46,  6.93s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.145829200744629, 'learning_rate': 2.4397517251180486e-05, 'epoch': 22.33}
- 45%|████▍     | 2590/5800 [7:17:53<6:10:46,  6.93s/it]score1 tensor([[0.5078],
-        [0.4512],
-        [0.6484],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4473, 0.6133, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:30,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 16:04:30,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.61 | bwd_microstep: 4639.64 | bwd_inner_microstep: 4630.19 | bwd_allreduce_microstep: 9.31 | step_microstep: 44.69
-[2025-01-25 16:04:30,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.43 | bwd: 4639.66 | bwd_inner: 4630.19 | bwd_allreduce: 9.39 | step: 44.71
- 45%|████▍     | 2591/5800 [7:18:00<6:10:38,  6.93s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.311188697814941, 'learning_rate': 2.438662175220944e-05, 'epoch': 22.34}
- 45%|████▍     | 2591/5800 [7:18:00<6:10:38,  6.93s/it]score1 tensor([[0.5977],
-        [0.6406],
-        [0.4570],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6094, 0.4375, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:37,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:04:37,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.71 | bwd_microstep: 4643.02 | bwd_inner_microstep: 4637.59 | bwd_allreduce_microstep: 5.32 | step_microstep: 45.76
-[2025-01-25 16:04:37,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.67 | bwd: 4643.05 | bwd_inner: 4637.59 | bwd_allreduce: 5.38 | step: 45.77
- 45%|████▍     | 2592/5800 [7:18:07<6:10:33,  6.93s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.423523902893066, 'learning_rate': 2.4375724885413616e-05, 'epoch': 22.34}
- 45%|████▍     | 2592/5800 [7:18:07<6:10:33,  6.93s/it]score1 tensor([[0.4961],
-        [0.4473],
-        [0.4922],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4141, 0.4629, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:44,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 16:04:44,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.37 | bwd_microstep: 4641.35 | bwd_inner_microstep: 4636.12 | bwd_allreduce_microstep: 5.13 | step_microstep: 44.43
-[2025-01-25 16:04:44,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.33 | bwd: 4641.38 | bwd_inner: 4636.12 | bwd_allreduce: 5.19 | step: 44.44
- 45%|████▍     | 2593/5800 [7:18:14<6:10:18,  6.93s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.274248123168945, 'learning_rate': 2.4364826654190852e-05, 'epoch': 22.35}
- 45%|████▍     | 2593/5800 [7:18:14<6:10:18,  6.93s/it]score1 tensor([[0.4570],
-        [0.5352],
-        [0.4707],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5312, 0.5391, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:51,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 16:04:51,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.28 | bwd_microstep: 4634.75 | bwd_inner_microstep: 4629.91 | bwd_allreduce_microstep: 4.76 | step_microstep: 48.67
-[2025-01-25 16:04:51,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.25 | bwd: 4634.77 | bwd_inner: 4629.91 | bwd_allreduce: 4.80 | step: 48.69
- 45%|████▍     | 2594/5800 [7:18:21<6:10:13,  6.93s/it]                                                       {'loss': 0.0259, 'grad_norm': 0.4917689859867096, 'learning_rate': 2.4353927061939397e-05, 'epoch': 22.36}
- 45%|████▍     | 2594/5800 [7:18:21<6:10:13,  6.93s/it]score1 tensor([[0.3906],
-        [0.3359],
-        [0.4336],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.3926, 0.4512, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:04:58,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 16:04:58,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.93 | bwd_microstep: 4640.23 | bwd_inner_microstep: 4635.08 | bwd_allreduce_microstep: 5.06 | step_microstep: 47.68
-[2025-01-25 16:04:58,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.89 | bwd: 4640.26 | bwd_inner: 4635.08 | bwd_allreduce: 5.10 | step: 47.68
- 45%|████▍     | 2595/5800 [7:18:28<6:10:15,  6.93s/it]                                                       {'loss': 0.0303, 'grad_norm': 7.723413467407227, 'learning_rate': 2.4343026112057934e-05, 'epoch': 22.37}
- 45%|████▍     | 2595/5800 [7:18:28<6:10:15,  6.93s/it]score1 tensor([[0.3965],
-        [0.4277],
-        [0.4863],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4766, 0.4824, 0.3340], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:05,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:05:05,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.16 | bwd_microstep: 4637.93 | bwd_inner_microstep: 4632.60 | bwd_allreduce_microstep: 5.21 | step_microstep: 44.94
-[2025-01-25 16:05:05,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.12 | bwd: 4637.96 | bwd_inner: 4632.60 | bwd_allreduce: 5.27 | step: 44.95
- 45%|████▍     | 2596/5800 [7:18:34<6:10:05,  6.93s/it]                                                       {'loss': 0.0356, 'grad_norm': 0.38904818892478943, 'learning_rate': 2.4332123807945575e-05, 'epoch': 22.38}
- 45%|████▍     | 2596/5800 [7:18:34<6:10:05,  6.93s/it]score1 tensor([[0.4785],
-        [0.4590],
-        [0.4102],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5000, 0.4844, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0518, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:11,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 16:05:11,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.68 | bwd_microstep: 4631.54 | bwd_inner_microstep: 4625.06 | bwd_allreduce_microstep: 6.41 | step_microstep: 46.37
-[2025-01-25 16:05:11,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.65 | bwd: 4631.57 | bwd_inner: 4625.06 | bwd_allreduce: 6.44 | step: 46.37
- 45%|████▍     | 2597/5800 [7:18:41<6:09:44,  6.93s/it]                                                       {'loss': 0.0518, 'grad_norm': 8.232277870178223, 'learning_rate': 2.4321220153001846e-05, 'epoch': 22.39}
- 45%|████▍     | 2597/5800 [7:18:41<6:09:44,  6.93s/it]score1 tensor([[0.4863],
-        [0.4434],
-        [0.4219],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4453, 0.4844, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:18,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:05:18,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.84 | bwd_microstep: 4591.75 | bwd_inner_microstep: 4586.75 | bwd_allreduce_microstep: 4.91 | step_microstep: 46.03
-[2025-01-25 16:05:18,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.81 | bwd: 4591.78 | bwd_inner: 4586.75 | bwd_allreduce: 4.95 | step: 46.04
- 45%|████▍     | 2598/5800 [7:18:48<6:08:51,  6.91s/it]                                                       {'loss': 0.0293, 'grad_norm': 5.85737419128418, 'learning_rate': 2.431031515062669e-05, 'epoch': 22.4}
- 45%|████▍     | 2598/5800 [7:18:48<6:08:51,  6.91s/it]score1 tensor([[0.3223],
-        [0.4082],
-        [0.4375],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4160, 0.4473, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:25,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 16:05:25,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4632.58 | bwd_inner_microstep: 4627.50 | bwd_allreduce_microstep: 5.01 | step_microstep: 45.65
-[2025-01-25 16:05:25,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.67 | bwd: 4632.61 | bwd_inner: 4627.50 | bwd_allreduce: 5.05 | step: 45.66
- 45%|████▍     | 2599/5800 [7:18:55<6:08:51,  6.91s/it]                                                       {'loss': 0.0264, 'grad_norm': 7.894259929656982, 'learning_rate': 2.4299408804220485e-05, 'epoch': 22.41}
- 45%|████▍     | 2599/5800 [7:18:55<6:08:51,  6.91s/it]score1 tensor([[0.3809],
-        [0.4648],
-        [0.5312],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4941, 0.5352, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:32,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:05:32,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.23 | bwd_microstep: 4627.67 | bwd_inner_microstep: 4621.74 | bwd_allreduce_microstep: 5.81 | step_microstep: 45.45
-[2025-01-25 16:05:32,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.20 | bwd: 4627.69 | bwd_inner: 4621.74 | bwd_allreduce: 5.87 | step: 45.46
- 45%|████▍     | 2600/5800 [7:19:02<6:08:46,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 0.8019312620162964, 'learning_rate': 2.4288501117184012e-05, 'epoch': 22.41}
- 45%|████▍     | 2600/5800 [7:19:02<6:08:46,  6.91s/it]score1 tensor([[0.5430],
-        [0.7188],
-        [0.3691],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.7070, 0.3711, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:39,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 16:05:39,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.24 | bwd_microstep: 4629.05 | bwd_inner_microstep: 4623.85 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.03
-[2025-01-25 16:05:39,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4629.07 | bwd_inner: 4623.85 | bwd_allreduce: 5.15 | step: 44.03
- 45%|████▍     | 2601/5800 [7:19:09<6:08:50,  6.92s/it]                                                       {'loss': 0.0181, 'grad_norm': 1.333519458770752, 'learning_rate': 2.4277592092918475e-05, 'epoch': 22.42}
- 45%|████▍     | 2601/5800 [7:19:09<6:08:50,  6.92s/it]score1 tensor([[0.4434],
-        [0.4883],
-        [0.6914],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5195, 0.6797, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:46,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 16:05:46,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.33 | bwd_microstep: 4634.04 | bwd_inner_microstep: 4629.05 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.92
-[2025-01-25 16:05:46,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.29 | bwd: 4634.06 | bwd_inner: 4629.05 | bwd_allreduce: 4.94 | step: 41.93
- 45%|████▍     | 2602/5800 [7:19:16<6:08:55,  6.92s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.714759588241577, 'learning_rate': 2.42666817348255e-05, 'epoch': 22.43}
- 45%|████▍     | 2602/5800 [7:19:16<6:08:55,  6.92s/it]score1 tensor([[0.4609],
-        [0.5547],
-        [0.5781],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5781, 0.5586, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:05:53,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 16:05:53,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4627.83 | bwd_inner_microstep: 4622.67 | bwd_allreduce_microstep: 5.09 | step_microstep: 45.93
-[2025-01-25 16:05:53,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.44 | bwd: 4627.86 | bwd_inner: 4622.67 | bwd_allreduce: 5.12 | step: 45.93
- 45%|████▍     | 2603/5800 [7:19:23<6:08:45,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.4934215247631073, 'learning_rate': 2.425577004630713e-05, 'epoch': 22.44}
- 45%|████▍     | 2603/5800 [7:19:23<6:08:45,  6.92s/it]score1 tensor([[0.4219],
-        [0.4590],
-        [0.4805],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3887, 0.4668, 0.4453, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:00,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 16:06:00,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.27 | bwd_microstep: 4627.89 | bwd_inner_microstep: 4623.10 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.47
-[2025-01-25 16:06:00,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4627.91 | bwd_inner: 4623.10 | bwd_allreduce: 4.75 | step: 43.49
- 45%|████▍     | 2604/5800 [7:19:30<6:08:37,  6.92s/it]                                                       {'loss': 0.0405, 'grad_norm': 4.1376447677612305, 'learning_rate': 2.4244857030765813e-05, 'epoch': 22.45}
- 45%|████▍     | 2604/5800 [7:19:30<6:08:37,  6.92s/it]score1 tensor([[0.6680],
-        [0.3828],
-        [0.4453],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.3672, 0.4609, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:07,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:06:07,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.02 | bwd_microstep: 4634.07 | bwd_inner_microstep: 4629.01 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.97
-[2025-01-25 16:06:07,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.98 | bwd: 4634.10 | bwd_inner: 4629.01 | bwd_allreduce: 5.02 | step: 42.97
- 45%|████▍     | 2605/5800 [7:19:37<6:08:27,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 4.725247383117676, 'learning_rate': 2.423394269160442e-05, 'epoch': 22.46}
- 45%|████▍     | 2605/5800 [7:19:37<6:08:27,  6.92s/it]score1 tensor([[0.5117],
-        [0.4629],
-        [0.4902],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4395, 0.4844, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:14,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 16:06:14,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.40 | bwd_microstep: 4623.10 | bwd_inner_microstep: 4618.17 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.69
-[2025-01-25 16:06:14,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.38 | bwd: 4623.13 | bwd_inner: 4618.16 | bwd_allreduce: 4.89 | step: 44.70
- 45%|████▍     | 2606/5800 [7:19:44<6:08:21,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 3.73730731010437, 'learning_rate': 2.4223027032226235e-05, 'epoch': 22.47}
- 45%|████▍     | 2606/5800 [7:19:44<6:08:21,  6.92s/it]score1 tensor([[0.5391],
-        [0.5195],
-        [0.5469],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6055, 0.5469, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:21,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 16:06:21,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4584.99 | bwd_inner_microstep: 4579.36 | bwd_allreduce_microstep: 5.53 | step_microstep: 41.72
-[2025-01-25 16:06:21,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.41 | bwd: 4585.01 | bwd_inner: 4579.36 | bwd_allreduce: 5.58 | step: 41.73
- 45%|████▍     | 2607/5800 [7:19:51<6:07:26,  6.90s/it]                                                       {'loss': 0.0303, 'grad_norm': 2.2918217182159424, 'learning_rate': 2.421211005603494e-05, 'epoch': 22.47}
- 45%|████▍     | 2607/5800 [7:19:51<6:07:26,  6.90s/it]score1 tensor([[0.3164],
-        [0.5820],
-        [0.6133],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.6133, 0.6094, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:27,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 16:06:27,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.83 | bwd_microstep: 4632.52 | bwd_inner_microstep: 4627.41 | bwd_allreduce_microstep: 5.01 | step_microstep: 46.56
-[2025-01-25 16:06:27,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.79 | bwd: 4632.54 | bwd_inner: 4627.41 | bwd_allreduce: 5.06 | step: 46.57
- 45%|████▍     | 2608/5800 [7:19:57<6:07:37,  6.91s/it]                                                       {'loss': 0.0117, 'grad_norm': 3.870344400405884, 'learning_rate': 2.4201191766434645e-05, 'epoch': 22.48}
- 45%|████▍     | 2608/5800 [7:19:57<6:07:37,  6.91s/it]score1 tensor([[0.5234],
-        [0.5078],
-        [0.5742],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4980, 0.6016, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:34,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.37
-[2025-01-25 16:06:34,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.26 | bwd_microstep: 4630.77 | bwd_inner_microstep: 4625.63 | bwd_allreduce_microstep: 5.06 | step_microstep: 44.53
-[2025-01-25 16:06:34,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.23 | bwd: 4630.80 | bwd_inner: 4625.63 | bwd_allreduce: 5.10 | step: 44.54
- 45%|████▍     | 2609/5800 [7:20:04<6:07:37,  6.91s/it]                                                       {'loss': 0.021, 'grad_norm': 0.59539794921875, 'learning_rate': 2.4190272166829862e-05, 'epoch': 22.49}
- 45%|████▍     | 2609/5800 [7:20:04<6:07:37,  6.91s/it]score1 tensor([[0.6055],
-        [0.5234],
-        [0.5117],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5508, 0.5156, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:41,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 16:06:41,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.64 | bwd_microstep: 4634.64 | bwd_inner_microstep: 4629.58 | bwd_allreduce_microstep: 4.97 | step_microstep: 45.02
-[2025-01-25 16:06:41,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.61 | bwd: 4634.66 | bwd_inner: 4629.58 | bwd_allreduce: 5.01 | step: 45.03
- 45%|████▌     | 2610/5800 [7:20:11<6:07:47,  6.92s/it]                                                       {'loss': 0.0122, 'grad_norm': 8.505681037902832, 'learning_rate': 2.417935126062551e-05, 'epoch': 22.5}
- 45%|████▌     | 2610/5800 [7:20:11<6:07:47,  6.92s/it]score1 tensor([[0.6367],
-        [0.5469],
-        [0.4453],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5430, 0.4141, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:48,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 16:06:48,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.43 | bwd_microstep: 4635.68 | bwd_inner_microstep: 4630.17 | bwd_allreduce_microstep: 5.39 | step_microstep: 46.65
-[2025-01-25 16:06:48,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.40 | bwd: 4635.71 | bwd_inner: 4630.17 | bwd_allreduce: 5.46 | step: 46.67
- 45%|████▌     | 2611/5800 [7:20:18<6:07:43,  6.92s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.5799654722213745, 'learning_rate': 2.4168429051226906e-05, 'epoch': 22.51}
- 45%|████▌     | 2611/5800 [7:20:18<6:07:43,  6.92s/it]score1 tensor([[0.3574],
-        [0.4648],
-        [0.4883],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4629, 0.5664, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:06:55,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 16:06:55,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.96 | bwd_microstep: 4624.16 | bwd_inner_microstep: 4618.95 | bwd_allreduce_microstep: 5.11 | step_microstep: 43.81
-[2025-01-25 16:06:55,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.93 | bwd: 4624.19 | bwd_inner: 4618.95 | bwd_allreduce: 5.16 | step: 43.81
- 45%|████▌     | 2612/5800 [7:20:25<6:07:38,  6.92s/it]                                                       {'loss': 0.0273, 'grad_norm': 3.9698143005371094, 'learning_rate': 2.4157505542039806e-05, 'epoch': 22.52}
- 45%|████▌     | 2612/5800 [7:20:25<6:07:38,  6.92s/it]score1 tensor([[0.5469],
-        [0.5039],
-        [0.4238],
-        [0.3340]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4785, 0.4277, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:02,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:07:02,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4636.93 | bwd_inner_microstep: 4631.19 | bwd_allreduce_microstep: 5.63 | step_microstep: 44.30
-[2025-01-25 16:07:02,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4636.95 | bwd_inner: 4631.19 | bwd_allreduce: 5.69 | step: 44.31
- 45%|████▌     | 2613/5800 [7:20:32<6:07:33,  6.92s/it]                                                       {'loss': 0.0112, 'grad_norm': 4.016478061676025, 'learning_rate': 2.414658073647034e-05, 'epoch': 22.53}
- 45%|████▌     | 2613/5800 [7:20:32<6:07:33,  6.92s/it]score1 tensor([[0.4336],
-        [0.5859],
-        [0.6523],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.6484, 0.6016, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:09,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 16:07:09,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.14 | bwd_microstep: 4628.90 | bwd_inner_microstep: 4623.67 | bwd_allreduce_microstep: 5.14 | step_microstep: 45.65
-[2025-01-25 16:07:09,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.10 | bwd: 4628.92 | bwd_inner: 4623.67 | bwd_allreduce: 5.18 | step: 45.65
- 45%|████▌     | 2614/5800 [7:20:39<6:07:31,  6.92s/it]                                                       {'loss': 0.0308, 'grad_norm': 3.7896552085876465, 'learning_rate': 2.4135654637925044e-05, 'epoch': 22.53}
- 45%|████▌     | 2614/5800 [7:20:39<6:07:31,  6.92s/it]score1 tensor([[0.4355],
-        [0.4824],
-        [0.4336],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4297, 0.4551, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:16,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 16:07:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.34 | bwd_microstep: 4638.63 | bwd_inner_microstep: 4633.48 | bwd_allreduce_microstep: 5.03 | step_microstep: 43.47
-[2025-01-25 16:07:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.31 | bwd: 4638.65 | bwd_inner: 4633.48 | bwd_allreduce: 5.10 | step: 43.48
- 45%|████▌     | 2615/5800 [7:20:46<6:07:29,  6.92s/it]                                                       {'loss': 0.0347, 'grad_norm': 0.6846346855163574, 'learning_rate': 2.4124727249810877e-05, 'epoch': 22.54}
- 45%|████▌     | 2615/5800 [7:20:46<6:07:29,  6.92s/it]score1 tensor([[0.5508],
-        [0.4414],
-        [0.4453],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4180, 0.4609, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:23,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 7.65
-[2025-01-25 16:07:23,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4626.12 | bwd_inner_microstep: 4621.18 | bwd_allreduce_microstep: 4.84 | step_microstep: 47.59
-[2025-01-25 16:07:23,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.87 | bwd: 4626.14 | bwd_inner: 4621.18 | bwd_allreduce: 4.89 | step: 47.59
- 45%|████▌     | 2616/5800 [7:20:53<6:07:18,  6.92s/it]                                                       {'loss': 0.0273, 'grad_norm': 4.201282978057861, 'learning_rate': 2.4113798575535185e-05, 'epoch': 22.55}
- 45%|████▌     | 2616/5800 [7:20:53<6:07:18,  6.92s/it]score1 tensor([[0.4805],
-        [0.4883],
-        [0.6953],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5156, 0.6953, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:30,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.06 | optimizer_step: 4.37
-[2025-01-25 16:07:30,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.29 | bwd_microstep: 4579.47 | bwd_inner_microstep: 4574.21 | bwd_allreduce_microstep: 5.14 | step_microstep: 45.65
-[2025-01-25 16:07:30,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4579.50 | bwd_inner: 4574.21 | bwd_allreduce: 5.20 | step: 45.66
- 45%|████▌     | 2617/5800 [7:21:00<6:06:16,  6.90s/it]                                                       {'loss': 0.0156, 'grad_norm': 1.9379295110702515, 'learning_rate': 2.410286861850571e-05, 'epoch': 22.56}
- 45%|████▌     | 2617/5800 [7:21:00<6:06:16,  6.90s/it]score1 tensor([[0.5039],
-        [0.4883],
-        [0.4883],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4922, 0.4902, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:37,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 16:07:37,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4635.50 | bwd_inner_microstep: 4630.65 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.36
-[2025-01-25 16:07:37,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.00 | bwd: 4635.52 | bwd_inner: 4630.65 | bwd_allreduce: 4.80 | step: 43.36
- 45%|████▌     | 2618/5800 [7:21:07<6:06:37,  6.91s/it]                                                       {'loss': 0.0317, 'grad_norm': 8.4932861328125, 'learning_rate': 2.4091937382130617e-05, 'epoch': 22.57}
- 45%|████▌     | 2618/5800 [7:21:07<6:06:37,  6.91s/it]score1 tensor([[0.5195],
-        [0.5000],
-        [0.4551],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4941, 0.4277, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:44,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 16:07:44,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.33 | bwd_microstep: 4635.69 | bwd_inner_microstep: 4630.47 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.99
-[2025-01-25 16:07:44,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.30 | bwd: 4635.71 | bwd_inner: 4630.47 | bwd_allreduce: 5.17 | step: 45.00
- 45%|████▌     | 2619/5800 [7:21:14<6:06:38,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 3.9780189990997314, 'learning_rate': 2.408100486981844e-05, 'epoch': 22.58}
- 45%|████▌     | 2619/5800 [7:21:14<6:06:38,  6.92s/it]score1 tensor([[0.4824],
-        [0.4688],
-        [0.5312],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4355, 0.5469, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:50,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 16:07:50,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4633.59 | bwd_inner_microstep: 4628.41 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.05
-[2025-01-25 16:07:50,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.26 | bwd: 4633.62 | bwd_inner: 4628.41 | bwd_allreduce: 5.14 | step: 43.06
- 45%|████▌     | 2620/5800 [7:21:20<6:06:32,  6.92s/it]                                                       {'loss': 0.0239, 'grad_norm': 4.00063419342041, 'learning_rate': 2.4070071084978136e-05, 'epoch': 22.59}
- 45%|████▌     | 2620/5800 [7:21:20<6:06:32,  6.92s/it]score1 tensor([[0.5391],
-        [0.4414],
-        [0.4551],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4160, 0.5195, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:07:57,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.29 | optimizer_step: 4.37
-[2025-01-25 16:07:57,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.35 | bwd_microstep: 4639.46 | bwd_inner_microstep: 4633.80 | bwd_allreduce_microstep: 5.54 | step_microstep: 50.16
-[2025-01-25 16:07:57,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.32 | bwd: 4639.49 | bwd_inner: 4633.80 | bwd_allreduce: 5.60 | step: 50.17
- 45%|████▌     | 2621/5800 [7:21:27<6:06:34,  6.92s/it]                                                       {'loss': 0.0317, 'grad_norm': 0.4138611853122711, 'learning_rate': 2.4059136031019044e-05, 'epoch': 22.59}
- 45%|████▌     | 2621/5800 [7:21:27<6:06:34,  6.92s/it]score1 tensor([[0.5742],
-        [0.5820],
-        [0.4941],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.5781, 0.4961, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:04,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 16:08:04,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.97 | bwd_microstep: 4629.76 | bwd_inner_microstep: 4624.01 | bwd_allreduce_microstep: 5.63 | step_microstep: 51.27
-[2025-01-25 16:08:04,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.91 | bwd: 4629.79 | bwd_inner: 4624.01 | bwd_allreduce: 5.69 | step: 51.27
- 45%|████▌     | 2622/5800 [7:21:34<6:06:37,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.43419623374938965, 'learning_rate': 2.4048199711350905e-05, 'epoch': 22.6}
- 45%|████▌     | 2622/5800 [7:21:34<6:06:37,  6.92s/it]score1 tensor([[0.4238],
-        [0.4512],
-        [0.6094],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4629, 0.5625, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:11,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 16:08:11,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.23 | bwd_microstep: 4627.56 | bwd_inner_microstep: 4622.19 | bwd_allreduce_microstep: 5.24 | step_microstep: 49.19
-[2025-01-25 16:08:11,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.20 | bwd: 4627.58 | bwd_inner: 4622.19 | bwd_allreduce: 5.31 | step: 49.20
- 45%|████▌     | 2623/5800 [7:21:41<6:06:37,  6.92s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.1998467445373535, 'learning_rate': 2.4037262129383856e-05, 'epoch': 22.61}
- 45%|████▌     | 2623/5800 [7:21:41<6:06:37,  6.92s/it]score1 tensor([[0.5430],
-        [0.5938],
-        [0.5273],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5977, 0.5625, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:18,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 16:08:18,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.42 | bwd_microstep: 4624.64 | bwd_inner_microstep: 4619.49 | bwd_allreduce_microstep: 5.05 | step_microstep: 43.28
-[2025-01-25 16:08:18,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.40 | bwd: 4624.66 | bwd_inner: 4619.49 | bwd_allreduce: 5.10 | step: 43.28
- 45%|████▌     | 2624/5800 [7:21:48<6:06:24,  6.92s/it]                                                       {'loss': 0.0176, 'grad_norm': 8.468588829040527, 'learning_rate': 2.4026323288528424e-05, 'epoch': 22.62}
- 45%|████▌     | 2624/5800 [7:21:48<6:06:24,  6.92s/it]score1 tensor([[0.5625],
-        [0.4941],
-        [0.5352],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4141, 0.5273, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:25,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 16:08:25,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.10 | bwd_microstep: 4635.62 | bwd_inner_microstep: 4630.39 | bwd_allreduce_microstep: 5.09 | step_microstep: 44.40
-[2025-01-25 16:08:25,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.06 | bwd: 4635.64 | bwd_inner: 4630.39 | bwd_allreduce: 5.17 | step: 44.41
- 45%|████▌     | 2625/5800 [7:21:55<6:06:18,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 4.3072404861450195, 'learning_rate': 2.4015383192195527e-05, 'epoch': 22.63}
- 45%|████▌     | 2625/5800 [7:21:55<6:06:18,  6.92s/it]score1 tensor([[0.5586],
-        [0.5078],
-        [0.5312],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5469, 0.5703, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:32,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 16:08:32,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.55 | bwd_microstep: 4626.00 | bwd_inner_microstep: 4620.85 | bwd_allreduce_microstep: 5.05 | step_microstep: 45.75
-[2025-01-25 16:08:32,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.51 | bwd: 4626.02 | bwd_inner: 4620.84 | bwd_allreduce: 5.10 | step: 45.78
- 45%|████▌     | 2626/5800 [7:22:02<6:06:16,  6.92s/it]                                                       {'loss': 0.0283, 'grad_norm': 4.49595308303833, 'learning_rate': 2.4004441843796474e-05, 'epoch': 22.64}
- 45%|████▌     | 2626/5800 [7:22:02<6:06:16,  6.92s/it]score1 tensor([[0.4453],
-        [0.4590],
-        [0.3730],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4551, 0.3809, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:39,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 16:08:39,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.19 | bwd_microstep: 4625.00 | bwd_inner_microstep: 4620.23 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.06
-[2025-01-25 16:08:39,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.15 | bwd: 4625.02 | bwd_inner: 4620.23 | bwd_allreduce: 4.73 | step: 43.07
- 45%|████▌     | 2627/5800 [7:22:09<6:05:53,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 3.8456919193267822, 'learning_rate': 2.3993499246742968e-05, 'epoch': 22.65}
- 45%|████▌     | 2627/5800 [7:22:09<6:05:53,  6.92s/it]score1 tensor([[0.5742],
-        [0.3945],
-        [0.4844],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.3867, 0.4805, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:46,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 16:08:46,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.79 | bwd_microstep: 4581.91 | bwd_inner_microstep: 4576.72 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.03
-[2025-01-25 16:08:46,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.76 | bwd: 4581.93 | bwd_inner: 4576.72 | bwd_allreduce: 5.14 | step: 43.04
- 45%|████▌     | 2628/5800 [7:22:16<6:05:08,  6.91s/it]                                                       {'loss': 0.0195, 'grad_norm': 1.724326252937317, 'learning_rate': 2.3982555404447112e-05, 'epoch': 22.66}
- 45%|████▌     | 2628/5800 [7:22:16<6:05:08,  6.91s/it]score1 tensor([[0.4316],
-        [0.6523],
-        [0.6680],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6133, 0.6484, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:08:53,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 16:08:53,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.83 | bwd_microstep: 4626.41 | bwd_inner_microstep: 4620.84 | bwd_allreduce_microstep: 5.45 | step_microstep: 43.94
-[2025-01-25 16:08:53,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.80 | bwd: 4626.43 | bwd_inner: 4620.84 | bwd_allreduce: 5.52 | step: 43.96
- 45%|████▌     | 2629/5800 [7:22:23<6:05:07,  6.91s/it]                                                       {'loss': 0.0244, 'grad_norm': 4.9323248863220215, 'learning_rate': 2.3971610320321372e-05, 'epoch': 22.66}
- 45%|████▌     | 2629/5800 [7:22:23<6:05:07,  6.91s/it]score1 tensor([[0.5156],
-        [0.5742],
-        [0.4980],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5508, 0.5352, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:00,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 16:09:00,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.53 | bwd_microstep: 4633.20 | bwd_inner_microstep: 4628.44 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.10
-[2025-01-25 16:09:00,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4633.22 | bwd_inner: 4628.44 | bwd_allreduce: 4.72 | step: 42.11
- 45%|████▌     | 2630/5800 [7:22:30<6:05:07,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 4.307908058166504, 'learning_rate': 2.396066399777863e-05, 'epoch': 22.67}
- 45%|████▌     | 2630/5800 [7:22:30<6:05:07,  6.91s/it]score1 tensor([[0.5234],
-        [0.4902],
-        [0.5078],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5195, 0.5078, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:07,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 16:09:07,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.80 | bwd_microstep: 4583.55 | bwd_inner_microstep: 4578.84 | bwd_allreduce_microstep: 4.59 | step_microstep: 42.44
-[2025-01-25 16:09:07,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.77 | bwd: 4583.57 | bwd_inner: 4578.84 | bwd_allreduce: 4.65 | step: 42.44
- 45%|████▌     | 2631/5800 [7:22:37<6:04:32,  6.90s/it]                                                       {'loss': 0.0176, 'grad_norm': 6.053647994995117, 'learning_rate': 2.394971644023212e-05, 'epoch': 22.68}
- 45%|████▌     | 2631/5800 [7:22:37<6:04:32,  6.90s/it]score1 tensor([[0.5547],
-        [0.4961],
-        [0.4648],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5273, 0.4805, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:13,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 16:09:13,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.30 | bwd_microstep: 4632.24 | bwd_inner_microstep: 4627.64 | bwd_allreduce_microstep: 4.53 | step_microstep: 46.63
-[2025-01-25 16:09:13,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.27 | bwd: 4632.27 | bwd_inner: 4627.64 | bwd_allreduce: 4.56 | step: 46.64
- 45%|████▌     | 2632/5800 [7:22:43<6:04:43,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 4.322622776031494, 'learning_rate': 2.3938767651095495e-05, 'epoch': 22.69}
- 45%|████▌     | 2632/5800 [7:22:43<6:04:43,  6.91s/it]score1 tensor([[0.6055],
-        [0.5703],
-        [0.4414],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5469, 0.4844, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:20,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:09:20,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.93 | bwd_microstep: 4631.00 | bwd_inner_microstep: 4626.20 | bwd_allreduce_microstep: 4.70 | step_microstep: 50.34
-[2025-01-25 16:09:20,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.90 | bwd: 4631.03 | bwd_inner: 4626.20 | bwd_allreduce: 4.75 | step: 50.34
- 45%|████▌     | 2633/5800 [7:22:50<6:04:48,  6.91s/it]                                                       {'loss': 0.0205, 'grad_norm': 0.7426488399505615, 'learning_rate': 2.3927817633782765e-05, 'epoch': 22.7}
- 45%|████▌     | 2633/5800 [7:22:50<6:04:48,  6.91s/it]score1 tensor([[0.4395],
-        [0.5703],
-        [0.5469],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5625, 0.5664, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:27,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 16:09:27,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.30 | bwd_microstep: 4620.59 | bwd_inner_microstep: 4615.74 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.07
-[2025-01-25 16:09:27,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.26 | bwd: 4620.61 | bwd_inner: 4615.74 | bwd_allreduce: 4.79 | step: 43.08
- 45%|████▌     | 2634/5800 [7:22:57<6:04:28,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 3.923754930496216, 'learning_rate': 2.3916866391708352e-05, 'epoch': 22.71}
- 45%|████▌     | 2634/5800 [7:22:57<6:04:28,  6.91s/it]score1 tensor([[0.3965],
-        [0.4609],
-        [0.4453],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5039, 0.4414, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:34,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 16:09:34,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4631.49 | bwd_inner_microstep: 4626.59 | bwd_allreduce_microstep: 4.81 | step_microstep: 45.79
-[2025-01-25 16:09:34,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.57 | bwd: 4631.52 | bwd_inner: 4626.59 | bwd_allreduce: 4.85 | step: 45.80
- 45%|████▌     | 2635/5800 [7:23:04<6:04:29,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 4.110798358917236, 'learning_rate': 2.3905913928287032e-05, 'epoch': 22.72}
- 45%|████▌     | 2635/5800 [7:23:04<6:04:29,  6.91s/it]score1 tensor([[0.5820],
-        [0.4902],
-        [0.4141],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5117, 0.4180, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:41,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.37
-[2025-01-25 16:09:41,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.09 | bwd_microstep: 4623.42 | bwd_inner_microstep: 4618.55 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.22
-[2025-01-25 16:09:41,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.06 | bwd: 4623.44 | bwd_inner: 4618.55 | bwd_allreduce: 4.83 | step: 46.23
- 45%|████▌     | 2636/5800 [7:23:11<6:04:31,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 3.94100022315979, 'learning_rate': 2.3894960246933975e-05, 'epoch': 22.72}
- 45%|████▌     | 2636/5800 [7:23:11<6:04:31,  6.91s/it]score1 tensor([[0.5742],
-        [0.4082],
-        [0.4121],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4043, 0.4180, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:48,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 16:09:48,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.76 | bwd_microstep: 4633.85 | bwd_inner_microstep: 4628.95 | bwd_allreduce_microstep: 4.78 | step_microstep: 45.63
-[2025-01-25 16:09:48,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.71 | bwd: 4633.91 | bwd_inner: 4628.95 | bwd_allreduce: 4.82 | step: 45.64
- 45%|████▌     | 2637/5800 [7:23:18<6:04:33,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 4.251731872558594, 'learning_rate': 2.3884005351064735e-05, 'epoch': 22.73}
- 45%|████▌     | 2637/5800 [7:23:18<6:04:33,  6.92s/it]score1 tensor([[0.4844],
-        [0.5273],
-        [0.6328],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5469, 0.6055, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0513, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:09:55,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 16:09:55,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.56 | bwd_microstep: 4570.05 | bwd_inner_microstep: 4565.22 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.59
-[2025-01-25 16:09:55,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.52 | bwd: 4570.07 | bwd_inner: 4565.22 | bwd_allreduce: 4.79 | step: 42.60
- 45%|████▌     | 2638/5800 [7:23:25<6:03:29,  6.90s/it]                                                       {'loss': 0.0513, 'grad_norm': 2.2876195907592773, 'learning_rate': 2.3873049244095228e-05, 'epoch': 22.74}
- 45%|████▌     | 2638/5800 [7:23:25<6:03:29,  6.90s/it]score1 tensor([[0.4961],
-        [0.4453],
-        [0.5703],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3906, 0.5781, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:02,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:10:02,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.16 | bwd_microstep: 4621.95 | bwd_inner_microstep: 4617.13 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.73
-[2025-01-25 16:10:02,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.12 | bwd: 4621.97 | bwd_inner: 4617.13 | bwd_allreduce: 4.78 | step: 43.74
- 46%|████▌     | 2639/5800 [7:23:32<6:03:30,  6.90s/it]                                                       {'loss': 0.0347, 'grad_norm': 4.068386554718018, 'learning_rate': 2.3862091929441764e-05, 'epoch': 22.75}
- 46%|████▌     | 2639/5800 [7:23:32<6:03:30,  6.90s/it]score1 tensor([[0.5703],
-        [0.4570],
-        [0.4629],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4453, 0.4023, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:09,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.37
-[2025-01-25 16:10:09,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.55 | bwd_microstep: 4624.39 | bwd_inner_microstep: 4619.37 | bwd_allreduce_microstep: 4.90 | step_microstep: 45.21
-[2025-01-25 16:10:09,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.52 | bwd: 4624.41 | bwd_inner: 4619.37 | bwd_allreduce: 4.96 | step: 45.22
- 46%|████▌     | 2640/5800 [7:23:39<6:03:38,  6.90s/it]                                                       {'loss': 0.0396, 'grad_norm': 8.50450325012207, 'learning_rate': 2.385113341052102e-05, 'epoch': 22.76}
- 46%|████▌     | 2640/5800 [7:23:39<6:03:38,  6.90s/it]score1 tensor([[0.5391],
-        [0.4922],
-        [0.4805],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4492, 0.4590, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:16,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 16:10:16,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.92 | bwd_microstep: 4632.06 | bwd_inner_microstep: 4627.36 | bwd_allreduce_microstep: 4.62 | step_microstep: 44.92
-[2025-01-25 16:10:16,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.89 | bwd: 4632.09 | bwd_inner: 4627.36 | bwd_allreduce: 4.66 | step: 44.94
- 46%|████▌     | 2641/5800 [7:23:46<6:03:35,  6.91s/it]                                                       {'loss': 0.0356, 'grad_norm': 3.7444684505462646, 'learning_rate': 2.3840173690750058e-05, 'epoch': 22.77}
- 46%|████▌     | 2641/5800 [7:23:46<6:03:35,  6.91s/it]score1 tensor([[0.4863],
-        [0.4629],
-        [0.4199],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4570, 0.4434, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:23,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:10:23,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.92 | bwd_microstep: 4635.36 | bwd_inner_microstep: 4630.54 | bwd_allreduce_microstep: 4.73 | step_microstep: 41.64
-[2025-01-25 16:10:23,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.88 | bwd: 4635.39 | bwd_inner: 4630.54 | bwd_allreduce: 4.77 | step: 41.65
- 46%|████▌     | 2642/5800 [7:23:52<6:03:32,  6.91s/it]                                                       {'loss': 0.0259, 'grad_norm': 4.101752758026123, 'learning_rate': 2.3829212773546305e-05, 'epoch': 22.78}
- 46%|████▌     | 2642/5800 [7:23:52<6:03:32,  6.91s/it]score1 tensor([[0.3613],
-        [0.6328],
-        [0.5703],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.6641, 0.5586, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:29,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 16:10:29,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.40 | bwd_microstep: 4633.19 | bwd_inner_microstep: 4628.15 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.81
-[2025-01-25 16:10:29,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.36 | bwd: 4633.21 | bwd_inner: 4628.15 | bwd_allreduce: 5.00 | step: 43.82
- 46%|████▌     | 2643/5800 [7:23:59<6:03:34,  6.91s/it]                                                       {'loss': 0.0151, 'grad_norm': 3.801436424255371, 'learning_rate': 2.3818250662327572e-05, 'epoch': 22.78}
- 46%|████▌     | 2643/5800 [7:23:59<6:03:34,  6.91s/it]score1 tensor([[0.5820],
-        [0.5742],
-        [0.6211],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6172, 0.6875, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:36,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 16:10:36,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.14 | bwd_microstep: 4633.45 | bwd_inner_microstep: 4628.70 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.12
-[2025-01-25 16:10:36,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.10 | bwd: 4633.47 | bwd_inner: 4628.70 | bwd_allreduce: 4.70 | step: 42.14
- 46%|████▌     | 2644/5800 [7:24:06<6:03:28,  6.91s/it]                                                       {'loss': 0.0347, 'grad_norm': 0.5517410635948181, 'learning_rate': 2.3807287360512028e-05, 'epoch': 22.79}
- 46%|████▌     | 2644/5800 [7:24:06<6:03:28,  6.91s/it]score1 tensor([[0.5195],
-        [0.5469],
-        [0.4766],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.6172, 0.4258, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0366, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:43,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 16:10:43,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.65 | bwd_microstep: 4621.89 | bwd_inner_microstep: 4616.41 | bwd_allreduce_microstep: 5.36 | step_microstep: 49.47
-[2025-01-25 16:10:43,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.61 | bwd: 4621.91 | bwd_inner: 4616.41 | bwd_allreduce: 5.41 | step: 49.48
- 46%|████▌     | 2645/5800 [7:24:13<6:03:17,  6.91s/it]                                                       {'loss': 0.0366, 'grad_norm': 0.5179983377456665, 'learning_rate': 2.379632287151822e-05, 'epoch': 22.8}
- 46%|████▌     | 2645/5800 [7:24:13<6:03:17,  6.91s/it]score1 tensor([[0.4160],
-        [0.6250],
-        [0.5938],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.6445, 0.6484, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:50,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:10:50,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4623.25 | bwd_inner_microstep: 4618.43 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.96
-[2025-01-25 16:10:50,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.56 | bwd: 4623.27 | bwd_inner: 4618.44 | bwd_allreduce: 4.77 | step: 43.97
- 46%|████▌     | 2646/5800 [7:24:20<6:03:16,  6.91s/it]                                                       {'loss': 0.0234, 'grad_norm': 4.85726261138916, 'learning_rate': 2.378535719876507e-05, 'epoch': 22.81}
- 46%|████▌     | 2646/5800 [7:24:20<6:03:16,  6.91s/it]score1 tensor([[0.4023],
-        [0.5195],
-        [0.5000],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3613, 0.5195, 0.4980, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:10:57,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 16:10:57,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.84 | bwd_microstep: 4571.70 | bwd_inner_microstep: 4566.67 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.38
-[2025-01-25 16:10:57,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.81 | bwd: 4571.72 | bwd_inner: 4566.67 | bwd_allreduce: 4.99 | step: 45.39
- 46%|████▌     | 2647/5800 [7:24:27<6:02:20,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 5.832620143890381, 'learning_rate': 2.3774390345671865e-05, 'epoch': 22.82}
- 46%|████▌     | 2647/5800 [7:24:27<6:02:20,  6.90s/it]score1 tensor([[0.4688],
-        [0.5469],
-        [0.4375],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5625, 0.4336, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:04,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 16:11:04,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.85 | bwd_microstep: 4629.59 | bwd_inner_microstep: 4624.55 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.25
-[2025-01-25 16:11:04,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.79 | bwd: 4629.61 | bwd_inner: 4624.55 | bwd_allreduce: 4.99 | step: 43.25
- 46%|████▌     | 2648/5800 [7:24:34<6:02:26,  6.90s/it]                                                       {'loss': 0.0088, 'grad_norm': 3.8859777450561523, 'learning_rate': 2.3763422315658256e-05, 'epoch': 22.83}
- 46%|████▌     | 2648/5800 [7:24:34<6:02:26,  6.90s/it]score1 tensor([[0.4707],
-        [0.5156],
-        [0.5664],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5156, 0.5742, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:11,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 16:11:11,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.66 | bwd_microstep: 4578.11 | bwd_inner_microstep: 4573.30 | bwd_allreduce_microstep: 4.73 | step_microstep: 41.71
-[2025-01-25 16:11:11,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.62 | bwd: 4578.14 | bwd_inner: 4573.30 | bwd_allreduce: 4.77 | step: 41.72
- 46%|████▌     | 2649/5800 [7:24:41<6:01:48,  6.89s/it]                                                       {'loss': 0.0151, 'grad_norm': 2.32613205909729, 'learning_rate': 2.3752453112144274e-05, 'epoch': 22.84}
- 46%|████▌     | 2649/5800 [7:24:41<6:01:48,  6.89s/it]score1 tensor([[0.5234],
-        [0.4883],
-        [0.5000],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4785, 0.5000, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:18,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 16:11:18,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.49 | bwd_microstep: 4583.28 | bwd_inner_microstep: 4578.08 | bwd_allreduce_microstep: 5.09 | step_microstep: 42.33
-[2025-01-25 16:11:18,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.45 | bwd: 4583.30 | bwd_inner: 4578.08 | bwd_allreduce: 5.14 | step: 42.33
- 46%|████▌     | 2650/5800 [7:24:48<6:01:22,  6.88s/it]                                                       {'loss': 0.0181, 'grad_norm': 1.955883264541626, 'learning_rate': 2.3741482738550294e-05, 'epoch': 22.84}
- 46%|████▌     | 2650/5800 [7:24:48<6:01:22,  6.88s/it]score1 tensor([[0.5039],
-        [0.4805],
-        [0.4668],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5391, 0.4570, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:25,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:11:25,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.59 | bwd_microstep: 4571.38 | bwd_inner_microstep: 4566.47 | bwd_allreduce_microstep: 4.82 | step_microstep: 41.43
-[2025-01-25 16:11:25,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.56 | bwd: 4571.40 | bwd_inner: 4566.47 | bwd_allreduce: 4.87 | step: 41.43
- 46%|████▌     | 2651/5800 [7:24:54<6:00:45,  6.87s/it]                                                       {'loss': 0.0176, 'grad_norm': 1.9565905332565308, 'learning_rate': 2.373051119829708e-05, 'epoch': 22.85}
- 46%|████▌     | 2651/5800 [7:24:54<6:00:45,  6.87s/it]score1 tensor([[0.4844],
-        [0.6133],
-        [0.3613],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.6289, 0.3652, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:31,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 16:11:31,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.10 | bwd_microstep: 4622.23 | bwd_inner_microstep: 4617.31 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.83
-[2025-01-25 16:11:31,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.06 | bwd: 4622.26 | bwd_inner: 4617.31 | bwd_allreduce: 4.87 | step: 45.84
- 46%|████▌     | 2652/5800 [7:25:01<6:01:17,  6.89s/it]                                                       {'loss': 0.0195, 'grad_norm': 8.20009994506836, 'learning_rate': 2.371953849480574e-05, 'epoch': 22.86}
- 46%|████▌     | 2652/5800 [7:25:01<6:01:17,  6.89s/it]score1 tensor([[0.5664],
-        [0.5547],
-        [0.4062],
-        [0.2949]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5508, 0.3750, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0486, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:38,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 16:11:38,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.64 | bwd_microstep: 4623.72 | bwd_inner_microstep: 4618.94 | bwd_allreduce_microstep: 4.70 | step_microstep: 44.08
-[2025-01-25 16:11:38,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4623.74 | bwd_inner: 4618.94 | bwd_allreduce: 4.74 | step: 44.08
- 46%|████▌     | 2653/5800 [7:25:08<6:01:33,  6.89s/it]                                                       {'loss': 0.0486, 'grad_norm': 3.4960973262786865, 'learning_rate': 2.3708564631497764e-05, 'epoch': 22.87}
- 46%|████▌     | 2653/5800 [7:25:08<6:01:33,  6.89s/it]score1 tensor([[0.4668],
-        [0.3730],
-        [0.5664],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3789, 0.5703, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:45,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 16:11:45,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.99 | bwd_microstep: 4630.75 | bwd_inner_microstep: 4625.76 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.29
-[2025-01-25 16:11:45,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.96 | bwd: 4630.77 | bwd_inner: 4625.75 | bwd_allreduce: 4.95 | step: 43.29
- 46%|████▌     | 2654/5800 [7:25:15<6:01:48,  6.90s/it]                                                       {'loss': 0.019, 'grad_norm': 8.042450904846191, 'learning_rate': 2.369758961179498e-05, 'epoch': 22.88}
- 46%|████▌     | 2654/5800 [7:25:15<6:01:48,  6.90s/it]score1 tensor([[0.4727],
-        [0.4590],
-        [0.5938],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4648, 0.6484, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:52,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 16:11:52,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.70 | bwd_microstep: 4625.65 | bwd_inner_microstep: 4620.23 | bwd_allreduce_microstep: 5.31 | step_microstep: 46.70
-[2025-01-25 16:11:52,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.64 | bwd: 4625.68 | bwd_inner: 4620.23 | bwd_allreduce: 5.37 | step: 46.72
- 46%|████▌     | 2655/5800 [7:25:22<6:02:05,  6.91s/it]                                                       {'loss': 0.0312, 'grad_norm': 8.589287757873535, 'learning_rate': 2.3686613439119603e-05, 'epoch': 22.89}
- 46%|████▌     | 2655/5800 [7:25:22<6:02:05,  6.91s/it]score1 tensor([[0.3945],
-        [0.5000],
-        [0.5547],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5195, 0.6094, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:11:59,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 16:11:59,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.73 | bwd_microstep: 4589.07 | bwd_inner_microstep: 4583.64 | bwd_allreduce_microstep: 5.30 | step_microstep: 45.96
-[2025-01-25 16:11:59,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.70 | bwd: 4589.09 | bwd_inner: 4583.64 | bwd_allreduce: 5.37 | step: 45.97
- 46%|████▌     | 2656/5800 [7:25:29<6:01:39,  6.90s/it]                                                       {'loss': 0.0337, 'grad_norm': 6.04892635345459, 'learning_rate': 2.3675636116894185e-05, 'epoch': 22.9}
- 46%|████▌     | 2656/5800 [7:25:29<6:01:39,  6.90s/it]score1 tensor([[0.5156],
-        [0.5195],
-        [0.4512],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5469, 0.4473, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:06,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 16:12:06,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.28 | bwd_microstep: 4623.84 | bwd_inner_microstep: 4618.37 | bwd_allreduce_microstep: 5.39 | step_microstep: 44.14
-[2025-01-25 16:12:06,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.24 | bwd: 4623.87 | bwd_inner: 4618.37 | bwd_allreduce: 5.43 | step: 44.15
- 46%|████▌     | 2657/5800 [7:25:36<6:01:52,  6.91s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.49067991971969604, 'learning_rate': 2.3664657648541645e-05, 'epoch': 22.91}
- 46%|████▌     | 2657/5800 [7:25:36<6:01:52,  6.91s/it]score1 tensor([[0.4316],
-        [0.4668],
-        [0.5703],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4727, 0.5508, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:13,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 16:12:13,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.49 | bwd_microstep: 4630.29 | bwd_inner_microstep: 4625.27 | bwd_allreduce_microstep: 4.93 | step_microstep: 49.06
-[2025-01-25 16:12:13,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.46 | bwd: 4630.31 | bwd_inner: 4625.27 | bwd_allreduce: 4.97 | step: 49.07
- 46%|████▌     | 2658/5800 [7:25:43<6:02:12,  6.92s/it]                                                       {'loss': 0.0156, 'grad_norm': 0.5641546249389648, 'learning_rate': 2.3653678037485267e-05, 'epoch': 22.91}
- 46%|████▌     | 2658/5800 [7:25:43<6:02:12,  6.92s/it]score1 tensor([[0.4414],
-        [0.4785],
-        [0.4961],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4922, 0.5078, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:20,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 16:12:20,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.44 | bwd_microstep: 4625.50 | bwd_inner_microstep: 4619.15 | bwd_allreduce_microstep: 6.26 | step_microstep: 43.42
-[2025-01-25 16:12:20,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.37 | bwd: 4625.53 | bwd_inner: 4619.15 | bwd_allreduce: 6.30 | step: 43.43
- 46%|████▌     | 2659/5800 [7:25:50<6:02:02,  6.92s/it]                                                       {'loss': 0.0112, 'grad_norm': 4.063178539276123, 'learning_rate': 2.3642697287148674e-05, 'epoch': 22.92}
- 46%|████▌     | 2659/5800 [7:25:50<6:02:02,  6.92s/it]score1 tensor([[0.5898],
-        [0.5000],
-        [0.6328],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5273, 0.6523, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:27,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.79 | optimizer_step: 4.37
-[2025-01-25 16:12:27,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.28 | bwd_microstep: 4623.53 | bwd_inner_microstep: 4617.97 | bwd_allreduce_microstep: 5.45 | step_microstep: 57.64
-[2025-01-25 16:12:27,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.24 | bwd: 4623.55 | bwd_inner: 4617.97 | bwd_allreduce: 5.51 | step: 57.64
- 46%|████▌     | 2660/5800 [7:25:57<6:02:01,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 4.2219061851501465, 'learning_rate': 2.3631715400955867e-05, 'epoch': 22.93}
- 46%|████▌     | 2660/5800 [7:25:57<6:02:01,  6.92s/it]score1 tensor([[0.4805],
-        [0.3926],
-        [0.5391],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3672, 0.5625, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:34,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 16:12:34,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.10 | bwd_microstep: 4636.46 | bwd_inner_microstep: 4631.32 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.72
-[2025-01-25 16:12:34,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.06 | bwd: 4636.49 | bwd_inner: 4631.32 | bwd_allreduce: 5.08 | step: 42.72
- 46%|████▌     | 2661/5800 [7:26:04<6:02:02,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.7592261433601379, 'learning_rate': 2.362073238233118e-05, 'epoch': 22.94}
- 46%|████▌     | 2661/5800 [7:26:04<6:02:02,  6.92s/it]score1 tensor([[0.5312],
-        [0.5273],
-        [0.6445],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4902, 0.6016, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:41,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 16:12:41,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.89 | bwd_microstep: 4634.48 | bwd_inner_microstep: 4629.32 | bwd_allreduce_microstep: 5.05 | step_microstep: 46.18
-[2025-01-25 16:12:41,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.84 | bwd: 4634.50 | bwd_inner: 4629.32 | bwd_allreduce: 5.11 | step: 46.19
- 46%|████▌     | 2662/5800 [7:26:11<6:02:06,  6.92s/it]                                                       {'loss': 0.0356, 'grad_norm': 8.772687911987305, 'learning_rate': 2.3609748234699308e-05, 'epoch': 22.95}
- 46%|████▌     | 2662/5800 [7:26:11<6:02:06,  6.92s/it]score1 tensor([[0.3984],
-        [0.4473],
-        [0.4766],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4219, 0.4082, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:48,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.61 | optimizer_step: 4.36
-[2025-01-25 16:12:48,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.35 | bwd_microstep: 4630.43 | bwd_inner_microstep: 4625.07 | bwd_allreduce_microstep: 5.25 | step_microstep: 70.27
-[2025-01-25 16:12:48,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.31 | bwd: 4630.46 | bwd_inner: 4625.07 | bwd_allreduce: 5.31 | step: 70.38
- 46%|████▌     | 2663/5800 [7:26:18<6:02:43,  6.94s/it]                                                       {'loss': 0.0317, 'grad_norm': 4.519415855407715, 'learning_rate': 2.3598762961485307e-05, 'epoch': 22.96}
- 46%|████▌     | 2663/5800 [7:26:18<6:02:43,  6.94s/it]score1 tensor([[0.6562],
-        [0.4609],
-        [0.5742],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4688, 0.5312, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:12:55,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 16:12:55,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.24 | bwd_microstep: 4633.38 | bwd_inner_microstep: 4628.28 | bwd_allreduce_microstep: 4.98 | step_microstep: 47.09
-[2025-01-25 16:12:55,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.21 | bwd: 4633.40 | bwd_inner: 4628.28 | bwd_allreduce: 5.04 | step: 47.10
- 46%|████▌     | 2664/5800 [7:26:25<6:02:35,  6.94s/it]                                                       {'loss': 0.0239, 'grad_norm': 4.646256446838379, 'learning_rate': 2.3587776566114564e-05, 'epoch': 22.97}
- 46%|████▌     | 2664/5800 [7:26:25<6:02:35,  6.94s/it]score1 tensor([[0.4180],
-        [0.4277],
-        [0.4863],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4199, 0.4453, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:13:01,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 16:13:01,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.10 | bwd_microstep: 4627.44 | bwd_inner_microstep: 4619.23 | bwd_allreduce_microstep: 7.99 | step_microstep: 63.62
-[2025-01-25 16:13:01,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.06 | bwd: 4627.50 | bwd_inner: 4619.23 | bwd_allreduce: 8.12 | step: 63.62
- 46%|████▌     | 2665/5800 [7:26:31<6:02:21,  6.94s/it]                                                       {'loss': 0.0249, 'grad_norm': 8.172165870666504, 'learning_rate': 2.3576789052012838e-05, 'epoch': 22.97}
- 46%|████▌     | 2665/5800 [7:26:31<6:02:21,  6.94s/it]score1 tensor([[0.5469],
-        [0.6484],
-        [0.6758],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6094, 0.6367, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0669, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:13:08,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 16:13:08,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.69 | bwd_microstep: 4631.52 | bwd_inner_microstep: 4626.09 | bwd_allreduce_microstep: 5.30 | step_microstep: 46.51
-[2025-01-25 16:13:08,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.60 | bwd: 4631.57 | bwd_inner: 4626.09 | bwd_allreduce: 5.37 | step: 46.52
- 46%|████▌     | 2666/5800 [7:26:38<6:02:28,  6.94s/it]                                                       {'loss': 0.0669, 'grad_norm': 8.846030235290527, 'learning_rate': 2.3565800422606226e-05, 'epoch': 22.98}
- 46%|████▌     | 2666/5800 [7:26:38<6:02:28,  6.94s/it]score1 tensor([[0.6602],
-        [0.6094],
-        [0.6328],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5820, 0.6445, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:13:15,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 16:13:15,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.12 | bwd_microstep: 4626.38 | bwd_inner_microstep: 4620.97 | bwd_allreduce_microstep: 5.32 | step_microstep: 47.41
-[2025-01-25 16:13:15,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.03 | bwd: 4626.40 | bwd_inner: 4620.97 | bwd_allreduce: 5.37 | step: 47.42
- 46%|████▌     | 2667/5800 [7:26:45<6:01:58,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.5525822043418884, 'learning_rate': 2.3554810681321164e-05, 'epoch': 22.99}
- 46%|████▌     | 2667/5800 [7:26:45<6:01:58,  6.93s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:13:20,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 16:13:20,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 574.29 | bwd_microstep: 1221.53 | bwd_inner_microstep: 1216.89 | bwd_allreduce_microstep: 4.56 | step_microstep: 49.58
-[2025-01-25 16:13:20,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 574.24 | bwd: 1221.57 | bwd_inner: 1216.89 | bwd_allreduce: 4.59 | step: 49.59
- 46%|████▌     | 2668/5800 [7:26:50<5:24:27,  6.22s/it]                                                       {'loss': 0.0254, 'grad_norm': 7.286428451538086, 'learning_rate': 2.354381983158446e-05, 'epoch': 23.0}
- 46%|████▌     | 2668/5800 [7:26:50<5:24:27,  6.22s/it][2025-01-25 16:13:25,209] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:13:35,908] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:13:46,977] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:13:57,632] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4727],
-        [0.4570],
-        [0.4531],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4902, 0.4746, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:14:15,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.37
-[2025-01-25 16:14:15,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.89 | bwd_microstep: 4597.00 | bwd_inner_microstep: 4591.71 | bwd_allreduce_microstep: 5.21 | step_microstep: 47.29
-[2025-01-25 16:14:15,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.85 | bwd: 4597.02 | bwd_inner: 4591.71 | bwd_allreduce: 5.25 | step: 47.29
- 46%|████▌     | 2669/5800 [7:27:45<18:03:02, 20.75s/it]                                                        {'loss': 0.0283, 'grad_norm': 7.783639430999756, 'learning_rate': 2.353282787682323e-05, 'epoch': 23.01}
- 46%|████▌     | 2669/5800 [7:27:45<18:03:02, 20.75s/it]score1 tensor([[0.4727],
-        [0.6758],
-        [0.5273],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6953, 0.5664, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:14:21,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 16:14:21,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.14 | bwd_microstep: 4570.13 | bwd_inner_microstep: 4564.66 | bwd_allreduce_microstep: 5.39 | step_microstep: 48.13
-[2025-01-25 16:14:21,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.07 | bwd: 4570.15 | bwd_inner: 4564.66 | bwd_allreduce: 5.43 | step: 48.13
- 46%|████▌     | 2670/5800 [7:27:51<14:24:56, 16.58s/it]                                                        {'loss': 0.02, 'grad_norm': 4.779551029205322, 'learning_rate': 2.3521834820464978e-05, 'epoch': 23.02}
- 46%|████▌     | 2670/5800 [7:27:51<14:24:56, 16.58s/it]score1 tensor([[0.6484],
-        [0.4629],
-        [0.4980],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4980, 0.5156, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:14:28,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:14:28,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.25 | bwd_microstep: 4537.59 | bwd_inner_microstep: 4532.43 | bwd_allreduce_microstep: 5.05 | step_microstep: 43.35
-[2025-01-25 16:14:28,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.21 | bwd: 4537.61 | bwd_inner: 4532.43 | bwd_allreduce: 5.11 | step: 43.38
- 46%|████▌     | 2671/5800 [7:27:58<11:51:40, 13.65s/it]                                                        {'loss': 0.019, 'grad_norm': 6.493805885314941, 'learning_rate': 2.3510840665937515e-05, 'epoch': 23.03}
- 46%|████▌     | 2671/5800 [7:27:58<11:51:40, 13.65s/it]score1 tensor([[0.5508],
-        [0.6562],
-        [0.3516],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.6562, 0.4043, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:14:35,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:14:35,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.55 | bwd_microstep: 4540.44 | bwd_inner_microstep: 4535.63 | bwd_allreduce_microstep: 4.73 | step_microstep: 43.33
-[2025-01-25 16:14:35,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.50 | bwd: 4540.46 | bwd_inner: 4535.63 | bwd_allreduce: 4.77 | step: 43.34
- 46%|████▌     | 2672/5800 [7:28:05<10:04:21, 11.59s/it]                                                        {'loss': 0.0356, 'grad_norm': 5.970426559448242, 'learning_rate': 2.3499845416669013e-05, 'epoch': 23.03}
- 46%|████▌     | 2672/5800 [7:28:05<10:04:21, 11.59s/it]score1 tensor([[0.4570],
-        [0.4668],
-        [0.3809],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4980, 0.4160, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:14:42,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 16:14:42,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.08 | bwd_microstep: 4599.25 | bwd_inner_microstep: 4594.18 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.20
-[2025-01-25 16:14:42,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.04 | bwd: 4599.28 | bwd_inner: 4594.18 | bwd_allreduce: 5.01 | step: 43.21
- 46%|████▌     | 2673/5800 [7:28:12<8:50:17, 10.18s/it]                                                        {'loss': 0.0239, 'grad_norm': 7.664059162139893, 'learning_rate': 2.3488849076087986e-05, 'epoch': 23.04}
- 46%|████▌     | 2673/5800 [7:28:12<8:50:17, 10.18s/it]score1 tensor([[0.5664],
-        [0.4180],
-        [0.3867],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4414, 0.3789, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:14:49,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 16:14:49,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.44 | bwd_microstep: 4597.59 | bwd_inner_microstep: 4592.67 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.46
-[2025-01-25 16:14:49,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.37 | bwd: 4597.61 | bwd_inner: 4592.67 | bwd_allreduce: 4.88 | step: 43.47
- 46%|████▌     | 2674/5800 [7:28:19<7:58:32,  9.19s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.5145346522331238, 'learning_rate': 2.347785164762328e-05, 'epoch': 23.05}
- 46%|████▌     | 2674/5800 [7:28:19<7:58:32,  9.19s/it]score1 tensor([[0.5469],
-        [0.5117],
-        [0.5391],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4883, 0.4688, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:14:56,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:14:56,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.10 | bwd_microstep: 4608.36 | bwd_inner_microstep: 4603.61 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.96
-[2025-01-25 16:14:56,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.07 | bwd: 4608.38 | bwd_inner: 4603.61 | bwd_allreduce: 4.71 | step: 43.97
- 46%|████▌     | 2675/5800 [7:28:26<7:22:18,  8.49s/it]                                                       {'loss': 0.0352, 'grad_norm': 4.3595662117004395, 'learning_rate': 2.3466853134704085e-05, 'epoch': 23.06}
- 46%|████▌     | 2675/5800 [7:28:26<7:22:18,  8.49s/it]score1 tensor([[0.5938],
-        [0.3320],
-        [0.5039],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.3418, 0.4883, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:03,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:15:03,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.02 | bwd_microstep: 4621.60 | bwd_inner_microstep: 4616.54 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.97
-[2025-01-25 16:15:03,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.97 | bwd: 4621.63 | bwd_inner: 4616.54 | bwd_allreduce: 5.01 | step: 43.98
- 46%|████▌     | 2676/5800 [7:28:32<6:57:11,  8.01s/it]                                                       {'loss': 0.0146, 'grad_norm': 3.9233832359313965, 'learning_rate': 2.345585354075992e-05, 'epoch': 23.07}
- 46%|████▌     | 2676/5800 [7:28:32<6:57:11,  8.01s/it]score1 tensor([[0.6094],
-        [0.4961],
-        [0.6406],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4785, 0.6289, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:09,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.37
-[2025-01-25 16:15:09,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.33 | bwd_microstep: 4623.26 | bwd_inner_microstep: 4617.86 | bwd_allreduce_microstep: 5.30 | step_microstep: 46.52
-[2025-01-25 16:15:09,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.30 | bwd: 4623.28 | bwd_inner: 4617.86 | bwd_allreduce: 5.35 | step: 46.53
- 46%|████▌     | 2677/5800 [7:28:39<6:39:40,  7.68s/it]                                                       {'loss': 0.0166, 'grad_norm': 8.535804748535156, 'learning_rate': 2.3444852869220665e-05, 'epoch': 23.08}
- 46%|████▌     | 2677/5800 [7:28:39<6:39:40,  7.68s/it]score1 tensor([[0.4277],
-        [0.6797],
-        [0.4766],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6289, 0.4492, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:16,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:15:16,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.56 | bwd_microstep: 4610.89 | bwd_inner_microstep: 4605.70 | bwd_allreduce_microstep: 5.07 | step_microstep: 43.72
-[2025-01-25 16:15:16,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.53 | bwd: 4610.91 | bwd_inner: 4605.70 | bwd_allreduce: 5.13 | step: 43.73
- 46%|████▌     | 2678/5800 [7:28:46<6:27:10,  7.44s/it]                                                       {'loss': 0.041, 'grad_norm': 4.86095666885376, 'learning_rate': 2.3433851123516508e-05, 'epoch': 23.09}
- 46%|████▌     | 2678/5800 [7:28:46<6:27:10,  7.44s/it]score1 tensor([[0.4082],
-        [0.3008],
-        [0.5664],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.3105, 0.5273, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:23,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 16:15:23,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.22 | bwd_microstep: 4607.43 | bwd_inner_microstep: 4602.83 | bwd_allreduce_microstep: 4.53 | step_microstep: 41.87
-[2025-01-25 16:15:23,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.19 | bwd: 4607.46 | bwd_inner: 4602.83 | bwd_allreduce: 4.57 | step: 41.88
- 46%|████▌     | 2679/5800 [7:28:53<6:18:21,  7.27s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.6237310171127319, 'learning_rate': 2.342284830707799e-05, 'epoch': 23.09}
- 46%|████▌     | 2679/5800 [7:28:53<6:18:21,  7.27s/it]score1 tensor([[0.4688],
-        [0.4492],
-        [0.4492],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4297, 0.4336, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:30,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:15:30,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.09 | bwd_microstep: 4614.26 | bwd_inner_microstep: 4609.32 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.78
-[2025-01-25 16:15:30,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.05 | bwd: 4614.29 | bwd_inner: 4609.32 | bwd_allreduce: 4.89 | step: 42.80
- 46%|████▌     | 2680/5800 [7:29:00<6:12:10,  7.16s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.036240577697754, 'learning_rate': 2.3411844423335976e-05, 'epoch': 23.1}
- 46%|████▌     | 2680/5800 [7:29:00<6:12:10,  7.16s/it]score1 tensor([[0.6289],
-        [0.6641],
-        [0.3535],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6211, 0.3867, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:37,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 16:15:37,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.56 | bwd_microstep: 4616.00 | bwd_inner_microstep: 4610.97 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.58
-[2025-01-25 16:15:37,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.52 | bwd: 4616.02 | bwd_inner: 4610.97 | bwd_allreduce: 4.97 | step: 42.59
- 46%|████▌     | 2681/5800 [7:29:07<6:07:55,  7.08s/it]                                                       {'loss': 0.042, 'grad_norm': 5.127089977264404, 'learning_rate': 2.340083947572167e-05, 'epoch': 23.11}
- 46%|████▌     | 2681/5800 [7:29:07<6:07:55,  7.08s/it]score1 tensor([[0.5781],
-        [0.4395],
-        [0.4316],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4238, 0.4492, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:44,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 16:15:44,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.10 | bwd_microstep: 4578.08 | bwd_inner_microstep: 4573.44 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.99
-[2025-01-25 16:15:44,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.07 | bwd: 4578.10 | bwd_inner: 4573.44 | bwd_allreduce: 4.60 | step: 43.00
- 46%|████▌     | 2682/5800 [7:29:14<6:04:14,  7.01s/it]                                                       {'loss': 0.0122, 'grad_norm': 2.3008646965026855, 'learning_rate': 2.3389833467666607e-05, 'epoch': 23.12}
- 46%|████▌     | 2682/5800 [7:29:14<6:04:14,  7.01s/it]score1 tensor([[0.4648],
-        [0.4316],
-        [0.4414],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4629, 0.4297, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:51,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 16:15:51,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.04 | bwd_microstep: 4627.21 | bwd_inner_microstep: 4622.64 | bwd_allreduce_microstep: 4.50 | step_microstep: 42.76
-[2025-01-25 16:15:51,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.01 | bwd: 4627.24 | bwd_inner: 4622.64 | bwd_allreduce: 4.53 | step: 42.78
- 46%|████▋     | 2683/5800 [7:29:21<6:02:31,  6.98s/it]                                                       {'loss': 0.0176, 'grad_norm': 0.4837746024131775, 'learning_rate': 2.337882640260265e-05, 'epoch': 23.13}
- 46%|████▋     | 2683/5800 [7:29:21<6:02:31,  6.98s/it]score1 tensor([[0.5977],
-        [0.5195],
-        [0.5000],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6055, 0.5195, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:15:58,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 16:15:58,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.65 | bwd_microstep: 4617.37 | bwd_inner_microstep: 4612.61 | bwd_allreduce_microstep: 4.68 | step_microstep: 44.77
-[2025-01-25 16:15:58,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.60 | bwd: 4617.40 | bwd_inner: 4612.61 | bwd_allreduce: 4.72 | step: 44.78
- 46%|████▋     | 2684/5800 [7:29:28<6:01:03,  6.95s/it]                                                       {'loss': 0.041, 'grad_norm': 8.80809211730957, 'learning_rate': 2.336781828396199e-05, 'epoch': 23.14}
- 46%|████▋     | 2684/5800 [7:29:28<6:01:03,  6.95s/it]score1 tensor([[0.4199],
-        [0.4590],
-        [0.4883],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5312, 0.5117, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:05,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:16:05,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.53 | bwd_microstep: 4626.13 | bwd_inner_microstep: 4621.27 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.59
-[2025-01-25 16:16:05,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.49 | bwd: 4626.15 | bwd_inner: 4621.27 | bwd_allreduce: 4.81 | step: 41.61
- 46%|████▋     | 2685/5800 [7:29:34<6:00:09,  6.94s/it]                                                       {'loss': 0.041, 'grad_norm': 8.158891677856445, 'learning_rate': 2.3356809115177156e-05, 'epoch': 23.15}
- 46%|████▋     | 2685/5800 [7:29:34<6:00:09,  6.94s/it]score1 tensor([[0.3359],
-        [0.3926],
-        [0.3945],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4219, 0.4004, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:11,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 16:16:11,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4628.44 | bwd_inner_microstep: 4624.20 | bwd_allreduce_microstep: 4.18 | step_microstep: 39.66
-[2025-01-25 16:16:11,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4628.46 | bwd_inner: 4624.20 | bwd_allreduce: 4.21 | step: 39.67
- 46%|████▋     | 2686/5800 [7:29:41<5:59:33,  6.93s/it]                                                       {'loss': 0.0225, 'grad_norm': 7.183850288391113, 'learning_rate': 2.3345798899681e-05, 'epoch': 23.16}
- 46%|████▋     | 2686/5800 [7:29:41<5:59:33,  6.93s/it]score1 tensor([[0.3711],
-        [0.4219],
-        [0.4121],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4785, 0.4277, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:18,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:16:18,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.48 | bwd_microstep: 4628.04 | bwd_inner_microstep: 4623.08 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.41
-[2025-01-25 16:16:18,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.45 | bwd: 4628.07 | bwd_inner: 4623.08 | bwd_allreduce: 4.92 | step: 45.42
- 46%|████▋     | 2687/5800 [7:29:48<5:59:04,  6.92s/it]                                                       {'loss': 0.0381, 'grad_norm': 7.468924522399902, 'learning_rate': 2.3334787640906687e-05, 'epoch': 23.16}
- 46%|████▋     | 2687/5800 [7:29:48<5:59:04,  6.92s/it]score1 tensor([[0.4668],
-        [0.3750],
-        [0.4102],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4180, 0.4453, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:25,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 16:16:25,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.27 | bwd_microstep: 4630.77 | bwd_inner_microstep: 4625.90 | bwd_allreduce_microstep: 4.79 | step_microstep: 45.37
-[2025-01-25 16:16:25,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.24 | bwd: 4630.79 | bwd_inner: 4625.90 | bwd_allreduce: 4.83 | step: 45.39
- 46%|████▋     | 2688/5800 [7:29:55<5:58:39,  6.92s/it]                                                       {'loss': 0.0332, 'grad_norm': 7.612344741821289, 'learning_rate': 2.332377534228772e-05, 'epoch': 23.17}
- 46%|████▋     | 2688/5800 [7:29:55<5:58:39,  6.92s/it]score1 tensor([[0.5547],
-        [0.6602],
-        [0.4512],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6523, 0.4629, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:32,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 16:16:32,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.88 | bwd_microstep: 4621.82 | bwd_inner_microstep: 4616.93 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.46
-[2025-01-25 16:16:32,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.84 | bwd: 4621.84 | bwd_inner: 4616.93 | bwd_allreduce: 4.85 | step: 42.47
- 46%|████▋     | 2689/5800 [7:30:02<5:58:19,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 3.535914659500122, 'learning_rate': 2.3312762007257928e-05, 'epoch': 23.18}
- 46%|████▋     | 2689/5800 [7:30:02<5:58:19,  6.91s/it]score1 tensor([[0.6719],
-        [0.3711],
-        [0.5430],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3613, 0.5078, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:39,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:16:39,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.95 | bwd_microstep: 4624.59 | bwd_inner_microstep: 4619.80 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.74
-[2025-01-25 16:16:39,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4624.62 | bwd_inner: 4619.80 | bwd_allreduce: 4.74 | step: 41.74
- 46%|████▋     | 2690/5800 [7:30:09<5:58:06,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 8.332225799560547, 'learning_rate': 2.330174763925147e-05, 'epoch': 23.19}
- 46%|████▋     | 2690/5800 [7:30:09<5:58:06,  6.91s/it]score1 tensor([[0.5859],
-        [0.5898],
-        [0.4238],
-        [0.3262]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5742, 0.4707, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:46,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 16:16:46,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.60 | bwd_microstep: 4626.24 | bwd_inner_microstep: 4621.42 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.70
-[2025-01-25 16:16:46,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4626.27 | bwd_inner: 4621.42 | bwd_allreduce: 4.79 | step: 42.71
- 46%|████▋     | 2691/5800 [7:30:16<5:57:48,  6.91s/it]                                                       {'loss': 0.0283, 'grad_norm': 1.4012778997421265, 'learning_rate': 2.32907322417028e-05, 'epoch': 23.2}
- 46%|████▋     | 2691/5800 [7:30:16<5:57:48,  6.91s/it]score1 tensor([[0.4492],
-        [0.4805],
-        [0.5859],
-        [0.6875]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4512, 0.5508, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:16:53,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 16:16:53,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.51 | bwd_microstep: 4633.77 | bwd_inner_microstep: 4628.72 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.57
-[2025-01-25 16:16:53,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.46 | bwd: 4633.79 | bwd_inner: 4628.72 | bwd_allreduce: 5.00 | step: 42.59
- 46%|████▋     | 2692/5800 [7:30:23<5:57:55,  6.91s/it]                                                       {'loss': 0.0298, 'grad_norm': 8.688587188720703, 'learning_rate': 2.327971581804672e-05, 'epoch': 23.21}
- 46%|████▋     | 2692/5800 [7:30:23<5:57:55,  6.91s/it]score1 tensor([[0.6680],
-        [0.4355],
-        [0.7148],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4004, 0.6836, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:00,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:17:00,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.39 | bwd_microstep: 4625.27 | bwd_inner_microstep: 4620.62 | bwd_allreduce_microstep: 4.55 | step_microstep: 42.85
-[2025-01-25 16:17:00,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.35 | bwd: 4625.29 | bwd_inner: 4620.62 | bwd_allreduce: 4.60 | step: 42.85
- 46%|████▋     | 2693/5800 [7:30:30<5:57:56,  6.91s/it]                                                       {'loss': 0.0332, 'grad_norm': 4.809268474578857, 'learning_rate': 2.326869837171834e-05, 'epoch': 23.22}
- 46%|████▋     | 2693/5800 [7:30:30<5:57:56,  6.91s/it]score1 tensor([[0.4668],
-        [0.5625],
-        [0.5273],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5820, 0.5195, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:07,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 16:17:07,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4631.41 | bwd_inner_microstep: 4626.62 | bwd_allreduce_microstep: 4.70 | step_microstep: 50.44
-[2025-01-25 16:17:07,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.33 | bwd: 4631.44 | bwd_inner: 4626.62 | bwd_allreduce: 4.74 | step: 50.45
- 46%|████▋     | 2694/5800 [7:30:37<5:57:59,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 3.8409717082977295, 'learning_rate': 2.3257679906153094e-05, 'epoch': 23.22}
- 46%|████▋     | 2694/5800 [7:30:37<5:57:59,  6.92s/it]score1 tensor([[0.5977],
-        [0.5742],
-        [0.4102],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5742, 0.3789, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:14,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 16:17:14,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.25 | bwd_microstep: 4574.76 | bwd_inner_microstep: 4569.97 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.37
-[2025-01-25 16:17:14,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.19 | bwd: 4574.78 | bwd_inner: 4569.97 | bwd_allreduce: 4.75 | step: 41.38
- 46%|████▋     | 2695/5800 [7:30:44<5:56:57,  6.90s/it]                                                       {'loss': 0.0225, 'grad_norm': 6.5423455238342285, 'learning_rate': 2.3246660424786724e-05, 'epoch': 23.23}
- 46%|████▋     | 2695/5800 [7:30:44<5:56:57,  6.90s/it]score1 tensor([[0.6133],
-        [0.3301],
-        [0.4609],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.3086, 0.4160, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:20,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 16:17:20,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.76 | bwd_microstep: 4584.63 | bwd_inner_microstep: 4579.40 | bwd_allreduce_microstep: 5.14 | step_microstep: 57.58
-[2025-01-25 16:17:20,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.73 | bwd: 4584.66 | bwd_inner: 4579.40 | bwd_allreduce: 5.18 | step: 57.59
- 46%|████▋     | 2696/5800 [7:30:50<5:56:36,  6.89s/it]                                                       {'loss': 0.0312, 'grad_norm': 5.9655585289001465, 'learning_rate': 2.32356399310553e-05, 'epoch': 23.24}
- 46%|████▋     | 2696/5800 [7:30:50<5:56:36,  6.89s/it]score1 tensor([[0.4492],
-        [0.6562],
-        [0.4453],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.6562, 0.4258, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:27,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 16:17:27,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.39 | bwd_microstep: 4577.22 | bwd_inner_microstep: 4572.06 | bwd_allreduce_microstep: 5.07 | step_microstep: 46.64
-[2025-01-25 16:17:27,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.35 | bwd: 4577.25 | bwd_inner: 4572.06 | bwd_allreduce: 5.11 | step: 46.65
- 46%|████▋     | 2697/5800 [7:30:57<5:56:03,  6.88s/it]                                                       {'loss': 0.021, 'grad_norm': 2.0664961338043213, 'learning_rate': 2.3224618428395198e-05, 'epoch': 23.25}
- 46%|████▋     | 2697/5800 [7:30:57<5:56:03,  6.88s/it]score1 tensor([[0.5000],
-        [0.4453],
-        [0.5781],
-        [0.3574]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4141, 0.6055, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:34,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 16:17:34,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.64 | bwd_microstep: 4630.86 | bwd_inner_microstep: 4623.91 | bwd_allreduce_microstep: 6.72 | step_microstep: 64.37
-[2025-01-25 16:17:34,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4630.91 | bwd_inner: 4623.91 | bwd_allreduce: 6.83 | step: 64.36
- 47%|████▋     | 2698/5800 [7:31:04<5:56:53,  6.90s/it]                                                       {'loss': 0.0229, 'grad_norm': 3.5337607860565186, 'learning_rate': 2.3213595920243127e-05, 'epoch': 23.26}
- 47%|████▋     | 2698/5800 [7:31:04<5:56:53,  6.90s/it]score1 tensor([[0.5820],
-        [0.4668],
-        [0.5430],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5195, 0.6094, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0444, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:41,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.27 | optimizer_step: 4.56
-[2025-01-25 16:17:41,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.42 | bwd_microstep: 4626.46 | bwd_inner_microstep: 4621.05 | bwd_allreduce_microstep: 5.16 | step_microstep: 71.53
-[2025-01-25 16:17:41,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4626.52 | bwd_inner: 4621.05 | bwd_allreduce: 5.29 | step: 71.56
- 47%|████▋     | 2699/5800 [7:31:11<5:57:37,  6.92s/it]                                                       {'loss': 0.0444, 'grad_norm': 8.697514533996582, 'learning_rate': 2.3202572410036077e-05, 'epoch': 23.27}
- 47%|████▋     | 2699/5800 [7:31:11<5:57:37,  6.92s/it]score1 tensor([[0.4922],
-        [0.4238],
-        [0.4883],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4648, 0.5508, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:48,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.47 | optimizer_step: 4.53
-[2025-01-25 16:17:48,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4624.41 | bwd_inner_microstep: 4619.16 | bwd_allreduce_microstep: 5.17 | step_microstep: 56.41
-[2025-01-25 16:17:48,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.28 | bwd: 4624.44 | bwd_inner: 4619.16 | bwd_allreduce: 5.21 | step: 56.42
- 47%|████▋     | 2700/5800 [7:31:18<5:57:42,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 3.806985855102539, 'learning_rate': 2.3191547901211385e-05, 'epoch': 23.28}
- 47%|████▋     | 2700/5800 [7:31:18<5:57:42,  6.92s/it]score1 tensor([[0.4844],
-        [0.5430],
-        [0.5625],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5781, 0.5625, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:17:55,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 16:17:55,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.86 | bwd_microstep: 4585.62 | bwd_inner_microstep: 4580.50 | bwd_allreduce_microstep: 5.04 | step_microstep: 44.75
-[2025-01-25 16:17:55,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.83 | bwd: 4585.65 | bwd_inner: 4580.50 | bwd_allreduce: 5.08 | step: 44.76
- 47%|████▋     | 2701/5800 [7:31:25<5:57:02,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 6.343245983123779, 'learning_rate': 2.318052239720668e-05, 'epoch': 23.28}
- 47%|████▋     | 2701/5800 [7:31:25<5:57:02,  6.91s/it]score1 tensor([[0.3906],
-        [0.4531],
-        [0.4121],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5391, 0.4473, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:02,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:18:02,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4624.73 | bwd_inner_microstep: 4619.89 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.05
-[2025-01-25 16:18:02,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.25 | bwd: 4624.76 | bwd_inner: 4619.89 | bwd_allreduce: 4.80 | step: 43.06
- 47%|████▋     | 2702/5800 [7:31:32<5:56:57,  6.91s/it]                                                       {'loss': 0.0547, 'grad_norm': 7.456450462341309, 'learning_rate': 2.3169495901459905e-05, 'epoch': 23.29}
- 47%|████▋     | 2702/5800 [7:31:32<5:56:57,  6.91s/it]score1 tensor([[0.5664],
-        [0.5703],
-        [0.3672],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6484, 0.3457, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:09,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:18:09,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.68 | bwd_microstep: 4576.81 | bwd_inner_microstep: 4570.94 | bwd_allreduce_microstep: 5.79 | step_microstep: 46.09
-[2025-01-25 16:18:09,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.63 | bwd: 4576.83 | bwd_inner: 4570.93 | bwd_allreduce: 5.83 | step: 46.10
- 47%|████▋     | 2703/5800 [7:31:39<5:55:59,  6.90s/it]                                                       {'loss': 0.0327, 'grad_norm': 2.5220279693603516, 'learning_rate': 2.315846841740931e-05, 'epoch': 23.3}
- 47%|████▋     | 2703/5800 [7:31:39<5:55:59,  6.90s/it]score1 tensor([[0.5391],
-        [0.3535],
-        [0.5195],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3555, 0.5312, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:16,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 16:18:16,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4624.45 | bwd_inner_microstep: 4619.58 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.30
-[2025-01-25 16:18:16,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4624.47 | bwd_inner: 4619.58 | bwd_allreduce: 4.82 | step: 43.30
- 47%|████▋     | 2704/5800 [7:31:46<5:56:04,  6.90s/it]                                                       {'loss': 0.0298, 'grad_norm': 8.145196914672852, 'learning_rate': 2.3147439948493462e-05, 'epoch': 23.31}
- 47%|████▋     | 2704/5800 [7:31:46<5:56:04,  6.90s/it]score1 tensor([[0.4570],
-        [0.3711],
-        [0.4180],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.2812, 0.4551, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0503, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:23,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 16:18:23,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.37 | bwd_microstep: 4631.54 | bwd_inner_microstep: 4624.24 | bwd_allreduce_microstep: 7.20 | step_microstep: 50.22
-[2025-01-25 16:18:23,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.33 | bwd: 4631.56 | bwd_inner: 4624.24 | bwd_allreduce: 7.25 | step: 50.30
- 47%|████▋     | 2705/5800 [7:31:53<5:56:17,  6.91s/it]                                                       {'loss': 0.0503, 'grad_norm': 3.9993855953216553, 'learning_rate': 2.3136410498151225e-05, 'epoch': 23.32}
- 47%|████▋     | 2705/5800 [7:31:53<5:56:17,  6.91s/it]score1 tensor([[0.4629],
-        [0.5234],
-        [0.5391],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4941, 0.5625, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:30,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 16:18:30,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.59 | bwd_microstep: 4624.71 | bwd_inner_microstep: 4619.42 | bwd_allreduce_microstep: 5.21 | step_microstep: 46.73
-[2025-01-25 16:18:30,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.55 | bwd: 4624.73 | bwd_inner: 4619.42 | bwd_allreduce: 5.24 | step: 46.74
- 47%|████▋     | 2706/5800 [7:31:59<5:56:10,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 0.7717819213867188, 'learning_rate': 2.3125380069821772e-05, 'epoch': 23.33}
- 47%|████▋     | 2706/5800 [7:31:59<5:56:10,  6.91s/it]score1 tensor([[0.6016],
-        [0.5312],
-        [0.3809],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5977, 0.3652, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0415, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:36,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 16:18:36,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.96 | bwd_microstep: 4635.90 | bwd_inner_microstep: 4628.15 | bwd_allreduce_microstep: 7.57 | step_microstep: 50.83
-[2025-01-25 16:18:36,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.92 | bwd: 4635.96 | bwd_inner: 4628.15 | bwd_allreduce: 7.66 | step: 50.81
- 47%|████▋     | 2707/5800 [7:32:06<5:56:20,  6.91s/it]                                                       {'loss': 0.0415, 'grad_norm': 4.498579978942871, 'learning_rate': 2.311434866694458e-05, 'epoch': 23.34}
- 47%|████▋     | 2707/5800 [7:32:06<5:56:20,  6.91s/it]score1 tensor([[0.5938],
-        [0.4336],
-        [0.5430],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4570, 0.5469, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:43,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 16:18:43,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.17 | bwd_microstep: 4638.39 | bwd_inner_microstep: 4632.86 | bwd_allreduce_microstep: 5.41 | step_microstep: 45.46
-[2025-01-25 16:18:43,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.14 | bwd: 4638.42 | bwd_inner: 4632.86 | bwd_allreduce: 5.48 | step: 45.48
- 47%|████▋     | 2708/5800 [7:32:13<5:56:19,  6.91s/it]                                                       {'loss': 0.0146, 'grad_norm': 4.388448238372803, 'learning_rate': 2.3103316292959437e-05, 'epoch': 23.34}
- 47%|████▋     | 2708/5800 [7:32:13<5:56:19,  6.91s/it]score1 tensor([[0.5039],
-        [0.6211],
-        [0.4902],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6367, 0.4766, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:50,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 16:18:50,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.55 | bwd_microstep: 4572.27 | bwd_inner_microstep: 4567.14 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.00
-[2025-01-25 16:18:50,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4572.29 | bwd_inner: 4567.14 | bwd_allreduce: 5.08 | step: 43.02
- 47%|████▋     | 2709/5800 [7:32:20<5:55:13,  6.90s/it]                                                       {'loss': 0.0112, 'grad_norm': 1.937095284461975, 'learning_rate': 2.309228295130643e-05, 'epoch': 23.35}
- 47%|████▋     | 2709/5800 [7:32:20<5:55:13,  6.90s/it]score1 tensor([[0.4707],
-        [0.6406],
-        [0.4258],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.6484, 0.4043, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:18:57,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 16:18:57,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.92 | bwd_microstep: 4621.12 | bwd_inner_microstep: 4616.20 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.94
-[2025-01-25 16:18:57,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.85 | bwd: 4621.14 | bwd_inner: 4616.20 | bwd_allreduce: 4.86 | step: 46.95
- 47%|████▋     | 2710/5800 [7:32:27<5:55:19,  6.90s/it]                                                       {'loss': 0.0239, 'grad_norm': 0.968021035194397, 'learning_rate': 2.3081248645425936e-05, 'epoch': 23.36}
- 47%|████▋     | 2710/5800 [7:32:27<5:55:19,  6.90s/it]score1 tensor([[0.6328],
-        [0.5547],
-        [0.4512],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5234, 0.4453, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:04,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 16:19:04,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4623.61 | bwd_inner_microstep: 4618.39 | bwd_allreduce_microstep: 5.12 | step_microstep: 44.61
-[2025-01-25 16:19:04,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4623.64 | bwd_inner: 4618.39 | bwd_allreduce: 5.18 | step: 44.63
- 47%|████▋     | 2711/5800 [7:32:34<5:55:16,  6.90s/it]                                                       {'loss': 0.0464, 'grad_norm': 8.88355827331543, 'learning_rate': 2.307021337875866e-05, 'epoch': 23.37}
- 47%|████▋     | 2711/5800 [7:32:34<5:55:16,  6.90s/it]score1 tensor([[0.5195],
-        [0.6172],
-        [0.6758],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6211, 0.7070, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:11,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 16:19:11,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.84 | bwd_microstep: 4622.84 | bwd_inner_microstep: 4617.79 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.59
-[2025-01-25 16:19:11,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.80 | bwd: 4622.87 | bwd_inner: 4617.79 | bwd_allreduce: 5.00 | step: 42.60
- 47%|████▋     | 2712/5800 [7:32:41<5:55:08,  6.90s/it]                                                       {'loss': 0.0332, 'grad_norm': 1.105872631072998, 'learning_rate': 2.3059177154745572e-05, 'epoch': 23.38}
- 47%|████▋     | 2712/5800 [7:32:41<5:55:08,  6.90s/it]score1 tensor([[0.5078],
-        [0.5234],
-        [0.5391],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4453, 0.5039, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:18,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 16:19:18,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.37 | bwd_microstep: 4633.24 | bwd_inner_microstep: 4627.74 | bwd_allreduce_microstep: 5.36 | step_microstep: 45.41
-[2025-01-25 16:19:18,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.33 | bwd: 4633.26 | bwd_inner: 4627.74 | bwd_allreduce: 5.44 | step: 45.44
- 47%|████▋     | 2713/5800 [7:32:48<5:55:13,  6.90s/it]                                                       {'loss': 0.0527, 'grad_norm': 8.536380767822266, 'learning_rate': 2.3048139976827967e-05, 'epoch': 23.39}
- 47%|████▋     | 2713/5800 [7:32:48<5:55:13,  6.90s/it]score1 tensor([[0.5156],
-        [0.6367],
-        [0.6484],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6094, 0.5625, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:25,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 16:19:25,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.97 | bwd_microstep: 4627.97 | bwd_inner_microstep: 4622.79 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.83
-[2025-01-25 16:19:25,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.94 | bwd: 4627.99 | bwd_inner: 4622.79 | bwd_allreduce: 5.13 | step: 43.84
- 47%|████▋     | 2714/5800 [7:32:55<5:55:15,  6.91s/it]                                                       {'loss': 0.0459, 'grad_norm': 8.983440399169922, 'learning_rate': 2.303710184844743e-05, 'epoch': 23.4}
- 47%|████▋     | 2714/5800 [7:32:55<5:55:15,  6.91s/it]score1 tensor([[0.5156],
-        [0.4219],
-        [0.4922],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4141, 0.4668, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:32,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 16:19:32,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4634.02 | bwd_inner_microstep: 4629.16 | bwd_allreduce_microstep: 4.78 | step_microstep: 44.40
-[2025-01-25 16:19:32,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.64 | bwd: 4634.04 | bwd_inner: 4629.16 | bwd_allreduce: 4.82 | step: 44.40
- 47%|████▋     | 2715/5800 [7:33:02<5:55:14,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 3.7524852752685547, 'learning_rate': 2.3026062773045835e-05, 'epoch': 23.41}
- 47%|████▋     | 2715/5800 [7:33:02<5:55:14,  6.91s/it]score1 tensor([[0.5938],
-        [0.5156],
-        [0.4961],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4844, 0.4609, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0386, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:39,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:19:39,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.64 | bwd_microstep: 4628.06 | bwd_inner_microstep: 4622.59 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.53
-[2025-01-25 16:19:39,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.60 | bwd: 4628.08 | bwd_inner: 4622.59 | bwd_allreduce: 5.41 | step: 44.54
- 47%|████▋     | 2716/5800 [7:33:09<5:55:07,  6.91s/it]                                                       {'loss': 0.0386, 'grad_norm': 8.28612232208252, 'learning_rate': 2.3015022754065358e-05, 'epoch': 23.41}
- 47%|████▋     | 2716/5800 [7:33:09<5:55:07,  6.91s/it]score1 tensor([[0.5312],
-        [0.4277],
-        [0.5195],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.3750, 0.5000, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:45,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:19:45,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.13 | bwd_microstep: 4625.78 | bwd_inner_microstep: 4620.69 | bwd_allreduce_microstep: 4.98 | step_microstep: 46.88
-[2025-01-25 16:19:45,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.10 | bwd: 4625.81 | bwd_inner: 4620.69 | bwd_allreduce: 5.04 | step: 46.90
- 47%|████▋     | 2717/5800 [7:33:15<5:54:57,  6.91s/it]                                                       {'loss': 0.041, 'grad_norm': 3.9220991134643555, 'learning_rate': 2.300398179494848e-05, 'epoch': 23.42}
- 47%|████▋     | 2717/5800 [7:33:15<5:54:57,  6.91s/it]score1 tensor([[0.4727],
-        [0.5195],
-        [0.4727],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5117, 0.5117, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:52,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:19:52,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.75 | bwd_microstep: 4628.69 | bwd_inner_microstep: 4623.41 | bwd_allreduce_microstep: 5.16 | step_microstep: 45.01
-[2025-01-25 16:19:52,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.72 | bwd: 4628.71 | bwd_inner: 4623.41 | bwd_allreduce: 5.23 | step: 45.02
- 47%|████▋     | 2718/5800 [7:33:22<5:54:51,  6.91s/it]                                                       {'loss': 0.0249, 'grad_norm': 0.7829583287239075, 'learning_rate': 2.2992939899137947e-05, 'epoch': 23.43}
- 47%|████▋     | 2718/5800 [7:33:22<5:54:51,  6.91s/it]score1 tensor([[0.5664],
-        [0.3340],
-        [0.2754],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.3672, 0.1787, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0481, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:19:59,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 16:19:59,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.03 | bwd_microstep: 4624.92 | bwd_inner_microstep: 4618.86 | bwd_allreduce_microstep: 5.94 | step_microstep: 46.00
-[2025-01-25 16:19:59,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.99 | bwd: 4624.95 | bwd_inner: 4618.86 | bwd_allreduce: 6.01 | step: 46.01
- 47%|████▋     | 2719/5800 [7:33:29<5:54:47,  6.91s/it]                                                       {'loss': 0.0481, 'grad_norm': 0.6247061491012573, 'learning_rate': 2.2981897070076827e-05, 'epoch': 23.44}
- 47%|████▋     | 2719/5800 [7:33:29<5:54:47,  6.91s/it]score1 tensor([[0.3809],
-        [0.5039],
-        [0.5352],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5352, 0.5273, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:06,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:20:06,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4625.37 | bwd_inner_microstep: 4619.56 | bwd_allreduce_microstep: 5.70 | step_microstep: 46.60
-[2025-01-25 16:20:06,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.16 | bwd: 4625.39 | bwd_inner: 4619.56 | bwd_allreduce: 5.76 | step: 46.61
- 47%|████▋     | 2720/5800 [7:33:36<5:54:39,  6.91s/it]                                                       {'loss': 0.02, 'grad_norm': 3.6380457878112793, 'learning_rate': 2.2970853311208455e-05, 'epoch': 23.45}
- 47%|████▋     | 2720/5800 [7:33:36<5:54:39,  6.91s/it]score1 tensor([[0.5352],
-        [0.5430],
-        [0.5703],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5430, 0.5664, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:13,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 16:20:13,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.65 | bwd_microstep: 4573.35 | bwd_inner_microstep: 4568.03 | bwd_allreduce_microstep: 5.24 | step_microstep: 42.34
-[2025-01-25 16:20:13,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.60 | bwd: 4573.37 | bwd_inner: 4568.03 | bwd_allreduce: 5.27 | step: 42.35
- 47%|████▋     | 2721/5800 [7:33:43<5:53:45,  6.89s/it]                                                       {'loss': 0.0107, 'grad_norm': 2.5277323722839355, 'learning_rate': 2.2959808625976473e-05, 'epoch': 23.46}
- 47%|████▋     | 2721/5800 [7:33:43<5:53:45,  6.89s/it]score1 tensor([[0.4824],
-        [0.4453],
-        [0.4980],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4629, 0.4863, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:20,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 16:20:20,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.38 | bwd_microstep: 4626.77 | bwd_inner_microstep: 4621.84 | bwd_allreduce_microstep: 4.82 | step_microstep: 47.56
-[2025-01-25 16:20:20,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.34 | bwd: 4626.80 | bwd_inner: 4621.84 | bwd_allreduce: 4.87 | step: 47.57
- 47%|████▋     | 2722/5800 [7:33:50<5:53:55,  6.90s/it]                                                       {'loss': 0.0293, 'grad_norm': 4.240626335144043, 'learning_rate': 2.2948763017824804e-05, 'epoch': 23.47}
- 47%|████▋     | 2722/5800 [7:33:50<5:53:55,  6.90s/it]score1 tensor([[0.4531],
-        [0.4902],
-        [0.5977],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5195, 0.6094, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:27,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 16:20:27,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.27 | bwd_microstep: 4629.19 | bwd_inner_microstep: 4623.87 | bwd_allreduce_microstep: 5.22 | step_microstep: 46.19
-[2025-01-25 16:20:27,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4629.22 | bwd_inner: 4623.87 | bwd_allreduce: 5.27 | step: 46.20
- 47%|████▋     | 2723/5800 [7:33:57<5:53:55,  6.90s/it]                                                       {'loss': 0.0288, 'grad_norm': 3.895650863647461, 'learning_rate': 2.293771649019766e-05, 'epoch': 23.47}
- 47%|████▋     | 2723/5800 [7:33:57<5:53:55,  6.90s/it]score1 tensor([[0.6133],
-        [0.4961],
-        [0.4570],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5391, 0.4980, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:34,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 16:20:34,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4624.05 | bwd_inner_microstep: 4618.91 | bwd_allreduce_microstep: 5.05 | step_microstep: 43.88
-[2025-01-25 16:20:34,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4624.07 | bwd_inner: 4618.91 | bwd_allreduce: 5.09 | step: 43.89
- 47%|████▋     | 2724/5800 [7:34:04<5:53:53,  6.90s/it]                                                       {'loss': 0.0327, 'grad_norm': 3.5082333087921143, 'learning_rate': 2.2926669046539548e-05, 'epoch': 23.48}
- 47%|████▋     | 2724/5800 [7:34:04<5:53:53,  6.90s/it]score1 tensor([[0.3398],
-        [0.4766],
-        [0.3848],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.4824, 0.4199, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:41,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:20:41,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.96 | bwd_microstep: 4631.32 | bwd_inner_microstep: 4626.22 | bwd_allreduce_microstep: 5.02 | step_microstep: 43.71
-[2025-01-25 16:20:41,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.92 | bwd: 4631.35 | bwd_inner: 4626.22 | bwd_allreduce: 5.06 | step: 43.71
- 47%|████▋     | 2725/5800 [7:34:11<5:53:55,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 7.382269382476807, 'learning_rate': 2.291562069029524e-05, 'epoch': 23.49}
- 47%|████▋     | 2725/5800 [7:34:11<5:53:55,  6.91s/it]score1 tensor([[0.4941],
-        [0.6367],
-        [0.5508],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6875, 0.5391, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:48,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 16:20:48,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.91 | bwd_microstep: 4569.69 | bwd_inner_microstep: 4564.26 | bwd_allreduce_microstep: 5.32 | step_microstep: 49.13
-[2025-01-25 16:20:48,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.87 | bwd: 4569.72 | bwd_inner: 4564.26 | bwd_allreduce: 5.38 | step: 49.14
- 47%|████▋     | 2726/5800 [7:34:18<5:53:01,  6.89s/it]                                                       {'loss': 0.0244, 'grad_norm': 2.209218740463257, 'learning_rate': 2.290457142490981e-05, 'epoch': 23.5}
- 47%|████▋     | 2726/5800 [7:34:18<5:53:01,  6.89s/it]score1 tensor([[0.5664],
-        [0.5898],
-        [0.4453],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5781, 0.4121, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:20:54,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:20:54,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.21 | bwd_microstep: 4631.58 | bwd_inner_microstep: 4626.78 | bwd_allreduce_microstep: 4.71 | step_microstep: 45.04
-[2025-01-25 16:20:54,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.16 | bwd: 4631.61 | bwd_inner: 4626.78 | bwd_allreduce: 4.76 | step: 45.04
- 47%|████▋     | 2727/5800 [7:34:24<5:53:23,  6.90s/it]                                                       {'loss': 0.0342, 'grad_norm': 4.133954048156738, 'learning_rate': 2.289352125382861e-05, 'epoch': 23.51}
- 47%|████▋     | 2727/5800 [7:34:24<5:53:23,  6.90s/it]score1 tensor([[0.4766],
-        [0.4043],
-        [0.4512],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4375, 0.4375, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:01,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:21:01,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.51 | bwd_microstep: 4628.41 | bwd_inner_microstep: 4623.43 | bwd_allreduce_microstep: 4.89 | step_microstep: 45.38
-[2025-01-25 16:21:01,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.47 | bwd: 4628.44 | bwd_inner: 4623.43 | bwd_allreduce: 4.93 | step: 45.39
- 47%|████▋     | 2728/5800 [7:34:31<5:53:35,  6.91s/it]                                                       {'loss': 0.0156, 'grad_norm': 3.8683292865753174, 'learning_rate': 2.2882470180497277e-05, 'epoch': 23.52}
- 47%|████▋     | 2728/5800 [7:34:31<5:53:35,  6.91s/it]score1 tensor([[0.6758],
-        [0.4785],
-        [0.5430],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4824, 0.5156, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:08,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:21:08,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.14 | bwd_microstep: 4629.43 | bwd_inner_microstep: 4623.82 | bwd_allreduce_microstep: 5.52 | step_microstep: 50.30
-[2025-01-25 16:21:08,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.08 | bwd: 4629.45 | bwd_inner: 4623.82 | bwd_allreduce: 5.56 | step: 50.31
- 47%|████▋     | 2729/5800 [7:34:38<5:53:40,  6.91s/it]                                                       {'loss': 0.0244, 'grad_norm': 0.6409513354301453, 'learning_rate': 2.2871418208361728e-05, 'epoch': 23.53}
- 47%|████▋     | 2729/5800 [7:34:38<5:53:40,  6.91s/it]score1 tensor([[0.4004],
-        [0.3887],
-        [0.4258],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4062, 0.4551, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:15,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 16:21:15,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.99 | bwd_microstep: 4622.75 | bwd_inner_microstep: 4617.81 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.49
-[2025-01-25 16:21:15,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.95 | bwd: 4622.77 | bwd_inner: 4617.81 | bwd_allreduce: 4.89 | step: 43.50
- 47%|████▋     | 2730/5800 [7:34:45<5:53:26,  6.91s/it]                                                       {'loss': 0.0327, 'grad_norm': 3.4156205654144287, 'learning_rate': 2.2860365340868162e-05, 'epoch': 23.53}
- 47%|████▋     | 2730/5800 [7:34:45<5:53:26,  6.91s/it]score1 tensor([[0.5391],
-        [0.5195],
-        [0.4863],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4961, 0.4707, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:22,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:21:22,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.52 | bwd_microstep: 4623.97 | bwd_inner_microstep: 4618.70 | bwd_allreduce_microstep: 5.16 | step_microstep: 45.56
-[2025-01-25 16:21:22,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.48 | bwd: 4623.99 | bwd_inner: 4618.70 | bwd_allreduce: 5.22 | step: 45.58
- 47%|████▋     | 2731/5800 [7:34:52<5:53:22,  6.91s/it]                                                       {'loss': 0.0293, 'grad_norm': 8.600123405456543, 'learning_rate': 2.284931158146304e-05, 'epoch': 23.54}
- 47%|████▋     | 2731/5800 [7:34:52<5:53:22,  6.91s/it]score1 tensor([[0.5547],
-        [0.5664],
-        [0.5898],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5664, 0.5391, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:29,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 16:21:29,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.25 | bwd_microstep: 4579.84 | bwd_inner_microstep: 4574.55 | bwd_allreduce_microstep: 5.18 | step_microstep: 47.00
-[2025-01-25 16:21:29,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.22 | bwd: 4579.87 | bwd_inner: 4574.55 | bwd_allreduce: 5.24 | step: 47.01
- 47%|████▋     | 2732/5800 [7:34:59<5:52:30,  6.89s/it]                                                       {'loss': 0.0229, 'grad_norm': 6.168280601501465, 'learning_rate': 2.283825693359313e-05, 'epoch': 23.55}
- 47%|████▋     | 2732/5800 [7:34:59<5:52:30,  6.89s/it]score1 tensor([[0.4863],
-        [0.6055],
-        [0.4355],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6094, 0.4277, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:36,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 16:21:36,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.84 | bwd_microstep: 4572.86 | bwd_inner_microstep: 4568.04 | bwd_allreduce_microstep: 4.73 | step_microstep: 42.16
-[2025-01-25 16:21:36,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.79 | bwd: 4572.88 | bwd_inner: 4568.04 | bwd_allreduce: 4.77 | step: 42.17
- 47%|████▋     | 2733/5800 [7:35:06<5:51:45,  6.88s/it]                                                       {'loss': 0.0059, 'grad_norm': 2.319701671600342, 'learning_rate': 2.2827201400705463e-05, 'epoch': 23.56}
- 47%|████▋     | 2733/5800 [7:35:06<5:51:45,  6.88s/it]score1 tensor([[0.3926],
-        [0.4883],
-        [0.5664],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.5078, 0.5352, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:43,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 16:21:43,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4569.89 | bwd_inner_microstep: 4565.20 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.55
-[2025-01-25 16:21:43,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.79 | bwd: 4569.91 | bwd_inner: 4565.20 | bwd_allreduce: 4.65 | step: 43.56
- 47%|████▋     | 2734/5800 [7:35:13<5:51:11,  6.87s/it]                                                       {'loss': 0.0132, 'grad_norm': 1.6996781826019287, 'learning_rate': 2.2816144986247342e-05, 'epoch': 23.57}
- 47%|████▋     | 2734/5800 [7:35:13<5:51:11,  6.87s/it]score1 tensor([[0.5430],
-        [0.4766],
-        [0.5312],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4609, 0.5156, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:50,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:21:50,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.71 | bwd_microstep: 4624.68 | bwd_inner_microstep: 4619.20 | bwd_allreduce_microstep: 5.38 | step_microstep: 44.39
-[2025-01-25 16:21:50,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.68 | bwd: 4624.70 | bwd_inner: 4619.20 | bwd_allreduce: 5.43 | step: 44.40
- 47%|████▋     | 2735/5800 [7:35:20<5:51:33,  6.88s/it]                                                       {'loss': 0.0107, 'grad_norm': 0.4936452805995941, 'learning_rate': 2.2805087693666354e-05, 'epoch': 23.58}
- 47%|████▋     | 2735/5800 [7:35:20<5:51:33,  6.88s/it]score1 tensor([[0.6875],
-        [0.4492],
-        [0.5234],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4512, 0.5000, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:21:56,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:21:56,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.50 | bwd_microstep: 4585.55 | bwd_inner_microstep: 4580.44 | bwd_allreduce_microstep: 5.01 | step_microstep: 41.65
-[2025-01-25 16:21:56,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.47 | bwd: 4585.57 | bwd_inner: 4580.44 | bwd_allreduce: 5.06 | step: 41.66
- 47%|████▋     | 2736/5800 [7:35:26<5:51:14,  6.88s/it]                                                       {'loss': 0.019, 'grad_norm': 2.717017650604248, 'learning_rate': 2.2794029526410348e-05, 'epoch': 23.59}
- 47%|████▋     | 2736/5800 [7:35:26<5:51:14,  6.88s/it]score1 tensor([[0.4941],
-        [0.5039],
-        [0.5312],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5000, 0.5039, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:03,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 16:22:03,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.45 | bwd_microstep: 4619.05 | bwd_inner_microstep: 4613.99 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.77
-[2025-01-25 16:22:03,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.40 | bwd: 4619.07 | bwd_inner: 4613.99 | bwd_allreduce: 5.01 | step: 42.79
- 47%|████▋     | 2737/5800 [7:35:33<5:51:28,  6.88s/it]                                                       {'loss': 0.0151, 'grad_norm': 8.35189437866211, 'learning_rate': 2.2782970487927454e-05, 'epoch': 23.59}
- 47%|████▋     | 2737/5800 [7:35:33<5:51:28,  6.88s/it]score1 tensor([[0.4004],
-        [0.4980],
-        [0.5938],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4668, 0.5820, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:10,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:22:10,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.92 | bwd_microstep: 4631.62 | bwd_inner_microstep: 4626.39 | bwd_allreduce_microstep: 5.15 | step_microstep: 44.82
-[2025-01-25 16:22:10,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.89 | bwd: 4631.65 | bwd_inner: 4626.39 | bwd_allreduce: 5.19 | step: 44.83
- 47%|████▋     | 2738/5800 [7:35:40<5:51:47,  6.89s/it]                                                       {'loss': 0.0264, 'grad_norm': 8.28318977355957, 'learning_rate': 2.2771910581666075e-05, 'epoch': 23.6}
- 47%|████▋     | 2738/5800 [7:35:40<5:51:47,  6.89s/it]score1 tensor([[0.5586],
-        [0.4473],
-        [0.5664],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4180, 0.5898, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:17,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 16:22:17,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.57 | bwd_microstep: 4625.52 | bwd_inner_microstep: 4620.46 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.80
-[2025-01-25 16:22:17,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.52 | bwd: 4625.55 | bwd_inner: 4620.46 | bwd_allreduce: 5.01 | step: 42.81
- 47%|████▋     | 2739/5800 [7:35:47<5:51:48,  6.90s/it]                                                       {'loss': 0.0308, 'grad_norm': 0.4929255247116089, 'learning_rate': 2.276084981107488e-05, 'epoch': 23.61}
- 47%|████▋     | 2739/5800 [7:35:47<5:51:48,  6.90s/it]score1 tensor([[0.5508],
-        [0.6172],
-        [0.5195],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6055, 0.5156, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:24,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:22:24,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.28 | bwd_microstep: 4622.09 | bwd_inner_microstep: 4617.24 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.70
-[2025-01-25 16:22:24,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4622.12 | bwd_inner: 4617.23 | bwd_allreduce: 4.82 | step: 41.71
- 47%|████▋     | 2740/5800 [7:35:54<5:51:45,  6.90s/it]                                                       {'loss': 0.0127, 'grad_norm': 8.469931602478027, 'learning_rate': 2.2749788179602807e-05, 'epoch': 23.62}
- 47%|████▋     | 2740/5800 [7:35:54<5:51:45,  6.90s/it]score1 tensor([[0.4824],
-        [0.4883],
-        [0.5195],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4551, 0.5195, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:31,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:22:31,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.22 | bwd_microstep: 4582.00 | bwd_inner_microstep: 4576.38 | bwd_allreduce_microstep: 5.52 | step_microstep: 45.95
-[2025-01-25 16:22:31,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.19 | bwd: 4582.03 | bwd_inner: 4576.37 | bwd_allreduce: 5.58 | step: 45.96
- 47%|████▋     | 2741/5800 [7:36:01<5:51:06,  6.89s/it]                                                       {'loss': 0.0264, 'grad_norm': 6.113691806793213, 'learning_rate': 2.2738725690699063e-05, 'epoch': 23.63}
- 47%|████▋     | 2741/5800 [7:36:01<5:51:06,  6.89s/it]score1 tensor([[0.5430],
-        [0.4199],
-        [0.5938],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4277, 0.6211, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:38,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.81 | optimizer_step: 4.36
-[2025-01-25 16:22:38,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4629.16 | bwd_inner_microstep: 4623.83 | bwd_allreduce_microstep: 5.26 | step_microstep: 49.83
-[2025-01-25 16:22:38,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.69 | bwd: 4629.19 | bwd_inner: 4623.83 | bwd_allreduce: 5.29 | step: 49.84
- 47%|████▋     | 2742/5800 [7:36:08<5:51:33,  6.90s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.245020866394043, 'learning_rate': 2.2727662347813136e-05, 'epoch': 23.64}
- 47%|████▋     | 2742/5800 [7:36:08<5:51:33,  6.90s/it]score1 tensor([[0.4609],
-        [0.5469],
-        [0.5156],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5508, 0.5117, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:45,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 16:22:45,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.57 | bwd_microstep: 4627.94 | bwd_inner_microstep: 4622.86 | bwd_allreduce_microstep: 4.98 | step_microstep: 50.19
-[2025-01-25 16:22:45,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.54 | bwd: 4627.96 | bwd_inner: 4622.86 | bwd_allreduce: 5.02 | step: 50.19
- 47%|████▋     | 2743/5800 [7:36:15<5:51:42,  6.90s/it]                                                       {'loss': 0.0181, 'grad_norm': 4.140074253082275, 'learning_rate': 2.2716598154394757e-05, 'epoch': 23.65}
- 47%|████▋     | 2743/5800 [7:36:15<5:51:42,  6.90s/it]score1 tensor([[0.5703],
-        [0.5469],
-        [0.4590],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5586, 0.4551, 0.3711], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:52,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 16:22:52,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.50 | bwd_microstep: 4635.46 | bwd_inner_microstep: 4629.84 | bwd_allreduce_microstep: 5.53 | step_microstep: 47.29
-[2025-01-25 16:22:52,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.46 | bwd: 4635.48 | bwd_inner: 4629.84 | bwd_allreduce: 5.58 | step: 47.30
- 47%|████▋     | 2744/5800 [7:36:22<5:51:59,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.9527654647827148, 'learning_rate': 2.2705533113893932e-05, 'epoch': 23.66}
- 47%|████▋     | 2744/5800 [7:36:22<5:51:59,  6.91s/it]score1 tensor([[0.5547],
-        [0.3867],
-        [0.5664],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4082, 0.5625, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:22:59,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:22:59,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.43 | bwd_microstep: 4620.12 | bwd_inner_microstep: 4615.21 | bwd_allreduce_microstep: 4.78 | step_microstep: 55.32
-[2025-01-25 16:22:59,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.40 | bwd: 4620.15 | bwd_inner: 4615.21 | bwd_allreduce: 4.84 | step: 55.33
- 47%|████▋     | 2745/5800 [7:36:29<5:51:49,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 4.698703289031982, 'learning_rate': 2.269446722976094e-05, 'epoch': 23.66}
- 47%|████▋     | 2745/5800 [7:36:29<5:51:49,  6.91s/it]score1 tensor([[0.4551],
-        [0.4824],
-        [0.6016],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4648, 0.6133, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:06,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 16:23:06,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4625.39 | bwd_inner_microstep: 4620.02 | bwd_allreduce_microstep: 5.27 | step_microstep: 45.33
-[2025-01-25 16:23:06,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.70 | bwd: 4625.42 | bwd_inner: 4620.02 | bwd_allreduce: 5.32 | step: 45.34
- 47%|████▋     | 2746/5800 [7:36:36<5:52:01,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.064297676086426, 'learning_rate': 2.2683400505446312e-05, 'epoch': 23.67}
- 47%|████▋     | 2746/5800 [7:36:36<5:52:01,  6.92s/it]score1 tensor([[0.4023],
-        [0.4570],
-        [0.5117],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4727, 0.5195, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:12,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 16:23:12,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.00 | bwd_microstep: 4625.62 | bwd_inner_microstep: 4620.77 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.34
-[2025-01-25 16:23:12,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.95 | bwd: 4625.64 | bwd_inner: 4620.77 | bwd_allreduce: 4.80 | step: 43.34
- 47%|████▋     | 2747/5800 [7:36:42<5:51:52,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 7.997637748718262, 'learning_rate': 2.2672332944400845e-05, 'epoch': 23.68}
- 47%|████▋     | 2747/5800 [7:36:42<5:51:52,  6.92s/it]score1 tensor([[0.5586],
-        [0.5156],
-        [0.4434],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5703, 0.4473, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:19,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:23:19,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.94 | bwd_microstep: 4640.22 | bwd_inner_microstep: 4635.66 | bwd_allreduce_microstep: 4.46 | step_microstep: 43.12
-[2025-01-25 16:23:19,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.91 | bwd: 4640.24 | bwd_inner: 4635.66 | bwd_allreduce: 4.51 | step: 43.13
- 47%|████▋     | 2748/5800 [7:36:49<5:51:54,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.5342571139335632, 'learning_rate': 2.26612645500756e-05, 'epoch': 23.69}
- 47%|████▋     | 2748/5800 [7:36:49<5:51:54,  6.92s/it]score1 tensor([[0.5547],
-        [0.4355],
-        [0.4883],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4512, 0.4609, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:26,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 16:23:26,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.35 | bwd_microstep: 4642.63 | bwd_inner_microstep: 4634.85 | bwd_allreduce_microstep: 7.54 | step_microstep: 58.53
-[2025-01-25 16:23:26,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.31 | bwd: 4642.71 | bwd_inner: 4634.85 | bwd_allreduce: 7.67 | step: 58.54
- 47%|████▋     | 2749/5800 [7:36:56<5:52:03,  6.92s/it]                                                       {'loss': 0.0146, 'grad_norm': 4.146838188171387, 'learning_rate': 2.2650195325921888e-05, 'epoch': 23.7}
- 47%|████▋     | 2749/5800 [7:36:56<5:52:03,  6.92s/it]score1 tensor([[0.4375],
-        [0.6016],
-        [0.6172],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5703, 0.6484, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:33,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 16:23:33,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.74 | bwd_microstep: 4626.74 | bwd_inner_microstep: 4622.10 | bwd_allreduce_microstep: 4.51 | step_microstep: 44.19
-[2025-01-25 16:23:33,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.72 | bwd: 4626.77 | bwd_inner: 4622.10 | bwd_allreduce: 4.57 | step: 44.20
- 47%|████▋     | 2750/5800 [7:37:03<5:51:39,  6.92s/it]                                                       {'loss': 0.0239, 'grad_norm': 3.769641399383545, 'learning_rate': 2.263912527539129e-05, 'epoch': 23.71}
- 47%|████▋     | 2750/5800 [7:37:03<5:51:39,  6.92s/it]score1 tensor([[0.3848],
-        [0.5273],
-        [0.5352],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4980, 0.4941, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:40,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.60 | optimizer_step: 4.36
-[2025-01-25 16:23:40,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.71 | bwd_microstep: 4632.27 | bwd_inner_microstep: 4624.84 | bwd_allreduce_microstep: 7.31 | step_microstep: 71.72
-[2025-01-25 16:23:40,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.68 | bwd: 4632.29 | bwd_inner: 4624.84 | bwd_allreduce: 7.36 | step: 71.77
- 47%|████▋     | 2751/5800 [7:37:10<5:52:25,  6.94s/it]                                                       {'loss': 0.0239, 'grad_norm': 8.171553611755371, 'learning_rate': 2.2628054401935638e-05, 'epoch': 23.72}
- 47%|████▋     | 2751/5800 [7:37:10<5:52:25,  6.94s/it]score1 tensor([[0.4941],
-        [0.5391],
-        [0.5547],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5430, 0.5469, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:47,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 16:23:47,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.91 | bwd_microstep: 4636.65 | bwd_inner_microstep: 4631.50 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.26
-[2025-01-25 16:23:47,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.86 | bwd: 4636.67 | bwd_inner: 4631.50 | bwd_allreduce: 5.09 | step: 45.27
- 47%|████▋     | 2752/5800 [7:37:17<5:52:22,  6.94s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.280109405517578, 'learning_rate': 2.261698270900702e-05, 'epoch': 23.72}
- 47%|████▋     | 2752/5800 [7:37:17<5:52:22,  6.94s/it]score1 tensor([[0.6758],
-        [0.4180],
-        [0.4258],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4180, 0.4258, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:23:54,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.56
-[2025-01-25 16:23:54,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4538.79 | bwd_inner_microstep: 4533.61 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.11
-[2025-01-25 16:23:54,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.81 | bwd: 4538.83 | bwd_inner: 4533.61 | bwd_allreduce: 5.13 | step: 44.11
- 47%|████▋     | 2753/5800 [7:37:24<5:50:32,  6.90s/it]                                                       {'loss': 0.0059, 'grad_norm': 4.494281768798828, 'learning_rate': 2.260591020005779e-05, 'epoch': 23.73}
- 47%|████▋     | 2753/5800 [7:37:24<5:50:32,  6.90s/it]score1 tensor([[0.4824],
-        [0.4277],
-        [0.4727],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4570, 0.4531, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:24:01,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 16:24:01,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.66 | bwd_microstep: 4633.11 | bwd_inner_microstep: 4628.36 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.46
-[2025-01-25 16:24:01,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.64 | bwd: 4633.13 | bwd_inner: 4628.36 | bwd_allreduce: 4.70 | step: 42.46
- 47%|████▋     | 2754/5800 [7:37:31<5:50:41,  6.91s/it]                                                       {'loss': 0.0283, 'grad_norm': 0.590215802192688, 'learning_rate': 2.2594836878540538e-05, 'epoch': 23.74}
- 47%|████▋     | 2754/5800 [7:37:31<5:50:41,  6.91s/it]score1 tensor([[0.6172],
-        [0.5195],
-        [0.5391],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5078, 0.5273, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:24:08,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:24:08,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.55 | bwd_microstep: 4631.44 | bwd_inner_microstep: 4626.49 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.99
-[2025-01-25 16:24:08,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4631.47 | bwd_inner: 4626.49 | bwd_allreduce: 4.90 | step: 42.99
- 48%|████▊     | 2755/5800 [7:37:38<5:50:32,  6.91s/it]                                                       {'loss': 0.0093, 'grad_norm': 8.459864616394043, 'learning_rate': 2.2583762747908132e-05, 'epoch': 23.75}
- 48%|████▊     | 2755/5800 [7:37:38<5:50:32,  6.91s/it]score1 tensor([[0.6172],
-        [0.6562],
-        [0.4531],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.6445, 0.4648, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:24:15,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 16:24:15,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.58 | bwd_microstep: 4630.01 | bwd_inner_microstep: 4624.78 | bwd_allreduce_microstep: 5.13 | step_microstep: 44.26
-[2025-01-25 16:24:15,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4630.04 | bwd_inner: 4624.78 | bwd_allreduce: 5.18 | step: 44.27
- 48%|████▊     | 2756/5800 [7:37:45<5:50:24,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 0.7566752433776855, 'learning_rate': 2.2572687811613664e-05, 'epoch': 23.76}
- 48%|████▊     | 2756/5800 [7:37:45<5:50:24,  6.91s/it]score1 tensor([[0.4941],
-        [0.5547],
-        [0.6055],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5469, 0.6406, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:24:22,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 16:24:22,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.50 | bwd_microstep: 4624.67 | bwd_inner_microstep: 4619.70 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.02
-[2025-01-25 16:24:22,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.47 | bwd: 4624.69 | bwd_inner: 4619.70 | bwd_allreduce: 4.92 | step: 44.04
- 48%|████▊     | 2757/5800 [7:37:52<5:50:14,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 3.8512606620788574, 'learning_rate': 2.256161207311049e-05, 'epoch': 23.77}
- 48%|████▊     | 2757/5800 [7:37:52<5:50:14,  6.91s/it]score1 tensor([[0.4805],
-        [0.5625],
-        [0.4590],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5664, 0.4707, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:24:29,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 16:24:29,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.38 | bwd_microstep: 4622.67 | bwd_inner_microstep: 4617.52 | bwd_allreduce_microstep: 5.04 | step_microstep: 42.96
-[2025-01-25 16:24:29,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.35 | bwd: 4622.69 | bwd_inner: 4617.52 | bwd_allreduce: 5.09 | step: 42.97
- 48%|████▊     | 2758/5800 [7:37:58<5:49:56,  6.90s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.4331933856010437, 'learning_rate': 2.255053553585223e-05, 'epoch': 23.78}
- 48%|████▊     | 2758/5800 [7:37:58<5:49:56,  6.90s/it]score1 tensor([[0.4160],
-        [0.5469],
-        [0.4531],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5391, 0.4648, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:24:35,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 16:24:35,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.94 | bwd_microstep: 4632.01 | bwd_inner_microstep: 4627.04 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.61
-[2025-01-25 16:24:35,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.90 | bwd: 4632.04 | bwd_inner: 4627.04 | bwd_allreduce: 4.91 | step: 43.61
- 48%|████▊     | 2759/5800 [7:38:05<5:50:00,  6.91s/it]                                                       {'loss': 0.0195, 'grad_norm': 0.44129684567451477, 'learning_rate': 2.2539458203292734e-05, 'epoch': 23.78}
- 48%|████▊     | 2759/5800 [7:38:05<5:50:00,  6.91s/it]score1 tensor([[0.5820],
-        [0.6094],
-        [0.4375],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6172, 0.4785, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:24:42,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 16:24:42,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.48 | bwd_microstep: 4625.17 | bwd_inner_microstep: 4620.63 | bwd_allreduce_microstep: 4.45 | step_microstep: 43.14
-[2025-01-25 16:24:42,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4625.20 | bwd_inner: 4620.63 | bwd_allreduce: 4.50 | step: 43.15
- 48%|████▊     | 2760/5800 [7:38:12<5:49:59,  6.91s/it]                                                       {'loss': 0.0278, 'grad_norm': 8.551285743713379, 'learning_rate': 2.2528380078886104e-05, 'epoch': 23.79}
- 48%|████▊     | 2760/5800 [7:38:12<5:49:59,  6.91s/it]evaluate!
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6445]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3867]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1699, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6133]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3926]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3984]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3926]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6927881112673244
-PLCC_score: 0.691174424316613
-KRCC_score: 0.5059257191672173
-SRCC_level: 0.6927881112673244
-PLCC_level: 0.691174424316613
-KRCC_level: 0.5059257191672173
-New best SRCC_score: 0.6927881112673244. Saving model...
-[INFO|trainer.py:3705] 2025-01-25 16:35:12,160 >> Saving model checkpoint to /DATA/env/wjr/newtrain/stage2/mos3
-[INFO|configuration_utils.py:410] 2025-01-25 16:35:12,168 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/config.json
-[INFO|configuration_utils.py:868] 2025-01-25 16:35:12,169 >> Configuration saved in /DATA/env/wjr/newtrain/stage2/mos3/generation_config.json
-[INFO|modeling_utils.py:2844] 2025-01-25 16:36:34,589 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at /DATA/env/wjr/newtrain/stage2/mos3/model.safetensors.index.json.
-[INFO|tokenization_utils_base.py:2641] 2025-01-25 16:36:34,592 >> tokenizer config file saved in /DATA/env/wjr/newtrain/stage2/mos3/tokenizer_config.json
-[INFO|tokenization_utils_base.py:2650] 2025-01-25 16:36:34,592 >> Special tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/special_tokens_map.json
-[INFO|tokenization_utils_base.py:2701] 2025-01-25 16:36:34,593 >> added tokens file saved in /DATA/env/wjr/newtrain/stage2/mos3/added_tokens.json
-01/25/2025 16:36:45 - INFO - __main__ - Saved LoRA weights to /DATA/env/wjr/newtrain/stage2/mos3/lora_weights.pth
-score1 tensor([[0.3672],
-        [0.4355],
-        [0.4707],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4688, 0.4844, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:36:51,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:36:51,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2124.78 | bwd_microstep: 4532.03 | bwd_inner_microstep: 4527.04 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.96
-[2025-01-25 16:36:51,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2124.74 | bwd: 4532.05 | bwd_inner: 4527.04 | bwd_allreduce: 4.94 | step: 43.97
- 48%|████▊     | 2761/5800 [7:50:21<188:43:47, 223.57s/it]                                                          {'loss': 0.02, 'grad_norm': 5.665461540222168, 'learning_rate': 2.2517301166086703e-05, 'epoch': 23.8}
- 48%|████▊     | 2761/5800 [7:50:21<188:43:47, 223.57s/it]score1 tensor([[0.5547],
-        [0.4961],
-        [0.5391],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5000, 0.5234, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:36:58,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 16:36:58,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.96 | bwd_microstep: 4522.70 | bwd_inner_microstep: 4517.91 | bwd_allreduce_microstep: 4.63 | step_microstep: 46.48
-[2025-01-25 16:36:58,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.91 | bwd: 4522.72 | bwd_inner: 4517.91 | bwd_allreduce: 4.72 | step: 46.49
- 48%|████▊     | 2762/5800 [7:50:28<133:47:08, 158.53s/it]                                                          {'loss': 0.0098, 'grad_norm': 1.9572514295578003, 'learning_rate': 2.250622146834911e-05, 'epoch': 23.81}
- 48%|████▊     | 2762/5800 [7:50:28<133:47:08, 158.53s/it]score1 tensor([[0.5625],
-        [0.4395],
-        [0.5664],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4590, 0.5352, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:05,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 16:37:05,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2125.15 | bwd_microstep: 4559.61 | bwd_inner_microstep: 4554.61 | bwd_allreduce_microstep: 4.90 | step_microstep: 44.90
-[2025-01-25 16:37:05,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2125.11 | bwd: 4559.63 | bwd_inner: 4554.61 | bwd_allreduce: 4.96 | step: 44.91
- 48%|████▊     | 2763/5800 [7:50:35<95:20:24, 113.01s/it]                                                          {'loss': 0.0151, 'grad_norm': 4.5781121253967285, 'learning_rate': 2.2495140989128176e-05, 'epoch': 23.82}
- 48%|████▊     | 2763/5800 [7:50:35<95:20:24, 113.01s/it]score1 tensor([[0.4863],
-        [0.4062],
-        [0.3828],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3477, 0.3340, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0420, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:12,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 16:37:12,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.72 | bwd_microstep: 4576.03 | bwd_inner_microstep: 4570.91 | bwd_allreduce_microstep: 5.03 | step_microstep: 45.98
-[2025-01-25 16:37:12,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.69 | bwd: 4576.05 | bwd_inner: 4570.91 | bwd_allreduce: 5.08 | step: 45.99
- 48%|████▊     | 2764/5800 [7:50:42<68:26:33, 81.16s/it]                                                         {'loss': 0.042, 'grad_norm': 3.815903425216675, 'learning_rate': 2.2484059731878993e-05, 'epoch': 23.83}
- 48%|████▊     | 2764/5800 [7:50:42<68:26:33, 81.16s/it]score1 tensor([[0.4922],
-        [0.4492],
-        [0.5742],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4434, 0.5625, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:19,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:37:19,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.65 | bwd_microstep: 4588.82 | bwd_inner_microstep: 4584.16 | bwd_allreduce_microstep: 4.55 | step_microstep: 46.07
-[2025-01-25 16:37:19,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.63 | bwd: 4588.85 | bwd_inner: 4584.16 | bwd_allreduce: 4.61 | step: 46.09
- 48%|████▊     | 2765/5800 [7:50:49<49:37:36, 58.87s/it]                                                        {'loss': 0.0181, 'grad_norm': 8.108358383178711, 'learning_rate': 2.2472977700056878e-05, 'epoch': 23.84}
- 48%|████▊     | 2765/5800 [7:50:49<49:37:36, 58.87s/it]score1 tensor([[0.5273],
-        [0.3711],
-        [0.4746],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.3750, 0.4766, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:26,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 16:37:26,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.30 | bwd_microstep: 4587.16 | bwd_inner_microstep: 4582.11 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.79
-[2025-01-25 16:37:26,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.26 | bwd: 4587.19 | bwd_inner: 4582.11 | bwd_allreduce: 5.00 | step: 42.80
- 48%|████▊     | 2766/5800 [7:50:56<36:27:28, 43.26s/it]                                                        {'loss': 0.0186, 'grad_norm': 0.7398658394813538, 'learning_rate': 2.246189489711741e-05, 'epoch': 23.84}
- 48%|████▊     | 2766/5800 [7:50:56<36:27:28, 43.26s/it]score1 tensor([[0.5117],
-        [0.3379],
-        [0.4258],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.3223, 0.4180, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:32,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 16:37:32,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.25 | bwd_microstep: 4575.88 | bwd_inner_microstep: 4571.26 | bwd_allreduce_microstep: 4.53 | step_microstep: 43.04
-[2025-01-25 16:37:32,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.22 | bwd: 4575.90 | bwd_inner: 4571.26 | bwd_allreduce: 4.57 | step: 43.05
- 48%|████▊     | 2767/5800 [7:51:02<27:14:24, 32.33s/it]                                                        {'loss': 0.0156, 'grad_norm': 7.5493903160095215, 'learning_rate': 2.2450811326516394e-05, 'epoch': 23.85}
- 48%|████▊     | 2767/5800 [7:51:02<27:14:24, 32.33s/it]score1 tensor([[0.6484],
-        [0.4453],
-        [0.5234],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4180, 0.5625, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:39,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:37:39,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.35 | bwd_microstep: 4585.34 | bwd_inner_microstep: 4580.52 | bwd_allreduce_microstep: 4.73 | step_microstep: 42.92
-[2025-01-25 16:37:39,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.29 | bwd: 4585.36 | bwd_inner: 4580.52 | bwd_allreduce: 4.77 | step: 42.93
- 48%|████▊     | 2768/5800 [7:51:09<20:47:26, 24.69s/it]                                                        {'loss': 0.0254, 'grad_norm': 4.483509063720703, 'learning_rate': 2.2439726991709877e-05, 'epoch': 23.86}
- 48%|████▊     | 2768/5800 [7:51:09<20:47:26, 24.69s/it]score1 tensor([[0.6641],
-        [0.5039],
-        [0.5977],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5039, 0.6445, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:46,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:37:46,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.17 | bwd_microstep: 4545.63 | bwd_inner_microstep: 4540.89 | bwd_allreduce_microstep: 4.66 | step_microstep: 43.02
-[2025-01-25 16:37:46,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.14 | bwd: 4545.65 | bwd_inner: 4540.89 | bwd_allreduce: 4.70 | step: 43.03
- 48%|████▊     | 2769/5800 [7:51:16<16:16:03, 19.32s/it]                                                        {'loss': 0.019, 'grad_norm': 2.8390908241271973, 'learning_rate': 2.242864189615416e-05, 'epoch': 23.87}
- 48%|████▊     | 2769/5800 [7:51:16<16:16:03, 19.32s/it]score1 tensor([[0.4805],
-        [0.5078],
-        [0.4004],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5078, 0.4043, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:37:53,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 16:37:53,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.89 | bwd_microstep: 4554.59 | bwd_inner_microstep: 4549.25 | bwd_allreduce_microstep: 5.24 | step_microstep: 51.07
-[2025-01-25 16:37:53,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.86 | bwd: 4554.62 | bwd_inner: 4549.25 | bwd_allreduce: 5.29 | step: 51.08
- 48%|████▊     | 2770/5800 [7:51:23<13:06:22, 15.57s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.9136834144592285, 'learning_rate': 2.2417556043305752e-05, 'epoch': 23.88}
- 48%|████▊     | 2770/5800 [7:51:23<13:06:22, 15.57s/it]score1 tensor([[0.5156],
-        [0.5586],
-        [0.5430],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5508, 0.5664, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:00,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 16:38:00,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.34 | bwd_microstep: 4604.65 | bwd_inner_microstep: 4599.67 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.28
-[2025-01-25 16:38:00,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.29 | bwd: 4604.70 | bwd_inner: 4599.67 | bwd_allreduce: 4.90 | step: 42.29
- 48%|████▊     | 2771/5800 [7:51:30<10:54:23, 12.96s/it]                                                        {'loss': 0.0205, 'grad_norm': 4.178559303283691, 'learning_rate': 2.2406469436621423e-05, 'epoch': 23.89}
- 48%|████▊     | 2771/5800 [7:51:30<10:54:23, 12.96s/it]score1 tensor([[0.4141],
-        [0.4414],
-        [0.5156],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4785, 0.4961, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:07,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 16:38:07,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.90 | bwd_microstep: 4620.87 | bwd_inner_microstep: 4615.74 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.81
-[2025-01-25 16:38:07,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.87 | bwd: 4620.89 | bwd_inner: 4615.74 | bwd_allreduce: 5.09 | step: 42.82
- 48%|████▊     | 2772/5800 [7:51:37<9:22:13, 11.14s/it]                                                        {'loss': 0.0229, 'grad_norm': 0.5357106924057007, 'learning_rate': 2.239538207955817e-05, 'epoch': 23.9}
- 48%|████▊     | 2772/5800 [7:51:37<9:22:13, 11.14s/it]score1 tensor([[0.3867],
-        [0.5039],
-        [0.6953],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5469, 0.7031, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:14,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 16:38:14,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.58 | bwd_microstep: 4620.51 | bwd_inner_microstep: 4615.94 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.58
-[2025-01-25 16:38:14,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.48 | bwd: 4620.54 | bwd_inner: 4615.94 | bwd_allreduce: 4.53 | step: 42.59
- 48%|████▊     | 2773/5800 [7:51:43<8:17:43,  9.87s/it]                                                       {'loss': 0.0186, 'grad_norm': 1.3812556266784668, 'learning_rate': 2.2384293975573222e-05, 'epoch': 23.91}
- 48%|████▊     | 2773/5800 [7:51:43<8:17:43,  9.87s/it]score1 tensor([[0.4531],
-        [0.5547],
-        [0.6055],
-        [0.3281]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5664, 0.6328, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:20,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 16:38:20,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.95 | bwd_microstep: 4623.86 | bwd_inner_microstep: 4618.97 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.96
-[2025-01-25 16:38:20,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.91 | bwd: 4623.89 | bwd_inner: 4618.96 | bwd_allreduce: 4.84 | step: 43.95
- 48%|████▊     | 2774/5800 [7:51:50<7:32:36,  8.97s/it]                                                       {'loss': 0.0215, 'grad_norm': 8.057863235473633, 'learning_rate': 2.2373205128124038e-05, 'epoch': 23.91}
- 48%|████▊     | 2774/5800 [7:51:50<7:32:36,  8.97s/it]score1 tensor([[0.4785],
-        [0.4785],
-        [0.6133],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5117, 0.6406, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:27,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 16:38:27,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.46 | bwd_microstep: 4622.91 | bwd_inner_microstep: 4617.62 | bwd_allreduce_microstep: 5.20 | step_microstep: 46.81
-[2025-01-25 16:38:27,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.41 | bwd: 4622.93 | bwd_inner: 4617.62 | bwd_allreduce: 5.24 | step: 46.82
- 48%|████▊     | 2775/5800 [7:51:57<7:01:01,  8.35s/it]                                                       {'loss': 0.0327, 'grad_norm': 8.457576751708984, 'learning_rate': 2.2362115540668318e-05, 'epoch': 23.92}
- 48%|████▊     | 2775/5800 [7:51:57<7:01:01,  8.35s/it]score1 tensor([[0.4473],
-        [0.4629],
-        [0.4062],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4961, 0.3750, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:34,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 16:38:34,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.50 | bwd_microstep: 4617.85 | bwd_inner_microstep: 4613.21 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.48
-[2025-01-25 16:38:34,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.46 | bwd: 4617.88 | bwd_inner: 4613.21 | bwd_allreduce: 4.59 | step: 41.49
- 48%|████▊     | 2776/5800 [7:52:04<6:38:43,  7.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 0.575898289680481, 'learning_rate': 2.2351025216663986e-05, 'epoch': 23.93}
- 48%|████▊     | 2776/5800 [7:52:04<6:38:43,  7.91s/it]score1 tensor([[0.4727],
-        [0.5273],
-        [0.5000],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.5156, 0.4590, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:41,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 16:38:41,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4618.76 | bwd_inner_microstep: 4614.03 | bwd_allreduce_microstep: 4.64 | step_microstep: 44.34
-[2025-01-25 16:38:41,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.35 | bwd: 4618.79 | bwd_inner: 4614.03 | bwd_allreduce: 4.69 | step: 44.35
- 48%|████▊     | 2777/5800 [7:52:11<6:23:14,  7.61s/it]                                                       {'loss': 0.0532, 'grad_norm': 4.189241409301758, 'learning_rate': 2.2339934159569195e-05, 'epoch': 23.94}
- 48%|████▊     | 2777/5800 [7:52:11<6:23:14,  7.61s/it]score1 tensor([[0.4668],
-        [0.5820],
-        [0.5586],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5430, 0.5469, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:48,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:38:48,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.83 | bwd_microstep: 4587.43 | bwd_inner_microstep: 4582.40 | bwd_allreduce_microstep: 4.90 | step_microstep: 45.00
-[2025-01-25 16:38:48,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.79 | bwd: 4587.45 | bwd_inner: 4582.40 | bwd_allreduce: 4.95 | step: 45.01
- 48%|████▊     | 2778/5800 [7:52:18<6:11:54,  7.38s/it]                                                       {'loss': 0.0195, 'grad_norm': 2.5376412868499756, 'learning_rate': 2.2328842372842335e-05, 'epoch': 23.95}
- 48%|████▊     | 2778/5800 [7:52:18<6:11:54,  7.38s/it]score1 tensor([[0.4883],
-        [0.5820],
-        [0.5586],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5508, 0.5273, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:38:55,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 16:38:55,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.93 | bwd_microstep: 4628.67 | bwd_inner_microstep: 4623.80 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.59
-[2025-01-25 16:38:55,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.89 | bwd: 4628.69 | bwd_inner: 4623.80 | bwd_allreduce: 4.80 | step: 42.60
- 48%|████▊     | 2779/5800 [7:52:25<6:04:34,  7.24s/it]                                                       {'loss': 0.0171, 'grad_norm': 8.71721076965332, 'learning_rate': 2.2317749859942016e-05, 'epoch': 23.96}
- 48%|████▊     | 2779/5800 [7:52:25<6:04:34,  7.24s/it]score1 tensor([[0.6250],
-        [0.6992],
-        [0.4316],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6875, 0.4316, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:39:02,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 16:39:02,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4575.86 | bwd_inner_microstep: 4570.61 | bwd_allreduce_microstep: 5.13 | step_microstep: 47.94
-[2025-01-25 16:39:02,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4575.88 | bwd_inner: 4570.61 | bwd_allreduce: 5.19 | step: 47.94
- 48%|████▊     | 2780/5800 [7:52:32<5:58:39,  7.13s/it]                                                       {'loss': 0.0083, 'grad_norm': 6.752485275268555, 'learning_rate': 2.230665662432707e-05, 'epoch': 23.97}
- 48%|████▊     | 2780/5800 [7:52:32<5:58:39,  7.13s/it]score1 tensor([[0.6328],
-        [0.4980],
-        [0.5352],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4648, 0.5273, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:39:09,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 16:39:09,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.95 | bwd_microstep: 4631.34 | bwd_inner_microstep: 4626.77 | bwd_allreduce_microstep: 4.50 | step_microstep: 41.77
-[2025-01-25 16:39:09,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.91 | bwd: 4631.36 | bwd_inner: 4626.77 | bwd_allreduce: 4.53 | step: 41.79
- 48%|████▊     | 2781/5800 [7:52:39<5:55:14,  7.06s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.520623505115509, 'learning_rate': 2.229556266945657e-05, 'epoch': 23.97}
- 48%|████▊     | 2781/5800 [7:52:39<5:55:14,  7.06s/it]score1 tensor([[0.5938],
-        [0.4180],
-        [0.4062],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4336, 0.3809, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:39:16,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 16:39:16,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.21 | bwd_microstep: 4625.36 | bwd_inner_microstep: 4620.04 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.88
-[2025-01-25 16:39:16,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4625.38 | bwd_inner: 4620.04 | bwd_allreduce: 5.27 | step: 43.88
- 48%|████▊     | 2782/5800 [7:52:45<5:52:41,  7.01s/it]                                                       {'loss': 0.0181, 'grad_norm': 4.591244220733643, 'learning_rate': 2.2284467998789792e-05, 'epoch': 23.98}
- 48%|████▊     | 2782/5800 [7:52:45<5:52:41,  7.01s/it]score1 tensor([[0.5391],
-        [0.6680],
-        [0.4297],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6641, 0.4492, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:39:22,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 16:39:22,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.79 | bwd_microstep: 4622.96 | bwd_inner_microstep: 4618.29 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.16
-[2025-01-25 16:39:22,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.75 | bwd: 4622.99 | bwd_inner: 4618.29 | bwd_allreduce: 4.62 | step: 42.17
- 48%|████▊     | 2783/5800 [7:52:52<5:50:49,  6.98s/it]                                                       {'loss': 0.0127, 'grad_norm': 0.756996750831604, 'learning_rate': 2.2273372615786254e-05, 'epoch': 23.99}
- 48%|████▊     | 2783/5800 [7:52:52<5:50:49,  6.98s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:39:27,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:39:27,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 572.19 | bwd_microstep: 1220.26 | bwd_inner_microstep: 1216.15 | bwd_allreduce_microstep: 4.04 | step_microstep: 41.59
-[2025-01-25 16:39:27,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 572.16 | bwd: 1220.28 | bwd_inner: 1216.15 | bwd_allreduce: 4.08 | step: 41.59
- 48%|████▊     | 2784/5800 [7:52:57<5:09:14,  6.15s/it]                                                       {'loss': 0.041, 'grad_norm': 7.6471076011657715, 'learning_rate': 2.226227652390569e-05, 'epoch': 24.0}
- 48%|████▊     | 2784/5800 [7:52:57<5:09:14,  6.15s/it][2025-01-25 16:39:31,667] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:39:41,287] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:39:51,315] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:40:01,163] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.3457],
-        [0.6562],
-        [0.6172],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.6367, 0.6445, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:40:20,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 16:40:20,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.23 | bwd_microstep: 4590.62 | bwd_inner_microstep: 4585.70 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.60
-[2025-01-25 16:40:20,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.16 | bwd: 4590.65 | bwd_inner: 4585.70 | bwd_allreduce: 4.89 | step: 42.60
- 48%|████▊     | 2785/5800 [7:53:50<17:05:55, 20.42s/it]                                                        {'loss': 0.0166, 'grad_norm': 0.6187290549278259, 'learning_rate': 2.225117972660805e-05, 'epoch': 24.01}
- 48%|████▊     | 2785/5800 [7:53:50<17:05:55, 20.42s/it]score1 tensor([[0.6562],
-        [0.3730],
-        [0.5117],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.3867, 0.5117, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:40:27,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 16:40:27,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2129.88 | bwd_microstep: 4538.74 | bwd_inner_microstep: 4533.90 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.29
-[2025-01-25 16:40:27,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2129.84 | bwd: 4538.77 | bwd_inner: 4533.90 | bwd_allreduce: 4.80 | step: 45.30
- 48%|████▊     | 2786/5800 [7:53:57<13:40:12, 16.33s/it]                                                        {'loss': 0.0083, 'grad_norm': 2.5738964080810547, 'learning_rate': 2.224008222735351e-05, 'epoch': 24.02}
- 48%|████▊     | 2786/5800 [7:53:57<13:40:12, 16.33s/it]score1 tensor([[0.4883],
-        [0.4785],
-        [0.4785],
-        [0.3262]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4746, 0.4922, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:40:34,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:40:34,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.90 | bwd_microstep: 4539.22 | bwd_inner_microstep: 4533.80 | bwd_allreduce_microstep: 5.28 | step_microstep: 50.80
-[2025-01-25 16:40:34,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.87 | bwd: 4539.25 | bwd_inner: 4533.80 | bwd_allreduce: 5.34 | step: 50.81
- 48%|████▊     | 2787/5800 [7:54:04<11:16:25, 13.47s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.7417035102844238, 'learning_rate': 2.2228984029602466e-05, 'epoch': 24.03}
- 48%|████▊     | 2787/5800 [7:54:04<11:16:25, 13.47s/it]score1 tensor([[0.4082],
-        [0.5234],
-        [0.4062],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4707, 0.4043, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:40:41,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:40:41,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.96 | bwd_microstep: 4601.15 | bwd_inner_microstep: 4592.15 | bwd_allreduce_microstep: 8.93 | step_microstep: 43.52
-[2025-01-25 16:40:41,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.92 | bwd: 4601.18 | bwd_inner: 4592.15 | bwd_allreduce: 8.96 | step: 43.52
- 48%|████▊     | 2788/5800 [7:54:11<9:36:40, 11.49s/it]                                                        {'loss': 0.0234, 'grad_norm': 0.540738582611084, 'learning_rate': 2.2217885136815518e-05, 'epoch': 24.03}
- 48%|████▊     | 2788/5800 [7:54:11<9:36:40, 11.49s/it]score1 tensor([[0.4277],
-        [0.5898],
-        [0.4473],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.5742, 0.4512, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:40:48,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 16:40:48,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.18 | bwd_microstep: 4603.74 | bwd_inner_microstep: 4599.14 | bwd_allreduce_microstep: 4.53 | step_microstep: 42.11
-[2025-01-25 16:40:48,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.13 | bwd: 4603.77 | bwd_inner: 4599.14 | bwd_allreduce: 4.57 | step: 42.12
- 48%|████▊     | 2789/5800 [7:54:18<8:26:52, 10.10s/it]                                                       {'loss': 0.0093, 'grad_norm': 0.9900962114334106, 'learning_rate': 2.2206785552453507e-05, 'epoch': 24.04}
- 48%|████▊     | 2789/5800 [7:54:18<8:26:52, 10.10s/it]score1 tensor([[0.4629],
-        [0.4863],
-        [0.3887],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5039, 0.4258, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:40:54,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 16:40:54,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.69 | bwd_microstep: 4567.74 | bwd_inner_microstep: 4559.52 | bwd_allreduce_microstep: 8.13 | step_microstep: 44.97
-[2025-01-25 16:40:54,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.64 | bwd: 4567.77 | bwd_inner: 4559.52 | bwd_allreduce: 8.17 | step: 44.98
- 48%|████▊     | 2790/5800 [7:54:24<7:37:25,  9.12s/it]                                                       {'loss': 0.0166, 'grad_norm': 1.8970834016799927, 'learning_rate': 2.2195685279977468e-05, 'epoch': 24.05}
- 48%|████▊     | 2790/5800 [7:54:24<7:37:25,  9.12s/it]score1 tensor([[0.5117],
-        [0.4375],
-        [0.4941],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4434, 0.4414, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:01,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 16:41:01,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.56 | bwd_microstep: 4598.19 | bwd_inner_microstep: 4593.76 | bwd_allreduce_microstep: 4.32 | step_microstep: 41.71
-[2025-01-25 16:41:01,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.52 | bwd: 4598.21 | bwd_inner: 4593.76 | bwd_allreduce: 4.37 | step: 41.72
- 48%|████▊     | 2791/5800 [7:54:31<7:03:18,  8.44s/it]                                                       {'loss': 0.019, 'grad_norm': 0.3738730549812317, 'learning_rate': 2.2184584322848658e-05, 'epoch': 24.06}
- 48%|████▊     | 2791/5800 [7:54:31<7:03:18,  8.44s/it]score1 tensor([[0.4102],
-        [0.3906],
-        [0.5703],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4043, 0.5430, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:08,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.37
-[2025-01-25 16:41:08,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.46 | bwd_microstep: 4600.36 | bwd_inner_microstep: 4595.49 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.82
-[2025-01-25 16:41:08,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.42 | bwd: 4600.38 | bwd_inner: 4595.49 | bwd_allreduce: 4.83 | step: 42.83
- 48%|████▊     | 2792/5800 [7:54:38<6:39:31,  7.97s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.5988840460777283, 'learning_rate': 2.217348268452856e-05, 'epoch': 24.07}
- 48%|████▊     | 2792/5800 [7:54:38<6:39:31,  7.97s/it]score1 tensor([[0.4160],
-        [0.6016],
-        [0.4746],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.6133, 0.4570, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:15,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 16:41:15,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.07 | bwd_microstep: 4602.01 | bwd_inner_microstep: 4596.98 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.01
-[2025-01-25 16:41:15,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.04 | bwd: 4602.03 | bwd_inner: 4596.98 | bwd_allreduce: 4.96 | step: 43.02
- 48%|████▊     | 2793/5800 [7:54:45<6:22:56,  7.64s/it]                                                       {'loss': 0.0171, 'grad_norm': 3.85587477684021, 'learning_rate': 2.2162380368478836e-05, 'epoch': 24.08}
- 48%|████▊     | 2793/5800 [7:54:45<6:22:56,  7.64s/it]score1 tensor([[0.6211],
-        [0.5000],
-        [0.6367],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4785, 0.6406, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:22,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.44 | optimizer_step: 4.36
-[2025-01-25 16:41:22,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.04 | bwd_microstep: 4608.03 | bwd_inner_microstep: 4603.52 | bwd_allreduce_microstep: 4.41 | step_microstep: 40.66
-[2025-01-25 16:41:22,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.00 | bwd: 4608.06 | bwd_inner: 4603.52 | bwd_allreduce: 4.47 | step: 40.67
- 48%|████▊     | 2794/5800 [7:54:52<6:11:16,  7.41s/it]                                                       {'loss': 0.0288, 'grad_norm': 4.2959489822387695, 'learning_rate': 2.2151277378161396e-05, 'epoch': 24.09}
- 48%|████▊     | 2794/5800 [7:54:52<6:11:16,  7.41s/it]score1 tensor([[0.5391],
-        [0.4395],
-        [0.4570],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4141, 0.4180, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0337, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:29,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 16:41:29,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.30 | bwd_microstep: 4559.43 | bwd_inner_microstep: 4554.68 | bwd_allreduce_microstep: 4.65 | step_microstep: 37.69
-[2025-01-25 16:41:29,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.27 | bwd: 4559.45 | bwd_inner: 4554.68 | bwd_allreduce: 4.70 | step: 37.70
- 48%|████▊     | 2795/5800 [7:54:59<6:02:10,  7.23s/it]                                                       {'loss': 0.0337, 'grad_norm': 5.6778483390808105, 'learning_rate': 2.2140173717038348e-05, 'epoch': 24.09}
- 48%|████▊     | 2795/5800 [7:54:59<6:02:10,  7.23s/it]score1 tensor([[0.4863],
-        [0.4707],
-        [0.5781],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4785, 0.5977, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:36,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 16:41:36,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.99 | bwd_microstep: 4610.40 | bwd_inner_microstep: 4605.33 | bwd_allreduce_microstep: 4.97 | step_microstep: 44.73
-[2025-01-25 16:41:36,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.95 | bwd: 4610.42 | bwd_inner: 4605.33 | bwd_allreduce: 5.01 | step: 44.75
- 48%|████▊     | 2796/5800 [7:55:06<5:56:43,  7.12s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.441125750541687, 'learning_rate': 2.212906938857199e-05, 'epoch': 24.1}
- 48%|████▊     | 2796/5800 [7:55:06<5:56:43,  7.12s/it]score1 tensor([[0.3809],
-        [0.5039],
-        [0.4160],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5156, 0.4473, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:43,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 16:41:43,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.48 | bwd_microstep: 4610.63 | bwd_inner_microstep: 4605.95 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.00
-[2025-01-25 16:41:43,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.44 | bwd: 4610.66 | bwd_inner: 4605.95 | bwd_allreduce: 4.62 | step: 42.00
- 48%|████▊     | 2797/5800 [7:55:13<5:52:59,  7.05s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.275949001312256, 'learning_rate': 2.211796439622486e-05, 'epoch': 24.11}
- 48%|████▊     | 2797/5800 [7:55:13<5:52:59,  7.05s/it]score1 tensor([[0.5547],
-        [0.4141],
-        [0.6484],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.3926, 0.6250, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:49,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 16:41:49,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.19 | bwd_microstep: 4613.70 | bwd_inner_microstep: 4608.46 | bwd_allreduce_microstep: 5.15 | step_microstep: 41.20
-[2025-01-25 16:41:49,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.16 | bwd: 4613.73 | bwd_inner: 4608.46 | bwd_allreduce: 5.20 | step: 41.21
- 48%|████▊     | 2798/5800 [7:55:19<5:50:21,  7.00s/it]                                                       {'loss': 0.0229, 'grad_norm': 0.6546617150306702, 'learning_rate': 2.2106858743459685e-05, 'epoch': 24.12}
- 48%|████▊     | 2798/5800 [7:55:19<5:50:21,  7.00s/it]score1 tensor([[0.5781],
-        [0.5039],
-        [0.6211],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4805, 0.6016, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:41:56,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 16:41:56,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.46 | bwd_microstep: 4614.26 | bwd_inner_microstep: 4609.04 | bwd_allreduce_microstep: 5.14 | step_microstep: 45.90
-[2025-01-25 16:41:56,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.43 | bwd: 4614.29 | bwd_inner: 4609.04 | bwd_allreduce: 5.18 | step: 45.91
- 48%|████▊     | 2799/5800 [7:55:26<5:48:36,  6.97s/it]                                                       {'loss': 0.02, 'grad_norm': 8.652414321899414, 'learning_rate': 2.209575243373939e-05, 'epoch': 24.13}
- 48%|████▊     | 2799/5800 [7:55:26<5:48:36,  6.97s/it]score1 tensor([[0.4727],
-        [0.6289],
-        [0.5625],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.6445, 0.6016, 0.3711], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:03,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:42:03,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.90 | bwd_microstep: 4615.31 | bwd_inner_microstep: 4610.63 | bwd_allreduce_microstep: 4.58 | step_microstep: 43.21
-[2025-01-25 16:42:03,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.85 | bwd: 4615.33 | bwd_inner: 4610.63 | bwd_allreduce: 4.63 | step: 43.22
- 48%|████▊     | 2800/5800 [7:55:33<5:47:17,  6.95s/it]                                                       {'loss': 0.02, 'grad_norm': 1.109348177909851, 'learning_rate': 2.2084645470527122e-05, 'epoch': 24.14}
- 48%|████▊     | 2800/5800 [7:55:33<5:47:17,  6.95s/it]score1 tensor([[0.4004],
-        [0.4531],
-        [0.5352],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.4766, 0.5469, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:10,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 16:42:10,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.88 | bwd_microstep: 4574.16 | bwd_inner_microstep: 4569.32 | bwd_allreduce_microstep: 4.76 | step_microstep: 39.44
-[2025-01-25 16:42:10,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.84 | bwd: 4574.19 | bwd_inner: 4569.32 | bwd_allreduce: 4.80 | step: 39.44
- 48%|████▊     | 2801/5800 [7:55:40<5:45:36,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 2.2666900157928467, 'learning_rate': 2.2073537857286226e-05, 'epoch': 24.15}
- 48%|████▊     | 2801/5800 [7:55:40<5:45:36,  6.91s/it]score1 tensor([[0.4199],
-        [0.5820],
-        [0.4844],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5391, 0.4492, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0464, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:17,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 16:42:17,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.57 | bwd_microstep: 4625.18 | bwd_inner_microstep: 4620.25 | bwd_allreduce_microstep: 4.82 | step_microstep: 46.18
-[2025-01-25 16:42:17,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.54 | bwd: 4625.21 | bwd_inner: 4620.25 | bwd_allreduce: 4.89 | step: 46.19
- 48%|████▊     | 2802/5800 [7:55:47<5:45:25,  6.91s/it]                                                       {'loss': 0.0464, 'grad_norm': 4.459285736083984, 'learning_rate': 2.206242959748025e-05, 'epoch': 24.16}
- 48%|████▊     | 2802/5800 [7:55:47<5:45:25,  6.91s/it]score1 tensor([[0.4570],
-        [0.4414],
-        [0.4492],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4590, 0.4082, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:24,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 16:42:24,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4618.57 | bwd_inner_microstep: 4613.87 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.78
-[2025-01-25 16:42:24,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4618.60 | bwd_inner: 4613.87 | bwd_allreduce: 4.66 | step: 42.78
- 48%|████▊     | 2803/5800 [7:55:54<5:45:03,  6.91s/it]                                                       {'loss': 0.02, 'grad_norm': 0.8009021282196045, 'learning_rate': 2.2051320694572934e-05, 'epoch': 24.16}
- 48%|████▊     | 2803/5800 [7:55:54<5:45:03,  6.91s/it]score1 tensor([[0.4336],
-        [0.3711],
-        [0.5664],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.3398, 0.5547, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:31,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.37
-[2025-01-25 16:42:31,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.50 | bwd_microstep: 4616.18 | bwd_inner_microstep: 4611.08 | bwd_allreduce_microstep: 5.02 | step_microstep: 45.43
-[2025-01-25 16:42:31,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.46 | bwd: 4616.21 | bwd_inner: 4611.08 | bwd_allreduce: 5.06 | step: 45.44
- 48%|████▊     | 2804/5800 [7:56:01<5:44:51,  6.91s/it]                                                       {'loss': 0.0156, 'grad_norm': 3.68670392036438, 'learning_rate': 2.2040211152028234e-05, 'epoch': 24.17}
- 48%|████▊     | 2804/5800 [7:56:01<5:44:51,  6.91s/it]score1 tensor([[0.6602],
-        [0.3145],
-        [0.4395],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3223, 0.4316, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:38,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 16:42:38,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.36 | bwd_microstep: 4615.35 | bwd_inner_microstep: 4610.42 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.07
-[2025-01-25 16:42:38,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.32 | bwd: 4615.37 | bwd_inner: 4610.42 | bwd_allreduce: 4.88 | step: 43.09
- 48%|████▊     | 2805/5800 [7:56:08<5:44:34,  6.90s/it]                                                       {'loss': 0.0186, 'grad_norm': 1.189853549003601, 'learning_rate': 2.2029100973310295e-05, 'epoch': 24.18}
- 48%|████▊     | 2805/5800 [7:56:08<5:44:34,  6.90s/it]score1 tensor([[0.4609],
-        [0.4375],
-        [0.5000],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4844, 0.5352, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:45,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 16:42:45,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.59 | bwd_microstep: 4631.12 | bwd_inner_microstep: 4626.20 | bwd_allreduce_microstep: 4.81 | step_microstep: 45.18
-[2025-01-25 16:42:45,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.56 | bwd: 4631.14 | bwd_inner: 4626.20 | bwd_allreduce: 4.87 | step: 45.20
- 48%|████▊     | 2806/5800 [7:56:15<5:44:31,  6.90s/it]                                                       {'loss': 0.0391, 'grad_norm': 8.120087623596191, 'learning_rate': 2.2017990161883464e-05, 'epoch': 24.19}
- 48%|████▊     | 2806/5800 [7:56:15<5:44:31,  6.90s/it]score1 tensor([[0.6172],
-        [0.5117],
-        [0.4785],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5352, 0.5352, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:51,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 16:42:51,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.59 | bwd_microstep: 4640.11 | bwd_inner_microstep: 4635.14 | bwd_allreduce_microstep: 4.89 | step_microstep: 48.84
-[2025-01-25 16:42:51,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.56 | bwd: 4640.13 | bwd_inner: 4635.14 | bwd_allreduce: 4.93 | step: 48.85
- 48%|████▊     | 2807/5800 [7:56:21<5:44:42,  6.91s/it]                                                       {'loss': 0.0371, 'grad_norm': 3.8501687049865723, 'learning_rate': 2.2006878721212285e-05, 'epoch': 24.2}
- 48%|████▊     | 2807/5800 [7:56:21<5:44:42,  6.91s/it]score1 tensor([[0.4473],
-        [0.5078],
-        [0.4434],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5195, 0.4453, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:42:58,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:42:58,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.27 | bwd_microstep: 4590.73 | bwd_inner_microstep: 4585.85 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.20
-[2025-01-25 16:42:58,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.24 | bwd: 4590.76 | bwd_inner: 4585.85 | bwd_allreduce: 4.84 | step: 46.21
- 48%|████▊     | 2808/5800 [7:56:28<5:43:56,  6.90s/it]                                                       {'loss': 0.0078, 'grad_norm': 5.9825663566589355, 'learning_rate': 2.1995766654761504e-05, 'epoch': 24.21}
- 48%|████▊     | 2808/5800 [7:56:28<5:43:56,  6.90s/it]score1 tensor([[0.6055],
-        [0.5547],
-        [0.4180],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6016, 0.4062, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:05,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 16:43:05,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.94 | bwd_microstep: 4578.30 | bwd_inner_microstep: 4573.41 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.36
-[2025-01-25 16:43:05,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.91 | bwd: 4578.33 | bwd_inner: 4573.41 | bwd_allreduce: 4.84 | step: 43.37
- 48%|████▊     | 2809/5800 [7:56:35<5:43:20,  6.89s/it]                                                       {'loss': 0.02, 'grad_norm': 2.351884365081787, 'learning_rate': 2.1984653965996046e-05, 'epoch': 24.22}
- 48%|████▊     | 2809/5800 [7:56:35<5:43:20,  6.89s/it]score1 tensor([[0.5039],
-        [0.4805],
-        [0.6211],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5000, 0.5781, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:12,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 16:43:12,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.82 | bwd_microstep: 4633.67 | bwd_inner_microstep: 4628.81 | bwd_allreduce_microstep: 4.79 | step_microstep: 44.48
-[2025-01-25 16:43:12,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.79 | bwd: 4633.69 | bwd_inner: 4628.81 | bwd_allreduce: 4.82 | step: 44.49
- 48%|████▊     | 2810/5800 [7:56:42<5:43:43,  6.90s/it]                                                       {'loss': 0.021, 'grad_norm': 0.7636982202529907, 'learning_rate': 2.1973540658381043e-05, 'epoch': 24.22}
- 48%|████▊     | 2810/5800 [7:56:42<5:43:43,  6.90s/it]score1 tensor([[0.3984],
-        [0.5234],
-        [0.5508],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4844, 0.5234, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:19,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 16:43:19,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.20 | bwd_microstep: 4633.62 | bwd_inner_microstep: 4628.26 | bwd_allreduce_microstep: 5.23 | step_microstep: 42.90
-[2025-01-25 16:43:19,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.16 | bwd: 4633.64 | bwd_inner: 4628.26 | bwd_allreduce: 5.29 | step: 42.91
- 48%|████▊     | 2811/5800 [7:56:49<5:44:05,  6.91s/it]                                                       {'loss': 0.0239, 'grad_norm': 8.235084533691406, 'learning_rate': 2.1962426735381824e-05, 'epoch': 24.23}
- 48%|████▊     | 2811/5800 [7:56:49<5:44:05,  6.91s/it]score1 tensor([[0.4902],
-        [0.3984],
-        [0.5195],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.3750, 0.5039, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:26,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:43:26,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.89 | bwd_microstep: 4639.27 | bwd_inner_microstep: 4633.76 | bwd_allreduce_microstep: 5.44 | step_microstep: 42.09
-[2025-01-25 16:43:26,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.82 | bwd: 4639.30 | bwd_inner: 4633.76 | bwd_allreduce: 5.48 | step: 42.11
- 48%|████▊     | 2812/5800 [7:56:56<5:44:05,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 3.739244222640991, 'learning_rate': 2.1951312200463897e-05, 'epoch': 24.24}
- 48%|████▊     | 2812/5800 [7:56:56<5:44:05,  6.91s/it]score1 tensor([[0.4688],
-        [0.4316],
-        [0.6797],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4316, 0.6641, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:33,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:43:33,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.66 | bwd_microstep: 4589.43 | bwd_inner_microstep: 4584.86 | bwd_allreduce_microstep: 4.48 | step_microstep: 41.77
-[2025-01-25 16:43:33,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.62 | bwd: 4589.45 | bwd_inner: 4584.86 | bwd_allreduce: 4.53 | step: 41.78
- 48%|████▊     | 2813/5800 [7:57:03<5:43:26,  6.90s/it]                                                       {'loss': 0.019, 'grad_norm': 2.47688364982605, 'learning_rate': 2.1940197057092964e-05, 'epoch': 24.25}
- 48%|████▊     | 2813/5800 [7:57:03<5:43:26,  6.90s/it]score1 tensor([[0.4727],
-        [0.4824],
-        [0.7031],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5156, 0.6953, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:40,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 16:43:40,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.28 | bwd_microstep: 4638.95 | bwd_inner_microstep: 4634.01 | bwd_allreduce_microstep: 4.86 | step_microstep: 46.36
-[2025-01-25 16:43:40,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.25 | bwd: 4638.97 | bwd_inner: 4634.01 | bwd_allreduce: 4.90 | step: 46.37
- 49%|████▊     | 2814/5800 [7:57:10<5:43:39,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 3.6877269744873047, 'learning_rate': 2.192908130873493e-05, 'epoch': 24.26}
- 49%|████▊     | 2814/5800 [7:57:10<5:43:39,  6.91s/it]score1 tensor([[0.5273],
-        [0.5117],
-        [0.4980],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5039, 0.4961, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:47,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 16:43:47,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.70 | bwd_microstep: 4644.61 | bwd_inner_microstep: 4639.45 | bwd_allreduce_microstep: 5.07 | step_microstep: 42.59
-[2025-01-25 16:43:47,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.67 | bwd: 4644.64 | bwd_inner: 4639.45 | bwd_allreduce: 5.12 | step: 42.59
- 49%|████▊     | 2815/5800 [7:57:17<5:43:54,  6.91s/it]                                                       {'loss': 0.0068, 'grad_norm': 3.960494041442871, 'learning_rate': 2.1917964958855863e-05, 'epoch': 24.27}
- 49%|████▊     | 2815/5800 [7:57:17<5:43:54,  6.91s/it]score1 tensor([[0.4961],
-        [0.6367],
-        [0.3887],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6172, 0.3457, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:43:54,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 16:43:54,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.03 | bwd_microstep: 4598.65 | bwd_inner_microstep: 4577.01 | bwd_allreduce_microstep: 21.49 | step_microstep: 79.49
-[2025-01-25 16:43:54,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4598.68 | bwd_inner: 4577.01 | bwd_allreduce: 21.58 | step: 79.50
- 49%|████▊     | 2816/5800 [7:57:24<5:43:55,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 2.2278318405151367, 'learning_rate': 2.1906848010922042e-05, 'epoch': 24.28}
- 49%|████▊     | 2816/5800 [7:57:24<5:43:55,  6.92s/it]score1 tensor([[0.4512],
-        [0.6094],
-        [0.4941],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6094, 0.4805, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:01,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 16:44:01,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.02 | bwd_microstep: 4592.07 | bwd_inner_microstep: 4586.92 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.30
-[2025-01-25 16:44:01,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.99 | bwd: 4592.09 | bwd_inner: 4586.92 | bwd_allreduce: 5.10 | step: 43.32
- 49%|████▊     | 2817/5800 [7:57:30<5:43:17,  6.90s/it]                                                       {'loss': 0.0059, 'grad_norm': 1.9540612697601318, 'learning_rate': 2.1895730468399936e-05, 'epoch': 24.28}
- 49%|████▊     | 2817/5800 [7:57:30<5:43:17,  6.90s/it]score1 tensor([[0.6172],
-        [0.5703],
-        [0.3965],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6328, 0.5977, 0.4004, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:07,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 16:44:07,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.46 | bwd_microstep: 4644.47 | bwd_inner_microstep: 4639.49 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.25
-[2025-01-25 16:44:07,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.42 | bwd: 4644.50 | bwd_inner: 4639.49 | bwd_allreduce: 4.93 | step: 45.27
- 49%|████▊     | 2818/5800 [7:57:37<5:43:31,  6.91s/it]                                                       {'loss': 0.0361, 'grad_norm': 8.478758811950684, 'learning_rate': 2.1884612334756166e-05, 'epoch': 24.29}
- 49%|████▊     | 2818/5800 [7:57:37<5:43:31,  6.91s/it]score1 tensor([[0.4707],
-        [0.4082],
-        [0.5273],
-        [0.3281]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4453, 0.5391, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:14,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 16:44:14,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.26 | bwd_microstep: 4641.70 | bwd_inner_microstep: 4636.99 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.48
-[2025-01-25 16:44:14,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.22 | bwd: 4641.73 | bwd_inner: 4636.99 | bwd_allreduce: 4.68 | step: 42.49
- 49%|████▊     | 2819/5800 [7:57:44<5:43:35,  6.92s/it]                                                       {'loss': 0.019, 'grad_norm': 7.670001983642578, 'learning_rate': 2.1873493613457572e-05, 'epoch': 24.3}
- 49%|████▊     | 2819/5800 [7:57:44<5:43:35,  6.92s/it]score1 tensor([[0.5820],
-        [0.5273],
-        [0.4316],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.5508, 0.4688, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:21,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 16:44:21,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.57 | bwd_microstep: 4635.44 | bwd_inner_microstep: 4630.43 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.29
-[2025-01-25 16:44:21,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4635.46 | bwd_inner: 4630.43 | bwd_allreduce: 4.96 | step: 44.31
- 49%|████▊     | 2820/5800 [7:57:51<5:43:37,  6.92s/it]                                                       {'loss': 0.0229, 'grad_norm': 0.47776687145233154, 'learning_rate': 2.1862374307971162e-05, 'epoch': 24.31}
- 49%|████▊     | 2820/5800 [7:57:51<5:43:37,  6.92s/it]score1 tensor([[0.4492],
-        [0.5742],
-        [0.5391],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6289, 0.5586, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:28,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 16:44:28,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.90 | bwd_microstep: 4639.11 | bwd_inner_microstep: 4634.34 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.42
-[2025-01-25 16:44:28,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.86 | bwd: 4639.14 | bwd_inner: 4634.34 | bwd_allreduce: 4.73 | step: 42.44
- 49%|████▊     | 2821/5800 [7:57:58<5:43:39,  6.92s/it]                                                       {'loss': 0.0283, 'grad_norm': 8.432687759399414, 'learning_rate': 2.1851254421764118e-05, 'epoch': 24.32}
- 49%|████▊     | 2821/5800 [7:57:58<5:43:39,  6.92s/it]score1 tensor([[0.5156],
-        [0.4336],
-        [0.5078],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4902, 0.4941, 0.3613], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:35,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 16:44:35,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.12 | bwd_microstep: 4637.56 | bwd_inner_microstep: 4632.37 | bwd_allreduce_microstep: 5.11 | step_microstep: 46.26
-[2025-01-25 16:44:35,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.09 | bwd: 4637.58 | bwd_inner: 4632.37 | bwd_allreduce: 5.14 | step: 46.27
- 49%|████▊     | 2822/5800 [7:58:05<5:43:32,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 4.018005847930908, 'learning_rate': 2.1840133958303835e-05, 'epoch': 24.33}
- 49%|████▊     | 2822/5800 [7:58:05<5:43:32,  6.92s/it]score1 tensor([[0.3613],
-        [0.4102],
-        [0.6836],
-        [0.3477]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4219, 0.7070, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:42,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 16:44:42,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.93 | bwd_microstep: 4641.48 | bwd_inner_microstep: 4636.42 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.18
-[2025-01-25 16:44:42,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.90 | bwd: 4641.51 | bwd_inner: 4636.42 | bwd_allreduce: 5.00 | step: 42.19
- 49%|████▊     | 2823/5800 [7:58:12<5:43:28,  6.92s/it]                                                       {'loss': 0.0181, 'grad_norm': 7.76607084274292, 'learning_rate': 2.1829012921057854e-05, 'epoch': 24.34}
- 49%|████▊     | 2823/5800 [7:58:12<5:43:28,  6.92s/it]score1 tensor([[0.5156],
-        [0.5156],
-        [0.4199],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5195, 0.4785, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:49,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.93 | optimizer_step: 4.36
-[2025-01-25 16:44:49,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4638.63 | bwd_inner_microstep: 4633.68 | bwd_allreduce_microstep: 4.86 | step_microstep: 46.76
-[2025-01-25 16:44:49,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4638.66 | bwd_inner: 4633.68 | bwd_allreduce: 4.90 | step: 46.77
- 49%|████▊     | 2824/5800 [7:58:19<5:43:23,  6.92s/it]                                                       {'loss': 0.0278, 'grad_norm': 4.0713372230529785, 'learning_rate': 2.1817891313493902e-05, 'epoch': 24.34}
- 49%|████▊     | 2824/5800 [7:58:19<5:43:23,  6.92s/it]score1 tensor([[0.3594],
-        [0.4590],
-        [0.5352],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.4395, 0.5469, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:44:56,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 16:44:56,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.67 | bwd_microstep: 4637.18 | bwd_inner_microstep: 4632.20 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.28
-[2025-01-25 16:44:56,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.64 | bwd: 4637.20 | bwd_inner: 4632.20 | bwd_allreduce: 4.93 | step: 41.29
- 49%|████▊     | 2825/5800 [7:58:26<5:43:11,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 4.106231689453125, 'learning_rate': 2.1806769139079904e-05, 'epoch': 24.35}
- 49%|████▊     | 2825/5800 [7:58:26<5:43:11,  6.92s/it]score1 tensor([[0.5391],
-        [0.4590],
-        [0.6484],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4727, 0.6797, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:03,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 16:45:03,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.84 | bwd_microstep: 4645.78 | bwd_inner_microstep: 4640.38 | bwd_allreduce_microstep: 5.29 | step_microstep: 42.58
-[2025-01-25 16:45:03,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.80 | bwd: 4645.80 | bwd_inner: 4640.38 | bwd_allreduce: 5.34 | step: 42.58
- 49%|████▊     | 2826/5800 [7:58:33<5:43:16,  6.93s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.697100639343262, 'learning_rate': 2.1795646401283933e-05, 'epoch': 24.36}
- 49%|████▊     | 2826/5800 [7:58:33<5:43:16,  6.93s/it]score1 tensor([[0.3984],
-        [0.4180],
-        [0.4746],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4453, 0.4609, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:10,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:45:10,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.71 | bwd_microstep: 4638.67 | bwd_inner_microstep: 4633.50 | bwd_allreduce_microstep: 5.08 | step_microstep: 43.28
-[2025-01-25 16:45:10,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.68 | bwd: 4638.69 | bwd_inner: 4633.50 | bwd_allreduce: 5.12 | step: 43.28
- 49%|████▊     | 2827/5800 [7:58:40<5:43:11,  6.93s/it]                                                       {'loss': 0.0317, 'grad_norm': 4.169806480407715, 'learning_rate': 2.1784523103574254e-05, 'epoch': 24.37}
- 49%|████▊     | 2827/5800 [7:58:40<5:43:11,  6.93s/it]score1 tensor([[0.5391],
-        [0.5000],
-        [0.5195],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4941, 0.5195, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:17,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:45:17,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.80 | bwd_microstep: 4581.82 | bwd_inner_microstep: 4576.49 | bwd_allreduce_microstep: 5.19 | step_microstep: 43.12
-[2025-01-25 16:45:17,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.77 | bwd: 4581.85 | bwd_inner: 4576.49 | bwd_allreduce: 5.27 | step: 43.12
- 49%|████▉     | 2828/5800 [7:58:47<5:42:07,  6.91s/it]                                                       {'loss': 0.0054, 'grad_norm': 6.452630519866943, 'learning_rate': 2.1773399249419312e-05, 'epoch': 24.38}
- 49%|████▉     | 2828/5800 [7:58:47<5:42:07,  6.91s/it]score1 tensor([[0.4805],
-        [0.5195],
-        [0.4961],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5000, 0.4531, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:24,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 16:45:24,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.44 | bwd_microstep: 4636.85 | bwd_inner_microstep: 4631.96 | bwd_allreduce_microstep: 4.79 | step_microstep: 45.00
-[2025-01-25 16:45:24,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.41 | bwd: 4636.88 | bwd_inner: 4631.96 | bwd_allreduce: 4.85 | step: 45.01
- 49%|████▉     | 2829/5800 [7:58:54<5:42:17,  6.91s/it]                                                       {'loss': 0.0283, 'grad_norm': 8.393837928771973, 'learning_rate': 2.1762274842287715e-05, 'epoch': 24.39}
- 49%|████▉     | 2829/5800 [7:58:54<5:42:17,  6.91s/it]score1 tensor([[0.5781],
-        [0.4609],
-        [0.4902],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4121, 0.4707, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:30,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 16:45:30,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4632.97 | bwd_inner_microstep: 4627.93 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.35
-[2025-01-25 16:45:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.67 | bwd: 4632.99 | bwd_inner: 4627.93 | bwd_allreduce: 4.99 | step: 42.35
- 49%|████▉     | 2830/5800 [7:59:00<5:42:11,  6.91s/it]                                                       {'loss': 0.0239, 'grad_norm': 8.629087448120117, 'learning_rate': 2.1751149885648237e-05, 'epoch': 24.4}
- 49%|████▉     | 2830/5800 [7:59:00<5:42:11,  6.91s/it]score1 tensor([[0.4160],
-        [0.4805],
-        [0.4258],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4922, 0.4062, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:37,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:45:37,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.99 | bwd_microstep: 4640.53 | bwd_inner_microstep: 4635.50 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.74
-[2025-01-25 16:45:37,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.96 | bwd: 4640.55 | bwd_inner: 4635.50 | bwd_allreduce: 4.98 | step: 44.76
- 49%|████▉     | 2831/5800 [7:59:07<5:42:08,  6.91s/it]                                                       {'loss': 0.0088, 'grad_norm': 4.010739803314209, 'learning_rate': 2.1740024382969844e-05, 'epoch': 24.41}
- 49%|████▉     | 2831/5800 [7:59:07<5:42:08,  6.91s/it]score1 tensor([[0.3730],
-        [0.5781],
-        [0.6250],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.6289, 0.6133, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:44,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 16:45:44,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.24 | bwd_microstep: 4640.24 | bwd_inner_microstep: 4635.25 | bwd_allreduce_microstep: 4.90 | step_microstep: 42.96
-[2025-01-25 16:45:44,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.19 | bwd: 4640.26 | bwd_inner: 4635.25 | bwd_allreduce: 4.95 | step: 42.97
- 49%|████▉     | 2832/5800 [7:59:14<5:42:10,  6.92s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.1925458908081055, 'learning_rate': 2.1728898337721657e-05, 'epoch': 24.41}
- 49%|████▉     | 2832/5800 [7:59:14<5:42:10,  6.92s/it]score1 tensor([[0.5469],
-        [0.5547],
-        [0.4395],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5625, 0.4238, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:51,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 16:45:51,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.96 | bwd_microstep: 4641.50 | bwd_inner_microstep: 4636.28 | bwd_allreduce_microstep: 5.11 | step_microstep: 47.89
-[2025-01-25 16:45:51,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4641.52 | bwd_inner: 4636.28 | bwd_allreduce: 5.17 | step: 47.90
- 49%|████▉     | 2833/5800 [7:59:21<5:42:16,  6.92s/it]                                                       {'loss': 0.0088, 'grad_norm': 0.4544055163860321, 'learning_rate': 2.1717771753372965e-05, 'epoch': 24.42}
- 49%|████▉     | 2833/5800 [7:59:21<5:42:16,  6.92s/it]score1 tensor([[0.5234],
-        [0.6445],
-        [0.4258],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.6289, 0.3906, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:45:58,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 16:45:58,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.14 | bwd_microstep: 4647.81 | bwd_inner_microstep: 4642.66 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.29
-[2025-01-25 16:45:58,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.10 | bwd: 4647.84 | bwd_inner: 4642.66 | bwd_allreduce: 5.11 | step: 43.30
- 49%|████▉     | 2834/5800 [7:59:28<5:42:17,  6.92s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.030498027801514, 'learning_rate': 2.170664463339324e-05, 'epoch': 24.43}
- 49%|████▉     | 2834/5800 [7:59:28<5:42:17,  6.92s/it]score1 tensor([[0.5234],
-        [0.4414],
-        [0.4941],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4648, 0.5000, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:05,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 16:46:05,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.46 | bwd_microstep: 4633.87 | bwd_inner_microstep: 4628.80 | bwd_allreduce_microstep: 4.99 | step_microstep: 41.54
-[2025-01-25 16:46:05,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.43 | bwd: 4633.90 | bwd_inner: 4628.80 | bwd_allreduce: 5.03 | step: 41.54
- 49%|████▉     | 2835/5800 [7:59:35<5:42:04,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.314779758453369, 'learning_rate': 2.1695516981252104e-05, 'epoch': 24.44}
- 49%|████▉     | 2835/5800 [7:59:35<5:42:04,  6.92s/it]score1 tensor([[0.4844],
-        [0.6016],
-        [0.4570],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6445, 0.4473, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:12,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:46:12,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.43 | bwd_microstep: 4648.59 | bwd_inner_microstep: 4643.38 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.67
-[2025-01-25 16:46:12,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.40 | bwd: 4648.61 | bwd_inner: 4643.38 | bwd_allreduce: 5.15 | step: 45.68
- 49%|████▉     | 2836/5800 [7:59:42<5:42:11,  6.93s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.7509508728981018, 'learning_rate': 2.168438880041935e-05, 'epoch': 24.45}
- 49%|████▉     | 2836/5800 [7:59:42<5:42:11,  6.93s/it]score1 tensor([[0.5781],
-        [0.4707],
-        [0.5625],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4844, 0.5508, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:19,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 16:46:19,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.88 | bwd_microstep: 4643.53 | bwd_inner_microstep: 4637.92 | bwd_allreduce_microstep: 5.51 | step_microstep: 47.43
-[2025-01-25 16:46:19,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.85 | bwd: 4643.56 | bwd_inner: 4637.92 | bwd_allreduce: 5.57 | step: 47.44
- 49%|████▉     | 2837/5800 [7:59:49<5:42:06,  6.93s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.8849055767059326, 'learning_rate': 2.167326009436494e-05, 'epoch': 24.46}
- 49%|████▉     | 2837/5800 [7:59:49<5:42:06,  6.93s/it]score1 tensor([[0.5156],
-        [0.5547],
-        [0.5469],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5664, 0.5625, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:26,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 16:46:26,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.75 | bwd_microstep: 4643.61 | bwd_inner_microstep: 4638.66 | bwd_allreduce_microstep: 4.86 | step_microstep: 44.05
-[2025-01-25 16:46:26,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.71 | bwd: 4643.64 | bwd_inner: 4638.66 | bwd_allreduce: 4.91 | step: 44.06
- 49%|████▉     | 2838/5800 [7:59:56<5:42:01,  6.93s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.200005054473877, 'learning_rate': 2.1662130866559e-05, 'epoch': 24.47}
- 49%|████▉     | 2838/5800 [7:59:56<5:42:01,  6.93s/it]score1 tensor([[0.4648],
-        [0.6562],
-        [0.4844],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6602, 0.4863, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:33,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 16:46:33,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.35 | bwd_microstep: 4645.66 | bwd_inner_microstep: 4640.79 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.61
-[2025-01-25 16:46:33,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.32 | bwd: 4645.69 | bwd_inner: 4640.79 | bwd_allreduce: 4.84 | step: 44.62
- 49%|████▉     | 2839/5800 [8:00:03<5:41:55,  6.93s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.58032512664795, 'learning_rate': 2.1651001120471818e-05, 'epoch': 24.47}
- 49%|████▉     | 2839/5800 [8:00:03<5:41:55,  6.93s/it]score1 tensor([[0.3691],
-        [0.5898],
-        [0.4277],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.5742, 0.4004, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:40,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:46:40,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.66 | bwd_microstep: 4649.04 | bwd_inner_microstep: 4644.18 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.32
-[2025-01-25 16:46:40,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.62 | bwd: 4649.07 | bwd_inner: 4644.18 | bwd_allreduce: 4.81 | step: 43.26
- 49%|████▉     | 2840/5800 [8:00:10<5:41:57,  6.93s/it]                                                       {'loss': 0.0215, 'grad_norm': 0.6992490887641907, 'learning_rate': 2.1639870859573836e-05, 'epoch': 24.48}
- 49%|████▉     | 2840/5800 [8:00:10<5:41:57,  6.93s/it]score1 tensor([[0.5000],
-        [0.4414],
-        [0.4180],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4043, 0.4180, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:47,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 16:46:47,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.50 | bwd_microstep: 4592.47 | bwd_inner_microstep: 4587.43 | bwd_allreduce_microstep: 4.95 | step_microstep: 45.95
-[2025-01-25 16:46:47,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.43 | bwd: 4592.49 | bwd_inner: 4587.43 | bwd_allreduce: 5.00 | step: 45.95
- 49%|████▉     | 2841/5800 [8:00:17<5:41:01,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 2.572749614715576, 'learning_rate': 2.1628740087335668e-05, 'epoch': 24.49}
- 49%|████▉     | 2841/5800 [8:00:17<5:41:01,  6.92s/it]score1 tensor([[0.5234],
-        [0.4082],
-        [0.4297],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4082, 0.4258, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:46:54,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 16:46:54,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.26 | bwd_microstep: 4598.73 | bwd_inner_microstep: 4593.47 | bwd_allreduce_microstep: 5.16 | step_microstep: 49.72
-[2025-01-25 16:46:54,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.21 | bwd: 4598.76 | bwd_inner: 4593.47 | bwd_allreduce: 5.21 | step: 49.73
- 49%|████▉     | 2842/5800 [8:00:23<5:40:35,  6.91s/it]                                                       {'loss': 0.0107, 'grad_norm': 2.1844778060913086, 'learning_rate': 2.1617608807228087e-05, 'epoch': 24.5}
- 49%|████▉     | 2842/5800 [8:00:23<5:40:35,  6.91s/it]score1 tensor([[0.5664],
-        [0.3984],
-        [0.6211],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.3945, 0.6094, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:00,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 16:47:00,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.49 | bwd_microstep: 4633.58 | bwd_inner_microstep: 4628.60 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.68
-[2025-01-25 16:47:00,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.46 | bwd: 4633.60 | bwd_inner: 4628.60 | bwd_allreduce: 4.93 | step: 44.69
- 49%|████▉     | 2843/5800 [8:00:30<5:40:45,  6.91s/it]                                                       {'loss': 0.0156, 'grad_norm': 4.515658378601074, 'learning_rate': 2.1606477022722005e-05, 'epoch': 24.51}
- 49%|████▉     | 2843/5800 [8:00:30<5:40:45,  6.91s/it]score1 tensor([[0.4766],
-        [0.6289],
-        [0.4102],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.6172, 0.3984, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:07,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 16:47:07,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.65 | bwd_microstep: 4646.67 | bwd_inner_microstep: 4641.62 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.20
-[2025-01-25 16:47:07,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4646.70 | bwd_inner: 4641.62 | bwd_allreduce: 4.99 | step: 43.21
- 49%|████▉     | 2844/5800 [8:00:37<5:40:50,  6.92s/it]                                                       {'loss': 0.02, 'grad_norm': 8.277668952941895, 'learning_rate': 2.1595344737288513e-05, 'epoch': 24.52}
- 49%|████▉     | 2844/5800 [8:00:37<5:40:50,  6.92s/it]score1 tensor([[0.4453],
-        [0.4160],
-        [0.5430],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4043, 0.4941, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:14,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 16:47:14,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.97 | bwd_microstep: 4638.19 | bwd_inner_microstep: 4633.06 | bwd_allreduce_microstep: 5.06 | step_microstep: 51.66
-[2025-01-25 16:47:14,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.92 | bwd: 4638.22 | bwd_inner: 4633.06 | bwd_allreduce: 5.09 | step: 51.67
- 49%|████▉     | 2845/5800 [8:00:44<5:40:54,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 8.03347396850586, 'learning_rate': 2.1584211954398845e-05, 'epoch': 24.53}
- 49%|████▉     | 2845/5800 [8:00:44<5:40:54,  6.92s/it]score1 tensor([[0.4258],
-        [0.5078],
-        [0.5078],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4922, 0.5039, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:21,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:47:21,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.10 | bwd_microstep: 4643.25 | bwd_inner_microstep: 4638.22 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.80
-[2025-01-25 16:47:21,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.07 | bwd: 4643.27 | bwd_inner: 4638.22 | bwd_allreduce: 4.98 | step: 43.81
- 49%|████▉     | 2846/5800 [8:00:51<5:40:55,  6.92s/it]                                                       {'loss': 0.019, 'grad_norm': 4.431514263153076, 'learning_rate': 2.157307867752441e-05, 'epoch': 24.53}
- 49%|████▉     | 2846/5800 [8:00:51<5:40:55,  6.92s/it]score1 tensor([[0.4180],
-        [0.5078],
-        [0.6094],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.5000, 0.5586, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:28,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:47:28,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4638.48 | bwd_inner_microstep: 4632.96 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.00
-[2025-01-25 16:47:28,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.85 | bwd: 4638.51 | bwd_inner: 4632.96 | bwd_allreduce: 5.45 | step: 44.01
- 49%|████▉     | 2847/5800 [8:00:58<5:40:46,  6.92s/it]                                                       {'loss': 0.0322, 'grad_norm': 4.831479549407959, 'learning_rate': 2.156194491013674e-05, 'epoch': 24.54}
- 49%|████▉     | 2847/5800 [8:00:58<5:40:46,  6.92s/it]score1 tensor([[0.4121],
-        [0.5312],
-        [0.4707],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5469, 0.4668, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:35,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 16:47:35,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.76 | bwd_microstep: 4643.66 | bwd_inner_microstep: 4637.82 | bwd_allreduce_microstep: 5.69 | step_microstep: 48.41
-[2025-01-25 16:47:35,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.71 | bwd: 4643.69 | bwd_inner: 4637.81 | bwd_allreduce: 5.74 | step: 48.42
- 49%|████▉     | 2848/5800 [8:01:05<5:40:55,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.5629718899726868, 'learning_rate': 2.155081065570756e-05, 'epoch': 24.55}
- 49%|████▉     | 2848/5800 [8:01:05<5:40:55,  6.93s/it]score1 tensor([[0.4414],
-        [0.4219],
-        [0.4238],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4473, 0.4355, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:42,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.36
-[2025-01-25 16:47:42,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.80 | bwd_microstep: 4638.27 | bwd_inner_microstep: 4633.01 | bwd_allreduce_microstep: 5.16 | step_microstep: 44.44
-[2025-01-25 16:47:42,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.76 | bwd: 4638.29 | bwd_inner: 4633.01 | bwd_allreduce: 5.21 | step: 44.44
- 49%|████▉     | 2849/5800 [8:01:12<5:40:48,  6.93s/it]                                                       {'loss': 0.022, 'grad_norm': 7.5895185470581055, 'learning_rate': 2.153967591770869e-05, 'epoch': 24.56}
- 49%|████▉     | 2849/5800 [8:01:12<5:40:48,  6.93s/it]score1 tensor([[0.4219],
-        [0.5117],
-        [0.4219],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5469, 0.4844, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0435, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:49,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.53 | optimizer_step: 4.36
-[2025-01-25 16:47:49,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.44 | bwd_microstep: 4637.54 | bwd_inner_microstep: 4632.42 | bwd_allreduce_microstep: 5.01 | step_microstep: 48.06
-[2025-01-25 16:47:49,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.41 | bwd: 4637.56 | bwd_inner: 4632.42 | bwd_allreduce: 5.07 | step: 48.09
- 49%|████▉     | 2850/5800 [8:01:19<5:40:40,  6.93s/it]                                                       {'loss': 0.0435, 'grad_norm': 8.175063133239746, 'learning_rate': 2.152854069961216e-05, 'epoch': 24.57}
- 49%|████▉     | 2850/5800 [8:01:19<5:40:40,  6.93s/it]score1 tensor([[0.4258],
-        [0.4512],
-        [0.5469],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4805, 0.5664, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:47:56,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 16:47:56,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.89 | bwd_microstep: 4635.72 | bwd_inner_microstep: 4630.82 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.67
-[2025-01-25 16:47:56,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.86 | bwd: 4635.74 | bwd_inner: 4630.82 | bwd_allreduce: 4.84 | step: 42.67
- 49%|████▉     | 2851/5800 [8:01:26<5:40:27,  6.93s/it]                                                       {'loss': 0.0171, 'grad_norm': 8.141271591186523, 'learning_rate': 2.151740500489012e-05, 'epoch': 24.58}
- 49%|████▉     | 2851/5800 [8:01:26<5:40:27,  6.93s/it]score1 tensor([[0.4941],
-        [0.5000],
-        [0.4922],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5000, 0.5156, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:03,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 16:48:03,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.51 | bwd_microstep: 4583.83 | bwd_inner_microstep: 4578.93 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.41
-[2025-01-25 16:48:03,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.48 | bwd: 4583.85 | bwd_inner: 4578.93 | bwd_allreduce: 4.83 | step: 43.41
- 49%|████▉     | 2852/5800 [8:01:33<5:39:33,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 6.179383277893066, 'learning_rate': 2.150626883701487e-05, 'epoch': 24.59}
- 49%|████▉     | 2852/5800 [8:01:33<5:39:33,  6.91s/it]score1 tensor([[0.4062],
-        [0.4824],
-        [0.5625],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5078, 0.6172, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:10,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 16:48:10,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.50 | bwd_microstep: 4642.03 | bwd_inner_microstep: 4637.24 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.33
-[2025-01-25 16:48:10,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.46 | bwd: 4642.06 | bwd_inner: 4637.24 | bwd_allreduce: 4.75 | step: 43.34
- 49%|████▉     | 2853/5800 [8:01:40<5:39:39,  6.92s/it]                                                       {'loss': 0.04, 'grad_norm': 8.0107421875, 'learning_rate': 2.1495132199458855e-05, 'epoch': 24.59}
- 49%|████▉     | 2853/5800 [8:01:40<5:39:39,  6.92s/it]score1 tensor([[0.3867],
-        [0.5078],
-        [0.3633],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.5508, 0.3809, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0376, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:17,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 16:48:17,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.50 | bwd_microstep: 4633.01 | bwd_inner_microstep: 4628.29 | bwd_allreduce_microstep: 4.63 | step_microstep: 50.09
-[2025-01-25 16:48:17,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.47 | bwd: 4633.03 | bwd_inner: 4628.29 | bwd_allreduce: 4.68 | step: 50.11
- 49%|████▉     | 2854/5800 [8:01:47<5:39:46,  6.92s/it]                                                       {'loss': 0.0376, 'grad_norm': 7.772265911102295, 'learning_rate': 2.148399509569468e-05, 'epoch': 24.6}
- 49%|████▉     | 2854/5800 [8:01:47<5:39:46,  6.92s/it]score1 tensor([[0.5859],
-        [0.3730],
-        [0.5781],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3340, 0.5625, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:24,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 16:48:24,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.55 | bwd_microstep: 4641.94 | bwd_inner_microstep: 4637.05 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.66
-[2025-01-25 16:48:24,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.51 | bwd: 4641.96 | bwd_inner: 4637.05 | bwd_allreduce: 4.84 | step: 42.68
- 49%|████▉     | 2855/5800 [8:01:54<5:39:47,  6.92s/it]                                                       {'loss': 0.0234, 'grad_norm': 4.061330318450928, 'learning_rate': 2.1472857529195087e-05, 'epoch': 24.61}
- 49%|████▉     | 2855/5800 [8:01:54<5:39:47,  6.92s/it]score1 tensor([[0.6016],
-        [0.4141],
-        [0.5625],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4277, 0.5469, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:30,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 16:48:30,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.56 | bwd_microstep: 4641.15 | bwd_inner_microstep: 4636.26 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.48
-[2025-01-25 16:48:30,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.50 | bwd: 4641.18 | bwd_inner: 4636.26 | bwd_allreduce: 4.84 | step: 42.48
- 49%|████▉     | 2856/5800 [8:02:00<5:39:44,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 0.37546610832214355, 'learning_rate': 2.1461719503432962e-05, 'epoch': 24.62}
- 49%|████▉     | 2856/5800 [8:02:00<5:39:44,  6.92s/it]score1 tensor([[0.6680],
-        [0.6680],
-        [0.5664],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.6445, 0.5625, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:37,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 16:48:37,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.99 | bwd_microstep: 4634.52 | bwd_inner_microstep: 4629.88 | bwd_allreduce_microstep: 4.56 | step_microstep: 43.02
-[2025-01-25 16:48:37,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.94 | bwd: 4634.55 | bwd_inner: 4629.88 | bwd_allreduce: 4.60 | step: 43.03
- 49%|████▉     | 2857/5800 [8:02:07<5:39:31,  6.92s/it]                                                       {'loss': 0.021, 'grad_norm': 5.136723041534424, 'learning_rate': 2.145058102188133e-05, 'epoch': 24.63}
- 49%|████▉     | 2857/5800 [8:02:07<5:39:31,  6.92s/it]score1 tensor([[0.4355],
-        [0.5469],
-        [0.5938],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5898, 0.6055, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:44,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 16:48:44,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.68 | bwd_microstep: 4643.32 | bwd_inner_microstep: 4638.02 | bwd_allreduce_microstep: 5.20 | step_microstep: 44.03
-[2025-01-25 16:48:44,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.64 | bwd: 4643.34 | bwd_inner: 4638.02 | bwd_allreduce: 5.25 | step: 44.04
- 49%|████▉     | 2858/5800 [8:02:14<5:39:31,  6.92s/it]                                                       {'loss': 0.0244, 'grad_norm': 0.728549599647522, 'learning_rate': 2.1439442088013362e-05, 'epoch': 24.64}
- 49%|████▉     | 2858/5800 [8:02:14<5:39:31,  6.92s/it]score1 tensor([[0.4648],
-        [0.5820],
-        [0.4336],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5820, 0.3262, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:51,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 16:48:51,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.33 | bwd_microstep: 4578.58 | bwd_inner_microstep: 4573.51 | bwd_allreduce_microstep: 4.95 | step_microstep: 45.47
-[2025-01-25 16:48:51,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.30 | bwd: 4578.60 | bwd_inner: 4573.51 | bwd_allreduce: 5.01 | step: 45.47
- 49%|████▉     | 2859/5800 [8:02:21<5:38:30,  6.91s/it]                                                       {'loss': 0.0288, 'grad_norm': 5.896834850311279, 'learning_rate': 2.142830270530238e-05, 'epoch': 24.65}
- 49%|████▉     | 2859/5800 [8:02:21<5:38:30,  6.91s/it]score1 tensor([[0.5312],
-        [0.4629],
-        [0.6523],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4512, 0.6367, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:48:58,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.37
-[2025-01-25 16:48:58,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.79 | bwd_microstep: 4633.56 | bwd_inner_microstep: 4628.39 | bwd_allreduce_microstep: 5.06 | step_microstep: 44.41
-[2025-01-25 16:48:58,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.75 | bwd: 4633.58 | bwd_inner: 4628.39 | bwd_allreduce: 5.12 | step: 44.43
- 49%|████▉     | 2860/5800 [8:02:28<5:38:33,  6.91s/it]                                                       {'loss': 0.0146, 'grad_norm': 8.85073471069336, 'learning_rate': 2.1417162877221833e-05, 'epoch': 24.66}
- 49%|████▉     | 2860/5800 [8:02:28<5:38:33,  6.91s/it]score1 tensor([[0.5078],
-        [0.6133],
-        [0.4746],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5898, 0.4570, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:05,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 16:49:05,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.24 | bwd_microstep: 4643.29 | bwd_inner_microstep: 4637.76 | bwd_allreduce_microstep: 5.41 | step_microstep: 45.08
-[2025-01-25 16:49:05,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.21 | bwd: 4643.32 | bwd_inner: 4637.76 | bwd_allreduce: 5.48 | step: 45.09
- 49%|████▉     | 2861/5800 [8:02:35<5:38:56,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.545052528381348, 'learning_rate': 2.1406022607245304e-05, 'epoch': 24.66}
- 49%|████▉     | 2861/5800 [8:02:35<5:38:56,  6.92s/it]score1 tensor([[0.6758],
-        [0.5273],
-        [0.4316],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5039, 0.4355, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:12,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 16:49:12,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.80 | bwd_microstep: 4632.83 | bwd_inner_microstep: 4627.44 | bwd_allreduce_microstep: 5.30 | step_microstep: 44.62
-[2025-01-25 16:49:12,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.76 | bwd: 4632.85 | bwd_inner: 4627.44 | bwd_allreduce: 5.34 | step: 44.61
- 49%|████▉     | 2862/5800 [8:02:42<5:38:46,  6.92s/it]                                                       {'loss': 0.0244, 'grad_norm': 0.5712491869926453, 'learning_rate': 2.139488189884653e-05, 'epoch': 24.67}
- 49%|████▉     | 2862/5800 [8:02:42<5:38:46,  6.92s/it]score1 tensor([[0.6328],
-        [0.3633],
-        [0.4473],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.3672, 0.4844, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:19,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 16:49:19,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4633.21 | bwd_inner_microstep: 4628.33 | bwd_allreduce_microstep: 4.80 | step_microstep: 45.54
-[2025-01-25 16:49:19,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.26 | bwd: 4633.23 | bwd_inner: 4628.33 | bwd_allreduce: 4.84 | step: 45.54
- 49%|████▉     | 2863/5800 [8:02:49<5:38:38,  6.92s/it]                                                       {'loss': 0.0249, 'grad_norm': 3.6069142818450928, 'learning_rate': 2.138374075549938e-05, 'epoch': 24.68}
- 49%|████▉     | 2863/5800 [8:02:49<5:38:38,  6.92s/it]score1 tensor([[0.4824],
-        [0.5664],
-        [0.4453],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.6211, 0.5312, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0479, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:26,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 16:49:26,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.82 | bwd_microstep: 4582.48 | bwd_inner_microstep: 4577.33 | bwd_allreduce_microstep: 5.02 | step_microstep: 43.29
-[2025-01-25 16:49:26,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.79 | bwd: 4582.50 | bwd_inner: 4577.33 | bwd_allreduce: 5.09 | step: 43.30
- 49%|████▉     | 2864/5800 [8:02:56<5:37:46,  6.90s/it]                                                       {'loss': 0.0479, 'grad_norm': 6.246973991394043, 'learning_rate': 2.1372599180677854e-05, 'epoch': 24.69}
- 49%|████▉     | 2864/5800 [8:02:56<5:37:46,  6.90s/it]score1 tensor([[0.5234],
-        [0.4277],
-        [0.3730],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4551, 0.3652, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:33,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 16:49:33,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.66 | bwd_microstep: 4636.54 | bwd_inner_microstep: 4631.62 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.15
-[2025-01-25 16:49:33,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.62 | bwd: 4636.57 | bwd_inner: 4631.62 | bwd_allreduce: 4.86 | step: 44.16
- 49%|████▉     | 2865/5800 [8:03:03<5:38:05,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.052162170410156, 'learning_rate': 2.1361457177856083e-05, 'epoch': 24.7}
- 49%|████▉     | 2865/5800 [8:03:03<5:38:05,  6.91s/it]score1 tensor([[0.4629],
-        [0.6016],
-        [0.5430],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5664, 0.5273, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:40,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 16:49:40,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4635.04 | bwd_inner_microstep: 4629.87 | bwd_allreduce_microstep: 5.10 | step_microstep: 47.59
-[2025-01-25 16:49:40,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.25 | bwd: 4635.07 | bwd_inner: 4629.87 | bwd_allreduce: 5.13 | step: 47.60
- 49%|████▉     | 2866/5800 [8:03:10<5:38:09,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.7130522727966309, 'learning_rate': 2.1350314750508345e-05, 'epoch': 24.71}
- 49%|████▉     | 2866/5800 [8:03:10<5:38:09,  6.92s/it]score1 tensor([[0.5195],
-        [0.5820],
-        [0.3730],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6055, 0.3652, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:47,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 16:49:47,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.13 | bwd_microstep: 4633.48 | bwd_inner_microstep: 4626.81 | bwd_allreduce_microstep: 6.58 | step_microstep: 43.72
-[2025-01-25 16:49:47,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.08 | bwd: 4633.51 | bwd_inner: 4626.81 | bwd_allreduce: 6.62 | step: 43.73
- 49%|████▉     | 2867/5800 [8:03:16<5:38:01,  6.92s/it]                                                       {'loss': 0.0215, 'grad_norm': 3.548870801925659, 'learning_rate': 2.1339171902109037e-05, 'epoch': 24.72}
- 49%|████▉     | 2867/5800 [8:03:16<5:38:01,  6.92s/it]score1 tensor([[0.4766],
-        [0.4688],
-        [0.4316],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4941, 0.4180, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:49:53,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 16:49:53,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.47 | bwd_microstep: 4588.72 | bwd_inner_microstep: 4584.09 | bwd_allreduce_microstep: 4.53 | step_microstep: 43.05
-[2025-01-25 16:49:53,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.42 | bwd: 4588.74 | bwd_inner: 4584.09 | bwd_allreduce: 4.58 | step: 43.06
- 49%|████▉     | 2868/5800 [8:03:23<5:37:23,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 2.2058801651000977, 'learning_rate': 2.132802863613269e-05, 'epoch': 24.72}
- 49%|████▉     | 2868/5800 [8:03:23<5:37:23,  6.90s/it]score1 tensor([[0.5078],
-        [0.5977],
-        [0.4629],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5938, 0.4688, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:00,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 16:50:00,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.42 | bwd_microstep: 4644.33 | bwd_inner_microstep: 4635.20 | bwd_allreduce_microstep: 9.03 | step_microstep: 45.06
-[2025-01-25 16:50:00,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.39 | bwd: 4644.35 | bwd_inner: 4635.20 | bwd_allreduce: 9.08 | step: 45.06
- 49%|████▉     | 2869/5800 [8:03:30<5:37:37,  6.91s/it]                                                       {'loss': 0.0078, 'grad_norm': 0.6904929876327515, 'learning_rate': 2.1316884956053974e-05, 'epoch': 24.73}
- 49%|████▉     | 2869/5800 [8:03:30<5:37:37,  6.91s/it]score1 tensor([[0.6289],
-        [0.3965],
-        [0.6289],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.3750, 0.6641, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:07,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 16:50:07,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.55 | bwd_microstep: 4637.95 | bwd_inner_microstep: 4633.15 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.43
-[2025-01-25 16:50:07,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.51 | bwd: 4637.98 | bwd_inner: 4633.15 | bwd_allreduce: 4.76 | step: 43.44
- 49%|████▉     | 2870/5800 [8:03:37<5:37:36,  6.91s/it]                                                       {'loss': 0.0278, 'grad_norm': 0.816703200340271, 'learning_rate': 2.1305740865347674e-05, 'epoch': 24.74}
- 49%|████▉     | 2870/5800 [8:03:37<5:37:36,  6.91s/it]score1 tensor([[0.4980],
-        [0.5586],
-        [0.5078],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5430, 0.4980, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:14,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 16:50:14,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.19 | bwd_microstep: 4641.52 | bwd_inner_microstep: 4636.58 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.21
-[2025-01-25 16:50:14,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.16 | bwd: 4641.54 | bwd_inner: 4636.58 | bwd_allreduce: 4.89 | step: 44.23
- 50%|████▉     | 2871/5800 [8:03:44<5:37:40,  6.92s/it]                                                       {'loss': 0.0156, 'grad_norm': 4.447737216949463, 'learning_rate': 2.1294596367488717e-05, 'epoch': 24.75}
- 50%|████▉     | 2871/5800 [8:03:44<5:37:40,  6.92s/it]score1 tensor([[0.4570],
-        [0.3770],
-        [0.3691],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.3730, 0.3789, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:21,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 16:50:21,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.86 | bwd_microstep: 4643.10 | bwd_inner_microstep: 4638.00 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.96
-[2025-01-25 16:50:21,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.82 | bwd: 4643.12 | bwd_inner: 4638.00 | bwd_allreduce: 5.05 | step: 42.97
- 50%|████▉     | 2872/5800 [8:03:51<5:37:39,  6.92s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.5330850481987, 'learning_rate': 2.1283451465952153e-05, 'epoch': 24.76}
- 50%|████▉     | 2872/5800 [8:03:51<5:37:39,  6.92s/it]score1 tensor([[0.4785],
-        [0.5469],
-        [0.4160],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5391, 0.4180, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:28,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 16:50:28,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.51 | bwd_microstep: 4632.78 | bwd_inner_microstep: 4627.37 | bwd_allreduce_microstep: 5.31 | step_microstep: 46.79
-[2025-01-25 16:50:28,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.48 | bwd: 4632.81 | bwd_inner: 4627.37 | bwd_allreduce: 5.36 | step: 46.80
- 50%|████▉     | 2873/5800 [8:03:58<5:37:31,  6.92s/it]                                                       {'loss': 0.0122, 'grad_norm': 0.5593582987785339, 'learning_rate': 2.127230616421315e-05, 'epoch': 24.77}
- 50%|████▉     | 2873/5800 [8:03:58<5:37:31,  6.92s/it]score1 tensor([[0.4473],
-        [0.4785],
-        [0.4492],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4766, 0.4375, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:35,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 16:50:35,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.43 | bwd_microstep: 4639.65 | bwd_inner_microstep: 4634.48 | bwd_allreduce_microstep: 5.09 | step_microstep: 46.89
-[2025-01-25 16:50:35,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.39 | bwd: 4639.68 | bwd_inner: 4634.48 | bwd_allreduce: 5.13 | step: 46.90
- 50%|████▉     | 2874/5800 [8:04:05<5:37:38,  6.92s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.5393314361572266, 'learning_rate': 2.126116046574701e-05, 'epoch': 24.78}
- 50%|████▉     | 2874/5800 [8:04:05<5:37:38,  6.92s/it]score1 tensor([[0.5781],
-        [0.4102],
-        [0.6445],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4336, 0.6211, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:42,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.67 | optimizer_step: 4.36
-[2025-01-25 16:50:42,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4636.77 | bwd_inner_microstep: 4631.83 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.82
-[2025-01-25 16:50:42,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.45 | bwd: 4636.79 | bwd_inner: 4631.83 | bwd_allreduce: 4.90 | step: 42.83
- 50%|████▉     | 2875/5800 [8:04:12<5:37:32,  6.92s/it]                                                       {'loss': 0.0215, 'grad_norm': 4.780670642852783, 'learning_rate': 2.1250014374029146e-05, 'epoch': 24.78}
- 50%|████▉     | 2875/5800 [8:04:12<5:37:32,  6.92s/it]score1 tensor([[0.4746],
-        [0.5391],
-        [0.6367],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5742, 0.6484, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:49,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 16:50:49,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.09 | bwd_microstep: 4638.43 | bwd_inner_microstep: 4633.40 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.69
-[2025-01-25 16:50:49,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.05 | bwd: 4638.46 | bwd_inner: 4633.40 | bwd_allreduce: 4.97 | step: 44.70
- 50%|████▉     | 2876/5800 [8:04:19<5:37:24,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 8.411340713500977, 'learning_rate': 2.1238867892535117e-05, 'epoch': 24.79}
- 50%|████▉     | 2876/5800 [8:04:19<5:37:24,  6.92s/it]score1 tensor([[0.5273],
-        [0.4395],
-        [0.5859],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4727, 0.6172, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:50:56,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 16:50:56,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.36 | bwd_microstep: 4642.10 | bwd_inner_microstep: 4636.49 | bwd_allreduce_microstep: 5.49 | step_microstep: 43.37
-[2025-01-25 16:50:56,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4642.13 | bwd_inner: 4636.49 | bwd_allreduce: 5.55 | step: 43.38
- 50%|████▉     | 2877/5800 [8:04:26<5:37:23,  6.93s/it]                                                       {'loss': 0.019, 'grad_norm': 4.379856586456299, 'learning_rate': 2.1227721024740583e-05, 'epoch': 24.8}
- 50%|████▉     | 2877/5800 [8:04:26<5:37:23,  6.93s/it]score1 tensor([[0.4766],
-        [0.6211],
-        [0.3535],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6211, 0.3516, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:03,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 16:51:03,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.24 | bwd_microstep: 4580.26 | bwd_inner_microstep: 4574.61 | bwd_allreduce_microstep: 5.54 | step_microstep: 48.61
-[2025-01-25 16:51:03,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.20 | bwd: 4580.28 | bwd_inner: 4574.61 | bwd_allreduce: 5.59 | step: 48.62
- 50%|████▉     | 2878/5800 [8:04:33<5:36:28,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 1.6557414531707764, 'learning_rate': 2.1216573774121333e-05, 'epoch': 24.81}
- 50%|████▉     | 2878/5800 [8:04:33<5:36:28,  6.91s/it]score1 tensor([[0.5234],
-        [0.5234],
-        [0.4199],
-        [0.3086]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5273, 0.4551, 0.3086], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:09,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.36
-[2025-01-25 16:51:09,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4592.70 | bwd_inner_microstep: 4587.61 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.16
-[2025-01-25 16:51:09,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.90 | bwd: 4592.73 | bwd_inner: 4587.61 | bwd_allreduce: 5.04 | step: 43.17
- 50%|████▉     | 2879/5800 [8:04:39<5:35:55,  6.90s/it]                                                       {'loss': 0.0166, 'grad_norm': 6.110503196716309, 'learning_rate': 2.120542614415327e-05, 'epoch': 24.82}
- 50%|████▉     | 2879/5800 [8:04:39<5:35:55,  6.90s/it]score1 tensor([[0.4375],
-        [0.6133],
-        [0.4727],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.6133, 0.4902, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:16,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:51:16,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.65 | bwd_microstep: 4584.84 | bwd_inner_microstep: 4579.70 | bwd_allreduce_microstep: 4.99 | step_microstep: 44.39
-[2025-01-25 16:51:16,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.60 | bwd: 4584.87 | bwd_inner: 4579.70 | bwd_allreduce: 5.07 | step: 44.40
- 50%|████▉     | 2880/5800 [8:04:46<5:35:29,  6.89s/it]                                                       {'loss': 0.0107, 'grad_norm': 6.003325939178467, 'learning_rate': 2.1194278138312418e-05, 'epoch': 24.83}
- 50%|████▉     | 2880/5800 [8:04:46<5:35:29,  6.89s/it]score1 tensor([[0.6406],
-        [0.4277],
-        [0.5078],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4297, 0.4629, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:23,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 16:51:23,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.47 | bwd_microstep: 4633.20 | bwd_inner_microstep: 4628.08 | bwd_allreduce_microstep: 5.05 | step_microstep: 49.48
-[2025-01-25 16:51:23,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.42 | bwd: 4633.23 | bwd_inner: 4628.08 | bwd_allreduce: 5.08 | step: 49.48
- 50%|████▉     | 2881/5800 [8:04:53<5:35:53,  6.90s/it]                                                       {'loss': 0.0254, 'grad_norm': 0.6367859244346619, 'learning_rate': 2.118312976007492e-05, 'epoch': 24.84}
- 50%|████▉     | 2881/5800 [8:04:53<5:35:53,  6.90s/it]score1 tensor([[0.2178],
-        [0.6094],
-        [0.4805],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.1787, 0.5664, 0.4980, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:30,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 16:51:30,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.73 | bwd_microstep: 4630.51 | bwd_inner_microstep: 4625.37 | bwd_allreduce_microstep: 5.02 | step_microstep: 44.56
-[2025-01-25 16:51:30,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.69 | bwd: 4630.53 | bwd_inner: 4625.37 | bwd_allreduce: 5.08 | step: 44.57
- 50%|████▉     | 2882/5800 [8:05:00<5:35:54,  6.91s/it]                                                       {'loss': 0.0308, 'grad_norm': 3.5211493968963623, 'learning_rate': 2.1171981012917034e-05, 'epoch': 24.84}
- 50%|████▉     | 2882/5800 [8:05:00<5:35:54,  6.91s/it]score1 tensor([[0.6562],
-        [0.7383],
-        [0.5156],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.7031, 0.5039, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:37,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 16:51:37,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.20 | bwd_microstep: 4632.82 | bwd_inner_microstep: 4627.74 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.40
-[2025-01-25 16:51:37,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.16 | bwd: 4632.84 | bwd_inner: 4627.74 | bwd_allreduce: 5.04 | step: 42.42
- 50%|████▉     | 2883/5800 [8:05:07<5:35:57,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 9.65993881225586, 'learning_rate': 2.1160831900315132e-05, 'epoch': 24.85}
- 50%|████▉     | 2883/5800 [8:05:07<5:35:57,  6.91s/it]score1 tensor([[0.6133],
-        [0.5430],
-        [0.6172],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5156, 0.6094, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:44,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.36
-[2025-01-25 16:51:44,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.26 | bwd_microstep: 4638.22 | bwd_inner_microstep: 4633.11 | bwd_allreduce_microstep: 5.00 | step_microstep: 47.73
-[2025-01-25 16:51:44,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.22 | bwd: 4638.24 | bwd_inner: 4633.10 | bwd_allreduce: 5.06 | step: 47.74
- 50%|████▉     | 2884/5800 [8:05:14<5:36:13,  6.92s/it]                                                       {'loss': 0.0342, 'grad_norm': 5.007843971252441, 'learning_rate': 2.1149682425745698e-05, 'epoch': 24.86}
- 50%|████▉     | 2884/5800 [8:05:14<5:36:13,  6.92s/it]score1 tensor([[0.4629],
-        [0.4473],
-        [0.6758],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4531, 0.6875, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:51,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:51:51,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.53 | bwd_microstep: 4634.16 | bwd_inner_microstep: 4628.68 | bwd_allreduce_microstep: 5.40 | step_microstep: 46.87
-[2025-01-25 16:51:51,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.48 | bwd: 4634.18 | bwd_inner: 4628.68 | bwd_allreduce: 5.44 | step: 46.89
- 50%|████▉     | 2885/5800 [8:05:21<5:36:06,  6.92s/it]                                                       {'loss': 0.0122, 'grad_norm': 4.642104625701904, 'learning_rate': 2.113853259268534e-05, 'epoch': 24.87}
- 50%|████▉     | 2885/5800 [8:05:21<5:36:06,  6.92s/it]score1 tensor([[0.5234],
-        [0.4727],
-        [0.4609],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4609, 0.4863, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:51:58,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 16:51:58,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.08 | bwd_microstep: 4628.36 | bwd_inner_microstep: 4623.05 | bwd_allreduce_microstep: 5.19 | step_microstep: 45.39
-[2025-01-25 16:51:58,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.01 | bwd: 4628.39 | bwd_inner: 4623.05 | bwd_allreduce: 5.26 | step: 45.40
- 50%|████▉     | 2886/5800 [8:05:28<5:35:55,  6.92s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.5269936323165894, 'learning_rate': 2.1127382404610764e-05, 'epoch': 24.88}
- 50%|████▉     | 2886/5800 [8:05:28<5:35:55,  6.92s/it]score1 tensor([[0.4941],
-        [0.4922],
-        [0.6094],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5117, 0.6094, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:05,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 16:52:05,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4579.97 | bwd_inner_microstep: 4574.94 | bwd_allreduce_microstep: 4.96 | step_microstep: 41.96
-[2025-01-25 16:52:05,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.16 | bwd: 4580.00 | bwd_inner: 4574.94 | bwd_allreduce: 5.00 | step: 41.96
- 50%|████▉     | 2887/5800 [8:05:35<5:34:56,  6.90s/it]                                                       {'loss': 0.0176, 'grad_norm': 1.9338775873184204, 'learning_rate': 2.111623186499879e-05, 'epoch': 24.89}
- 50%|████▉     | 2887/5800 [8:05:35<5:34:56,  6.90s/it]score1 tensor([[0.6289],
-        [0.4277],
-        [0.4824],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.4199, 0.4551, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:12,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 16:52:12,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.37 | bwd_microstep: 4633.00 | bwd_inner_microstep: 4627.91 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.70
-[2025-01-25 16:52:12,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.33 | bwd: 4633.02 | bwd_inner: 4627.91 | bwd_allreduce: 5.05 | step: 45.70
- 50%|████▉     | 2888/5800 [8:05:42<5:35:05,  6.90s/it]                                                       {'loss': 0.0308, 'grad_norm': 0.6786638498306274, 'learning_rate': 2.1105080977326355e-05, 'epoch': 24.9}
- 50%|████▉     | 2888/5800 [8:05:42<5:35:05,  6.90s/it]score1 tensor([[0.5625],
-        [0.5625],
-        [0.6055],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5703, 0.6367, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:19,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 16:52:19,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4633.80 | bwd_inner_microstep: 4628.88 | bwd_allreduce_microstep: 4.83 | step_microstep: 48.11
-[2025-01-25 16:52:19,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4633.82 | bwd_inner: 4628.89 | bwd_allreduce: 4.87 | step: 48.12
- 50%|████▉     | 2889/5800 [8:05:49<5:35:08,  6.91s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.582265853881836, 'learning_rate': 2.1093929745070493e-05, 'epoch': 24.91}
- 50%|████▉     | 2889/5800 [8:05:49<5:35:08,  6.91s/it]score1 tensor([[0.4551],
-        [0.4707],
-        [0.4805],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4473, 0.5391, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:25,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 16:52:25,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.46 | bwd_microstep: 4577.22 | bwd_inner_microstep: 4572.59 | bwd_allreduce_microstep: 4.56 | step_microstep: 44.39
-[2025-01-25 16:52:25,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.42 | bwd: 4577.24 | bwd_inner: 4572.59 | bwd_allreduce: 4.59 | step: 44.40
- 50%|████▉     | 2890/5800 [8:05:55<5:34:20,  6.89s/it]                                                       {'loss': 0.021, 'grad_norm': 1.958587646484375, 'learning_rate': 2.1082778171708355e-05, 'epoch': 24.91}
- 50%|████▉     | 2890/5800 [8:05:55<5:34:20,  6.89s/it]score1 tensor([[0.4902],
-        [0.5547],
-        [0.5273],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5312, 0.4844, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:32,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:52:32,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.32 | bwd_microstep: 4627.11 | bwd_inner_microstep: 4622.18 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.20
-[2025-01-25 16:52:32,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.28 | bwd: 4627.14 | bwd_inner: 4622.18 | bwd_allreduce: 4.89 | step: 42.20
- 50%|████▉     | 2891/5800 [8:06:02<5:34:26,  6.90s/it]                                                       {'loss': 0.0215, 'grad_norm': 0.5523310303688049, 'learning_rate': 2.1071626260717196e-05, 'epoch': 24.92}
- 50%|████▉     | 2891/5800 [8:06:02<5:34:26,  6.90s/it]score1 tensor([[0.5000],
-        [0.4316],
-        [0.4707],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4023, 0.4238, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0522, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:39,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.37
-[2025-01-25 16:52:39,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.08 | bwd_microstep: 4631.58 | bwd_inner_microstep: 4626.36 | bwd_allreduce_microstep: 5.06 | step_microstep: 44.33
-[2025-01-25 16:52:39,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4631.61 | bwd_inner: 4626.36 | bwd_allreduce: 5.13 | step: 44.34
- 50%|████▉     | 2892/5800 [8:06:09<5:34:32,  6.90s/it]                                                       {'loss': 0.0522, 'grad_norm': 7.783912658691406, 'learning_rate': 2.1060474015574376e-05, 'epoch': 24.93}
- 50%|████▉     | 2892/5800 [8:06:09<5:34:32,  6.90s/it]score1 tensor([[0.5039],
-        [0.4219],
-        [0.4824],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4160, 0.4883, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:46,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 16:52:46,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.68 | bwd_microstep: 4629.66 | bwd_inner_microstep: 4624.89 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.77
-[2025-01-25 16:52:46,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.62 | bwd: 4629.68 | bwd_inner: 4624.89 | bwd_allreduce: 4.73 | step: 42.77
- 50%|████▉     | 2893/5800 [8:06:16<5:34:39,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 4.043951511383057, 'learning_rate': 2.1049321439757354e-05, 'epoch': 24.94}
- 50%|████▉     | 2893/5800 [8:06:16<5:34:39,  6.91s/it]score1 tensor([[0.4219],
-        [0.5000],
-        [0.3848],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5234, 0.3789, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:52:53,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 16:52:53,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.87 | bwd_microstep: 4629.62 | bwd_inner_microstep: 4624.31 | bwd_allreduce_microstep: 5.18 | step_microstep: 44.49
-[2025-01-25 16:52:53,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.83 | bwd: 4629.64 | bwd_inner: 4624.31 | bwd_allreduce: 5.27 | step: 44.50
- 50%|████▉     | 2894/5800 [8:06:23<5:34:43,  6.91s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.3803449273109436, 'learning_rate': 2.10381685367437e-05, 'epoch': 24.95}
- 50%|████▉     | 2894/5800 [8:06:23<5:34:43,  6.91s/it]score1 tensor([[0.4570],
-        [0.4980],
-        [0.4277],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4629, 0.4551, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:53:00,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 16:53:00,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4625.91 | bwd_inner_microstep: 4620.80 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.83
-[2025-01-25 16:53:00,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.07 | bwd: 4625.93 | bwd_inner: 4620.80 | bwd_allreduce: 5.04 | step: 44.84
- 50%|████▉     | 2895/5800 [8:06:30<5:34:31,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 4.016303539276123, 'learning_rate': 2.102701531001109e-05, 'epoch': 24.96}
- 50%|████▉     | 2895/5800 [8:06:30<5:34:31,  6.91s/it]score1 tensor([[0.3613],
-        [0.3984],
-        [0.4805],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4160, 0.5117, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:53:07,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 16:53:07,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.03 | bwd_microstep: 4625.28 | bwd_inner_microstep: 4620.25 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.33
-[2025-01-25 16:53:07,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4625.30 | bwd_inner: 4620.25 | bwd_allreduce: 4.96 | step: 44.34
- 50%|████▉     | 2896/5800 [8:06:37<5:34:23,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 7.663023948669434, 'learning_rate': 2.1015861763037274e-05, 'epoch': 24.97}
- 50%|████▉     | 2896/5800 [8:06:37<5:34:23,  6.91s/it]score1 tensor([[0.5117],
-        [0.5625],
-        [0.5469],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5586, 0.5508, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:53:14,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.36
-[2025-01-25 16:53:14,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.54 | bwd_microstep: 4633.17 | bwd_inner_microstep: 4628.02 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.09
-[2025-01-25 16:53:14,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.48 | bwd: 4633.20 | bwd_inner: 4628.02 | bwd_allreduce: 5.10 | step: 45.10
- 50%|████▉     | 2897/5800 [8:06:44<5:34:27,  6.91s/it]                                                       {'loss': 0.0068, 'grad_norm': 4.220796585083008, 'learning_rate': 2.1004707899300147e-05, 'epoch': 24.97}
- 50%|████▉     | 2897/5800 [8:06:44<5:34:27,  6.91s/it]score1 tensor([[0.5156],
-        [0.5430],
-        [0.5508],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5703, 0.5508, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:53:21,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 16:53:21,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.96 | bwd_microstep: 4545.38 | bwd_inner_microstep: 4538.51 | bwd_allreduce_microstep: 6.77 | step_microstep: 42.75
-[2025-01-25 16:53:21,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4545.40 | bwd_inner: 4538.51 | bwd_allreduce: 6.82 | step: 42.76
- 50%|████▉     | 2898/5800 [8:06:51<5:33:03,  6.89s/it]                                                       {'loss': 0.0068, 'grad_norm': 2.1691293716430664, 'learning_rate': 2.099355372227766e-05, 'epoch': 24.98}
- 50%|████▉     | 2898/5800 [8:06:51<5:33:03,  6.89s/it]score1 tensor([[0.6289],
-        [0.6328],
-        [0.5430],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6133, 0.5391, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:53:28,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 16:53:28,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.34 | bwd_microstep: 4625.72 | bwd_inner_microstep: 4620.39 | bwd_allreduce_microstep: 5.24 | step_microstep: 50.13
-[2025-01-25 16:53:28,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.25 | bwd: 4625.75 | bwd_inner: 4620.39 | bwd_allreduce: 5.29 | step: 50.14
- 50%|████▉     | 2899/5800 [8:06:58<5:33:24,  6.90s/it]                                                       {'loss': 0.0166, 'grad_norm': 4.732377052307129, 'learning_rate': 2.0982399235447873e-05, 'epoch': 24.99}
- 50%|████▉     | 2899/5800 [8:06:58<5:33:24,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:53:32,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 16:53:32,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 570.68 | bwd_microstep: 1219.74 | bwd_inner_microstep: 1214.67 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.58
-[2025-01-25 16:53:32,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 570.64 | bwd: 1219.76 | bwd_inner: 1214.67 | bwd_allreduce: 5.01 | step: 42.59
- 50%|█████     | 2900/5800 [8:07:02<4:59:48,  6.20s/it]                                                       {'loss': 0.0332, 'grad_norm': 8.059818267822266, 'learning_rate': 2.097124444228897e-05, 'epoch': 25.0}
- 50%|█████     | 2900/5800 [8:07:02<4:59:48,  6.20s/it][2025-01-25 16:53:37,274] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:53:47,751] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:53:58,533] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 16:54:09,282] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5078],
-        [0.3809],
-        [0.6055],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3457, 0.5781, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:54:33,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 16:54:33,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.04 | bwd_microstep: 4591.24 | bwd_inner_microstep: 4586.03 | bwd_allreduce_microstep: 5.11 | step_microstep: 44.73
-[2025-01-25 16:54:33,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.00 | bwd: 4591.26 | bwd_inner: 4586.03 | bwd_allreduce: 5.17 | step: 44.73
- 50%|█████     | 2901/5800 [8:08:03<18:09:53, 22.56s/it]                                                        {'loss': 0.0234, 'grad_norm': 0.47380560636520386, 'learning_rate': 2.096008934627919e-05, 'epoch': 25.01}
- 50%|█████     | 2901/5800 [8:08:03<18:09:53, 22.56s/it]score1 tensor([[0.6250],
-        [0.4883],
-        [0.5312],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5039, 0.5195, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:54:40,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.37
-[2025-01-25 16:54:40,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.34 | bwd_microstep: 4591.41 | bwd_inner_microstep: 4581.78 | bwd_allreduce_microstep: 9.51 | step_microstep: 44.83
-[2025-01-25 16:54:40,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.30 | bwd: 4591.44 | bwd_inner: 4581.78 | bwd_allreduce: 9.57 | step: 44.84
- 50%|█████     | 2902/5800 [8:08:10<14:22:02, 17.85s/it]                                                        {'loss': 0.0142, 'grad_norm': 0.475232869386673, 'learning_rate': 2.0948933950896895e-05, 'epoch': 25.02}
- 50%|█████     | 2902/5800 [8:08:10<14:22:02, 17.85s/it]score1 tensor([[0.6055],
-        [0.4707],
-        [0.3672],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4961, 0.3105, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:54:47,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.37
-[2025-01-25 16:54:47,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.06 | bwd_microstep: 4584.76 | bwd_inner_microstep: 4579.78 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.29
-[2025-01-25 16:54:47,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.03 | bwd: 4584.79 | bwd_inner: 4579.78 | bwd_allreduce: 4.92 | step: 44.29
- 50%|█████     | 2903/5800 [8:08:17<11:42:17, 14.55s/it]                                                        {'loss': 0.0273, 'grad_norm': 0.5585145950317383, 'learning_rate': 2.093777825962053e-05, 'epoch': 25.03}
- 50%|█████     | 2903/5800 [8:08:17<11:42:17, 14.55s/it]score1 tensor([[0.6484],
-        [0.5820],
-        [0.5586],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5742, 0.5430, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:54:53,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:54:53,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.41 | bwd_microstep: 4542.05 | bwd_inner_microstep: 4536.81 | bwd_allreduce_microstep: 5.16 | step_microstep: 48.50
-[2025-01-25 16:54:53,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.37 | bwd: 4542.07 | bwd_inner: 4536.81 | bwd_allreduce: 5.20 | step: 48.50
- 50%|█████     | 2904/5800 [8:08:23<9:49:52, 12.22s/it]                                                        {'loss': 0.0161, 'grad_norm': 6.313168525695801, 'learning_rate': 2.092662227592863e-05, 'epoch': 25.03}
- 50%|█████     | 2904/5800 [8:08:23<9:49:52, 12.22s/it]score1 tensor([[0.6328],
-        [0.4043],
-        [0.6211],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.3984, 0.6367, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:00,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 16:55:00,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.91 | bwd_microstep: 4590.95 | bwd_inner_microstep: 4585.92 | bwd_allreduce_microstep: 4.93 | step_microstep: 44.01
-[2025-01-25 16:55:00,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.87 | bwd: 4590.97 | bwd_inner: 4585.92 | bwd_allreduce: 4.99 | step: 44.02
- 50%|█████     | 2905/5800 [8:08:30<8:32:07, 10.61s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.79669713973999, 'learning_rate': 2.091546600329983e-05, 'epoch': 25.04}
- 50%|█████     | 2905/5800 [8:08:30<8:32:07, 10.61s/it]score1 tensor([[0.6602],
-        [0.5430],
-        [0.5508],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5000, 0.5430, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:07,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:55:07,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.24 | bwd_microstep: 4606.62 | bwd_inner_microstep: 4601.14 | bwd_allreduce_microstep: 5.39 | step_microstep: 44.35
-[2025-01-25 16:55:07,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.19 | bwd: 4606.65 | bwd_inner: 4601.14 | bwd_allreduce: 5.44 | step: 44.40
- 50%|█████     | 2906/5800 [8:08:37<7:37:49,  9.49s/it]                                                       {'loss': 0.0244, 'grad_norm': 4.625203609466553, 'learning_rate': 2.090430944521284e-05, 'epoch': 25.05}
- 50%|█████     | 2906/5800 [8:08:37<7:37:49,  9.49s/it]score1 tensor([[0.4668],
-        [0.5625],
-        [0.3789],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5508, 0.3945, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:14,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 16:55:14,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.00 | bwd_microstep: 4608.89 | bwd_inner_microstep: 4603.86 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.72
-[2025-01-25 16:55:14,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.97 | bwd: 4608.92 | bwd_inner: 4603.86 | bwd_allreduce: 4.98 | step: 42.73
- 50%|█████     | 2907/5800 [8:08:44<6:59:49,  8.71s/it]                                                       {'loss': 0.0195, 'grad_norm': 0.5053467154502869, 'learning_rate': 2.0893152605146477e-05, 'epoch': 25.06}
- 50%|█████     | 2907/5800 [8:08:44<6:59:49,  8.71s/it]score1 tensor([[0.5820],
-        [0.4688],
-        [0.4648],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4219, 0.4492, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:21,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 16:55:21,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.56 | bwd_microstep: 4615.19 | bwd_inner_microstep: 4608.95 | bwd_allreduce_microstep: 6.14 | step_microstep: 47.15
-[2025-01-25 16:55:21,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.52 | bwd: 4615.21 | bwd_inner: 4608.95 | bwd_allreduce: 6.19 | step: 47.18
- 50%|█████     | 2908/5800 [8:08:51<6:33:23,  8.16s/it]                                                       {'loss': 0.0259, 'grad_norm': 4.190314769744873, 'learning_rate': 2.0881995486579632e-05, 'epoch': 25.07}
- 50%|█████     | 2908/5800 [8:08:51<6:33:23,  8.16s/it]score1 tensor([[0.5391],
-        [0.4609],
-        [0.3359],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4336, 0.3418, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:28,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:55:28,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.63 | bwd_microstep: 4596.56 | bwd_inner_microstep: 4591.70 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.76
-[2025-01-25 16:55:28,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.59 | bwd: 4596.58 | bwd_inner: 4591.70 | bwd_allreduce: 4.81 | step: 42.76
- 50%|█████     | 2909/5800 [8:08:58<6:14:34,  7.77s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.5011335611343384, 'learning_rate': 2.087083809299129e-05, 'epoch': 25.08}
- 50%|█████     | 2909/5800 [8:08:58<6:14:34,  7.77s/it]score1 tensor([[0.6367],
-        [0.6016],
-        [0.5625],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.6094, 0.5820, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:35,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 16:55:35,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.05 | bwd_microstep: 4595.74 | bwd_inner_microstep: 4590.35 | bwd_allreduce_microstep: 5.30 | step_microstep: 44.37
-[2025-01-25 16:55:35,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.01 | bwd: 4595.76 | bwd_inner: 4590.35 | bwd_allreduce: 5.34 | step: 44.38
- 50%|█████     | 2910/5800 [8:09:05<6:01:21,  7.50s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.4707895815372467, 'learning_rate': 2.0859680427860524e-05, 'epoch': 25.09}
- 50%|█████     | 2910/5800 [8:09:05<6:01:21,  7.50s/it]score1 tensor([[0.4668],
-        [0.4629],
-        [0.5234],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4727, 0.5078, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:41,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 16:55:41,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.11 | bwd_microstep: 4605.45 | bwd_inner_microstep: 4600.49 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.12
-[2025-01-25 16:55:41,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.07 | bwd: 4605.48 | bwd_inner: 4600.49 | bwd_allreduce: 4.91 | step: 43.13
- 50%|█████     | 2911/5800 [8:09:11<5:52:21,  7.32s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.056578636169434, 'learning_rate': 2.084852249466648e-05, 'epoch': 25.09}
- 50%|█████     | 2911/5800 [8:09:11<5:52:21,  7.32s/it]score1 tensor([[0.6016],
-        [0.4863],
-        [0.5156],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4688, 0.4922, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:48,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.19 | optimizer_step: 4.36
-[2025-01-25 16:55:48,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.59 | bwd_microstep: 4598.86 | bwd_inner_microstep: 4593.71 | bwd_allreduce_microstep: 5.04 | step_microstep: 42.64
-[2025-01-25 16:55:48,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.56 | bwd: 4598.89 | bwd_inner: 4593.71 | bwd_allreduce: 5.10 | step: 42.64
- 50%|█████     | 2912/5800 [8:09:18<5:45:42,  7.18s/it]                                                       {'loss': 0.0181, 'grad_norm': 0.45124512910842896, 'learning_rate': 2.0837364296888402e-05, 'epoch': 25.1}
- 50%|█████     | 2912/5800 [8:09:18<5:45:42,  7.18s/it]score1 tensor([[0.5117],
-        [0.5117],
-        [0.5781],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4863, 0.6016, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:55:55,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 16:55:55,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.87 | bwd_microstep: 4601.18 | bwd_inner_microstep: 4595.83 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.83
-[2025-01-25 16:55:55,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.83 | bwd: 4601.20 | bwd_inner: 4595.83 | bwd_allreduce: 5.29 | step: 43.84
- 50%|█████     | 2913/5800 [8:09:25<5:41:20,  7.09s/it]                                                       {'loss': 0.0166, 'grad_norm': 0.4600519835948944, 'learning_rate': 2.0826205838005603e-05, 'epoch': 25.11}
- 50%|█████     | 2913/5800 [8:09:25<5:41:20,  7.09s/it]score1 tensor([[0.6016],
-        [0.6523],
-        [0.4902],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6875, 0.4941, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:02,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 16:56:02,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.06 | bwd_microstep: 4610.52 | bwd_inner_microstep: 4605.20 | bwd_allreduce_microstep: 5.22 | step_microstep: 45.94
-[2025-01-25 16:56:02,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.01 | bwd: 4610.54 | bwd_inner: 4605.20 | bwd_allreduce: 5.27 | step: 45.94
- 50%|█████     | 2914/5800 [8:09:32<5:38:20,  7.03s/it]                                                       {'loss': 0.0264, 'grad_norm': 4.511037826538086, 'learning_rate': 2.081504712149749e-05, 'epoch': 25.12}
- 50%|█████     | 2914/5800 [8:09:32<5:38:20,  7.03s/it]score1 tensor([[0.5391],
-        [0.6055],
-        [0.3379],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5898, 0.3438, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:09,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 16:56:09,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.90 | bwd_microstep: 4600.32 | bwd_inner_microstep: 4593.55 | bwd_allreduce_microstep: 6.67 | step_microstep: 51.70
-[2025-01-25 16:56:09,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.83 | bwd: 4600.35 | bwd_inner: 4593.55 | bwd_allreduce: 6.72 | step: 51.72
- 50%|█████     | 2915/5800 [8:09:39<5:36:07,  6.99s/it]                                                       {'loss': 0.0093, 'grad_norm': 5.045271873474121, 'learning_rate': 2.080388815084354e-05, 'epoch': 25.13}
- 50%|█████     | 2915/5800 [8:09:39<5:36:07,  6.99s/it]score1 tensor([[0.4648],
-        [0.4883],
-        [0.5195],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4941, 0.5312, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:16,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 16:56:16,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.05 | bwd_microstep: 4606.77 | bwd_inner_microstep: 4601.53 | bwd_allreduce_microstep: 5.14 | step_microstep: 44.43
-[2025-01-25 16:56:16,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.00 | bwd: 4606.79 | bwd_inner: 4601.53 | bwd_allreduce: 5.19 | step: 44.44
- 50%|█████     | 2916/5800 [8:09:46<5:34:30,  6.96s/it]                                                       {'loss': 0.0122, 'grad_norm': 8.136317253112793, 'learning_rate': 2.0792728929523326e-05, 'epoch': 25.14}
- 50%|█████     | 2916/5800 [8:09:46<5:34:30,  6.96s/it]score1 tensor([[0.6055],
-        [0.4512],
-        [0.5742],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5039, 0.6055, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:23,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 16:56:23,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.31 | bwd_microstep: 4612.88 | bwd_inner_microstep: 4607.54 | bwd_allreduce_microstep: 5.22 | step_microstep: 45.79
-[2025-01-25 16:56:23,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.28 | bwd: 4612.90 | bwd_inner: 4607.54 | bwd_allreduce: 5.28 | step: 45.80
- 50%|█████     | 2917/5800 [8:09:53<5:33:16,  6.94s/it]                                                       {'loss': 0.0352, 'grad_norm': 8.23768138885498, 'learning_rate': 2.078156946101647e-05, 'epoch': 25.15}
- 50%|█████     | 2917/5800 [8:09:53<5:33:16,  6.94s/it]score1 tensor([[0.5938],
-        [0.4844],
-        [0.6289],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5156, 0.6484, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:30,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 16:56:30,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.32 | bwd_microstep: 4608.76 | bwd_inner_microstep: 4603.17 | bwd_allreduce_microstep: 5.49 | step_microstep: 44.59
-[2025-01-25 16:56:30,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.27 | bwd: 4608.79 | bwd_inner: 4603.17 | bwd_allreduce: 5.54 | step: 44.60
- 50%|█████     | 2918/5800 [8:10:00<5:32:25,  6.92s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.205337047576904, 'learning_rate': 2.077040974880269e-05, 'epoch': 25.16}
- 50%|█████     | 2918/5800 [8:10:00<5:32:25,  6.92s/it]score1 tensor([[0.5547],
-        [0.6602],
-        [0.5625],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.6875, 0.5938, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:37,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.54 | optimizer_step: 4.37
-[2025-01-25 16:56:37,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.46 | bwd_microstep: 4565.14 | bwd_inner_microstep: 4560.10 | bwd_allreduce_microstep: 4.96 | step_microstep: 48.37
-[2025-01-25 16:56:37,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.43 | bwd: 4565.16 | bwd_inner: 4560.10 | bwd_allreduce: 5.00 | step: 48.38
- 50%|█████     | 2919/5800 [8:10:06<5:31:13,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 6.802084922790527, 'learning_rate': 2.0759249796361786e-05, 'epoch': 25.16}
- 50%|█████     | 2919/5800 [8:10:07<5:31:13,  6.90s/it]score1 tensor([[0.5820],
-        [0.5547],
-        [0.3828],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5391, 0.3887, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:43,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 16:56:43,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.57 | bwd_microstep: 4612.58 | bwd_inner_microstep: 4606.90 | bwd_allreduce_microstep: 5.59 | step_microstep: 43.36
-[2025-01-25 16:56:43,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.53 | bwd: 4612.60 | bwd_inner: 4606.90 | bwd_allreduce: 5.64 | step: 43.37
- 50%|█████     | 2920/5800 [8:10:13<5:31:02,  6.90s/it]                                                       {'loss': 0.022, 'grad_norm': 0.6612594127655029, 'learning_rate': 2.0748089607173622e-05, 'epoch': 25.17}
- 50%|█████     | 2920/5800 [8:10:13<5:31:02,  6.90s/it]score1 tensor([[0.3633],
-        [0.4980],
-        [0.4258],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5000, 0.4336, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:50,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 16:56:50,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.70 | bwd_microstep: 4615.50 | bwd_inner_microstep: 4609.44 | bwd_allreduce_microstep: 5.97 | step_microstep: 48.14
-[2025-01-25 16:56:50,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.66 | bwd: 4615.54 | bwd_inner: 4609.44 | bwd_allreduce: 6.01 | step: 48.13
- 50%|█████     | 2921/5800 [8:10:20<5:30:56,  6.90s/it]                                                       {'loss': 0.0117, 'grad_norm': 3.4751391410827637, 'learning_rate': 2.0736929184718137e-05, 'epoch': 25.18}
- 50%|█████     | 2921/5800 [8:10:20<5:30:56,  6.90s/it]score1 tensor([[0.3828],
-        [0.5078],
-        [0.5781],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.5352, 0.5625, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:56:57,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.09 | optimizer_step: 4.36
-[2025-01-25 16:56:57,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.46 | bwd_microstep: 4605.80 | bwd_inner_microstep: 4600.59 | bwd_allreduce_microstep: 5.14 | step_microstep: 54.15
-[2025-01-25 16:56:57,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.43 | bwd: 4605.83 | bwd_inner: 4600.59 | bwd_allreduce: 5.17 | step: 54.16
- 50%|█████     | 2922/5800 [8:10:27<5:30:45,  6.90s/it]                                                       {'loss': 0.0205, 'grad_norm': 3.8373403549194336, 'learning_rate': 2.0725768532475353e-05, 'epoch': 25.19}
- 50%|█████     | 2922/5800 [8:10:27<5:30:45,  6.90s/it]score1 tensor([[0.6172],
-        [0.3770],
-        [0.6445],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.3613, 0.6445, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:04,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 16:57:04,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.42 | bwd_microstep: 4526.53 | bwd_inner_microstep: 4521.41 | bwd_allreduce_microstep: 5.02 | step_microstep: 43.30
-[2025-01-25 16:57:04,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.39 | bwd: 4526.56 | bwd_inner: 4521.41 | bwd_allreduce: 5.08 | step: 43.30
- 50%|█████     | 2923/5800 [8:10:34<5:29:17,  6.87s/it]                                                       {'loss': 0.0176, 'grad_norm': 3.981837749481201, 'learning_rate': 2.0714607653925335e-05, 'epoch': 25.2}
- 50%|█████     | 2923/5800 [8:10:34<5:29:17,  6.87s/it]score1 tensor([[0.4727],
-        [0.5508],
-        [0.4805],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5312, 0.4629, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:11,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 16:57:11,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.00 | bwd_microstep: 4616.11 | bwd_inner_microstep: 4611.29 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.47
-[2025-01-25 16:57:11,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.97 | bwd: 4616.13 | bwd_inner: 4611.29 | bwd_allreduce: 4.78 | step: 42.48
- 50%|█████     | 2924/5800 [8:10:41<5:29:31,  6.87s/it]                                                       {'loss': 0.0215, 'grad_norm': 8.38704776763916, 'learning_rate': 2.0703446552548257e-05, 'epoch': 25.21}
- 50%|█████     | 2924/5800 [8:10:41<5:29:31,  6.87s/it]score1 tensor([[0.4102],
-        [0.5234],
-        [0.4570],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4922, 0.4551, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:18,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 16:57:18,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.04 | bwd_microstep: 4613.59 | bwd_inner_microstep: 4608.63 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.88
-[2025-01-25 16:57:18,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.00 | bwd: 4613.61 | bwd_inner: 4608.63 | bwd_allreduce: 4.91 | step: 43.89
- 50%|█████     | 2925/5800 [8:10:48<5:29:41,  6.88s/it]                                                       {'loss': 0.0215, 'grad_norm': 8.363536834716797, 'learning_rate': 2.069228523182434e-05, 'epoch': 25.22}
- 50%|█████     | 2925/5800 [8:10:48<5:29:41,  6.88s/it]score1 tensor([[0.4922],
-        [0.5508],
-        [0.3750],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5117, 0.3730, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:25,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 16:57:25,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.72 | bwd_microstep: 4564.90 | bwd_inner_microstep: 4559.58 | bwd_allreduce_microstep: 5.21 | step_microstep: 45.99
-[2025-01-25 16:57:25,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.69 | bwd: 4564.93 | bwd_inner: 4559.58 | bwd_allreduce: 5.27 | step: 46.00
- 50%|█████     | 2926/5800 [8:10:55<5:29:01,  6.87s/it]                                                       {'loss': 0.0132, 'grad_norm': 5.978193283081055, 'learning_rate': 2.068112369523387e-05, 'epoch': 25.22}
- 50%|█████     | 2926/5800 [8:10:55<5:29:01,  6.87s/it]score1 tensor([[0.4199],
-        [0.5938],
-        [0.4902],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5664, 0.4707, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:32,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 16:57:32,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.37 | bwd_microstep: 4613.20 | bwd_inner_microstep: 4608.02 | bwd_allreduce_microstep: 5.08 | step_microstep: 45.39
-[2025-01-25 16:57:32,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.32 | bwd: 4613.22 | bwd_inner: 4608.02 | bwd_allreduce: 5.13 | step: 45.39
- 50%|█████     | 2927/5800 [8:11:02<5:29:17,  6.88s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.115311622619629, 'learning_rate': 2.066996194625721e-05, 'epoch': 25.23}
- 50%|█████     | 2927/5800 [8:11:02<5:29:17,  6.88s/it]score1 tensor([[0.6328],
-        [0.4805],
-        [0.5117],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4492, 0.5039, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:38,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 16:57:38,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.79 | bwd_microstep: 4616.42 | bwd_inner_microstep: 4611.56 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.66
-[2025-01-25 16:57:38,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.74 | bwd: 4616.44 | bwd_inner: 4611.56 | bwd_allreduce: 4.80 | step: 43.67
- 50%|█████     | 2928/5800 [8:11:08<5:29:26,  6.88s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.461844444274902, 'learning_rate': 2.0658799988374798e-05, 'epoch': 25.24}
- 50%|█████     | 2928/5800 [8:11:08<5:29:26,  6.88s/it]score1 tensor([[0.5039],
-        [0.3555],
-        [0.4023],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3223, 0.2812, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0532, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:45,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.14 | optimizer_step: 4.36
-[2025-01-25 16:57:45,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.71 | bwd_microstep: 4618.61 | bwd_inner_microstep: 4613.54 | bwd_allreduce_microstep: 4.98 | step_microstep: 47.49
-[2025-01-25 16:57:45,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.67 | bwd: 4618.63 | bwd_inner: 4613.54 | bwd_allreduce: 5.02 | step: 47.48
- 50%|█████     | 2929/5800 [8:11:15<5:29:35,  6.89s/it]                                                       {'loss': 0.0532, 'grad_norm': 7.709992408752441, 'learning_rate': 2.0647637825067123e-05, 'epoch': 25.25}
- 50%|█████     | 2929/5800 [8:11:15<5:29:35,  6.89s/it]score1 tensor([[0.5117],
-        [0.4551],
-        [0.4922],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4609, 0.4961, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:52,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 16:57:52,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.19 | bwd_microstep: 4621.49 | bwd_inner_microstep: 4616.62 | bwd_allreduce_microstep: 4.78 | step_microstep: 42.78
-[2025-01-25 16:57:52,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.16 | bwd: 4621.51 | bwd_inner: 4616.62 | bwd_allreduce: 4.83 | step: 42.79
- 51%|█████     | 2930/5800 [8:11:22<5:29:41,  6.89s/it]                                                       {'loss': 0.0103, 'grad_norm': 3.8306379318237305, 'learning_rate': 2.0636475459814725e-05, 'epoch': 25.26}
- 51%|█████     | 2930/5800 [8:11:22<5:29:41,  6.89s/it]score1 tensor([[0.5938],
-        [0.4941],
-        [0.6250],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5117, 0.6641, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:57:59,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 16:57:59,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4610.15 | bwd_inner_microstep: 4605.07 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.87
-[2025-01-25 16:57:59,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.56 | bwd: 4610.18 | bwd_inner: 4605.07 | bwd_allreduce: 5.03 | step: 43.88
- 51%|█████     | 2931/5800 [8:11:29<5:29:36,  6.89s/it]                                                       {'loss': 0.0254, 'grad_norm': 8.62996768951416, 'learning_rate': 2.0625312896098242e-05, 'epoch': 25.27}
- 51%|█████     | 2931/5800 [8:11:29<5:29:36,  6.89s/it]score1 tensor([[0.5430],
-        [0.6367],
-        [0.5625],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6406, 0.5781, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:06,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 16:58:06,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.35 | bwd_microstep: 4618.22 | bwd_inner_microstep: 4612.94 | bwd_allreduce_microstep: 5.18 | step_microstep: 45.61
-[2025-01-25 16:58:06,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.29 | bwd: 4618.25 | bwd_inner: 4612.94 | bwd_allreduce: 5.24 | step: 45.62
- 51%|█████     | 2932/5800 [8:11:36<5:29:41,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 9.086446762084961, 'learning_rate': 2.0614150137398346e-05, 'epoch': 25.28}
- 51%|█████     | 2932/5800 [8:11:36<5:29:41,  6.90s/it]score1 tensor([[0.5352],
-        [0.3574],
-        [0.3809],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.3652, 0.3652, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:13,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 16:58:13,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.78 | bwd_microstep: 4612.32 | bwd_inner_microstep: 4607.72 | bwd_allreduce_microstep: 4.47 | step_microstep: 45.59
-[2025-01-25 16:58:13,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.73 | bwd: 4612.35 | bwd_inner: 4607.72 | bwd_allreduce: 4.54 | step: 45.60
- 51%|█████     | 2933/5800 [8:11:43<5:29:25,  6.89s/it]                                                       {'loss': 0.0107, 'grad_norm': 0.4128740727901459, 'learning_rate': 2.0602987187195782e-05, 'epoch': 25.28}
- 51%|█████     | 2933/5800 [8:11:43<5:29:25,  6.89s/it]score1 tensor([[0.4512],
-        [0.4355],
-        [0.3965],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4668, 0.3789, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:20,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 16:58:20,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.89 | bwd_microstep: 4612.72 | bwd_inner_microstep: 4607.59 | bwd_allreduce_microstep: 5.03 | step_microstep: 47.32
-[2025-01-25 16:58:20,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.84 | bwd: 4612.74 | bwd_inner: 4607.59 | bwd_allreduce: 5.08 | step: 47.33
- 51%|█████     | 2934/5800 [8:11:50<5:29:21,  6.89s/it]                                                       {'loss': 0.0181, 'grad_norm': 0.3601551651954651, 'learning_rate': 2.059182404897135e-05, 'epoch': 25.29}
- 51%|█████     | 2934/5800 [8:11:50<5:29:21,  6.89s/it]score1 tensor([[0.5273],
-        [0.4570],
-        [0.5664],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4492, 0.5664, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:27,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 16:58:27,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.31 | bwd_microstep: 4575.87 | bwd_inner_microstep: 4571.04 | bwd_allreduce_microstep: 4.72 | step_microstep: 47.34
-[2025-01-25 16:58:27,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.27 | bwd: 4575.90 | bwd_inner: 4571.04 | bwd_allreduce: 4.77 | step: 47.35
- 51%|█████     | 2935/5800 [8:11:57<5:28:40,  6.88s/it]                                                       {'loss': 0.0083, 'grad_norm': 1.9358121156692505, 'learning_rate': 2.058066072620591e-05, 'epoch': 25.3}
- 51%|█████     | 2935/5800 [8:11:57<5:28:40,  6.88s/it]score1 tensor([[0.6289],
-        [0.4961],
-        [0.3613],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5273, 0.3711, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:34,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 16:58:34,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.22 | bwd_microstep: 4620.85 | bwd_inner_microstep: 4615.69 | bwd_allreduce_microstep: 5.08 | step_microstep: 42.91
-[2025-01-25 16:58:34,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.19 | bwd: 4620.87 | bwd_inner: 4615.69 | bwd_allreduce: 5.12 | step: 42.92
- 51%|█████     | 2936/5800 [8:12:04<5:28:44,  6.89s/it]                                                       {'loss': 0.0259, 'grad_norm': 8.156170845031738, 'learning_rate': 2.0569497222380384e-05, 'epoch': 25.31}
- 51%|█████     | 2936/5800 [8:12:04<5:28:44,  6.89s/it]score1 tensor([[0.3691],
-        [0.4512],
-        [0.5000],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.4727, 0.5117, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:40,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 16:58:40,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.34 | bwd_microstep: 4612.20 | bwd_inner_microstep: 4607.20 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.44
-[2025-01-25 16:58:40,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.31 | bwd: 4612.23 | bwd_inner: 4607.20 | bwd_allreduce: 4.96 | step: 42.46
- 51%|█████     | 2937/5800 [8:12:10<5:28:40,  6.89s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.5096209645271301, 'learning_rate': 2.0558333540975735e-05, 'epoch': 25.32}
- 51%|█████     | 2937/5800 [8:12:10<5:28:40,  6.89s/it]score1 tensor([[0.4746],
-        [0.4727],
-        [0.5977],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4980, 0.6055, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:47,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:58:47,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.18 | bwd_microstep: 4615.96 | bwd_inner_microstep: 4610.85 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.58
-[2025-01-25 16:58:47,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.13 | bwd: 4615.98 | bwd_inner: 4610.85 | bwd_allreduce: 5.06 | step: 42.58
- 51%|█████     | 2938/5800 [8:12:17<5:28:38,  6.89s/it]                                                       {'loss': 0.0215, 'grad_norm': 8.60069465637207, 'learning_rate': 2.0547169685473004e-05, 'epoch': 25.33}
- 51%|█████     | 2938/5800 [8:12:17<5:28:38,  6.89s/it]score1 tensor([[0.4922],
-        [0.4746],
-        [0.4277],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5000, 0.4277, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:58:54,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 16:58:54,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.00 | bwd_microstep: 4567.10 | bwd_inner_microstep: 4562.46 | bwd_allreduce_microstep: 4.55 | step_microstep: 41.98
-[2025-01-25 16:58:54,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.95 | bwd: 4567.12 | bwd_inner: 4562.45 | bwd_allreduce: 4.60 | step: 41.99
- 51%|█████     | 2939/5800 [8:12:24<5:27:53,  6.88s/it]                                                       {'loss': 0.022, 'grad_norm': 6.3611273765563965, 'learning_rate': 2.053600565935327e-05, 'epoch': 25.34}
- 51%|█████     | 2939/5800 [8:12:24<5:27:53,  6.88s/it]score1 tensor([[0.4395],
-        [0.3828],
-        [0.4609],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.3477, 0.4785, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:01,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 16:59:01,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.23 | bwd_microstep: 4617.38 | bwd_inner_microstep: 4612.38 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.66
-[2025-01-25 16:59:01,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.19 | bwd: 4617.40 | bwd_inner: 4612.38 | bwd_allreduce: 4.95 | step: 43.67
- 51%|█████     | 2940/5800 [8:12:31<5:28:04,  6.88s/it]                                                       {'loss': 0.0225, 'grad_norm': 0.762593150138855, 'learning_rate': 2.052484146609767e-05, 'epoch': 25.34}
- 51%|█████     | 2940/5800 [8:12:31<5:28:04,  6.88s/it]score1 tensor([[0.4785],
-        [0.3926],
-        [0.5273],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4121, 0.5508, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:08,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 16:59:08,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.50 | bwd_microstep: 4609.75 | bwd_inner_microstep: 4604.99 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.34
-[2025-01-25 16:59:08,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.44 | bwd: 4609.78 | bwd_inner: 4604.99 | bwd_allreduce: 4.72 | step: 43.35
- 51%|█████     | 2941/5800 [8:12:38<5:27:59,  6.88s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.5023521780967712, 'learning_rate': 2.05136771091874e-05, 'epoch': 25.35}
- 51%|█████     | 2941/5800 [8:12:38<5:27:59,  6.88s/it]score1 tensor([[0.4453],
-        [0.5391],
-        [0.4434],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5547, 0.4180, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:15,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 16:59:15,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.29 | bwd_microstep: 4613.60 | bwd_inner_microstep: 4608.97 | bwd_allreduce_microstep: 4.51 | step_microstep: 42.78
-[2025-01-25 16:59:15,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.26 | bwd: 4613.62 | bwd_inner: 4608.98 | bwd_allreduce: 4.55 | step: 42.78
- 51%|█████     | 2942/5800 [8:12:45<5:27:56,  6.88s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.5277834534645081, 'learning_rate': 2.0502512592103693e-05, 'epoch': 25.36}
- 51%|█████     | 2942/5800 [8:12:45<5:27:56,  6.88s/it]score1 tensor([[0.4336],
-        [0.4492],
-        [0.4980],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4512, 0.4648, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:22,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 16:59:22,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.77 | bwd_microstep: 4616.03 | bwd_inner_microstep: 4611.12 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.61
-[2025-01-25 16:59:22,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.74 | bwd: 4616.05 | bwd_inner: 4611.12 | bwd_allreduce: 4.87 | step: 42.62
- 51%|█████     | 2943/5800 [8:12:52<5:27:52,  6.89s/it]                                                       {'loss': 0.021, 'grad_norm': 0.6019539833068848, 'learning_rate': 2.0491347918327832e-05, 'epoch': 25.37}
- 51%|█████     | 2943/5800 [8:12:52<5:27:52,  6.89s/it]score1 tensor([[0.4922],
-        [0.5312],
-        [0.5742],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4844, 0.5352, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:29,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 16:59:29,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.85 | bwd_microstep: 4610.68 | bwd_inner_microstep: 4605.87 | bwd_allreduce_microstep: 4.73 | step_microstep: 43.14
-[2025-01-25 16:59:29,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.82 | bwd: 4610.70 | bwd_inner: 4605.87 | bwd_allreduce: 4.76 | step: 43.14
- 51%|█████     | 2944/5800 [8:12:59<5:27:48,  6.89s/it]                                                       {'loss': 0.0449, 'grad_norm': 8.498647689819336, 'learning_rate': 2.0480183091341164e-05, 'epoch': 25.38}
- 51%|█████     | 2944/5800 [8:12:59<5:27:48,  6.89s/it]score1 tensor([[0.4805],
-        [0.6836],
-        [0.1973],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6562, 0.1787, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0178, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:36,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 16:59:36,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.42 | bwd_microstep: 4616.63 | bwd_inner_microstep: 4611.88 | bwd_allreduce_microstep: 4.67 | step_microstep: 44.72
-[2025-01-25 16:59:36,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.39 | bwd: 4616.66 | bwd_inner: 4611.88 | bwd_allreduce: 4.71 | step: 44.72
- 51%|█████     | 2945/5800 [8:13:06<5:27:49,  6.89s/it]                                                       {'loss': 0.0178, 'grad_norm': 3.6857452392578125, 'learning_rate': 2.0469018114625085e-05, 'epoch': 25.39}
- 51%|█████     | 2945/5800 [8:13:06<5:27:49,  6.89s/it]score1 tensor([[0.5078],
-        [0.4336],
-        [0.4961],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4004, 0.4863, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:42,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 16:59:42,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4615.52 | bwd_inner_microstep: 4610.57 | bwd_allreduce_microstep: 4.87 | step_microstep: 46.37
-[2025-01-25 16:59:42,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4615.54 | bwd_inner: 4610.57 | bwd_allreduce: 4.91 | step: 46.37
- 51%|█████     | 2946/5800 [8:13:12<5:27:56,  6.89s/it]                                                       {'loss': 0.019, 'grad_norm': 3.955723285675049, 'learning_rate': 2.0457852991661012e-05, 'epoch': 25.4}
- 51%|█████     | 2946/5800 [8:13:12<5:27:56,  6.89s/it]score1 tensor([[0.4805],
-        [0.5586],
-        [0.3770],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5352, 0.4004, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:49,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 16:59:49,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.22 | bwd_microstep: 4609.82 | bwd_inner_microstep: 4604.71 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.23
-[2025-01-25 16:59:49,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.17 | bwd: 4609.84 | bwd_inner: 4604.71 | bwd_allreduce: 5.06 | step: 44.23
- 51%|█████     | 2947/5800 [8:13:19<5:27:44,  6.89s/it]                                                       {'loss': 0.0225, 'grad_norm': 0.6071322560310364, 'learning_rate': 2.044668772593044e-05, 'epoch': 25.41}
- 51%|█████     | 2947/5800 [8:13:19<5:27:44,  6.89s/it]score1 tensor([[0.5430],
-        [0.4102],
-        [0.4082],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4023, 0.4023, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 16:59:56,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 16:59:56,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.02 | bwd_microstep: 4616.84 | bwd_inner_microstep: 4611.58 | bwd_allreduce_microstep: 5.18 | step_microstep: 44.99
-[2025-01-25 16:59:56,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.98 | bwd: 4616.86 | bwd_inner: 4611.58 | bwd_allreduce: 5.22 | step: 44.99
- 51%|█████     | 2948/5800 [8:13:26<5:27:46,  6.90s/it]                                                       {'loss': 0.0132, 'grad_norm': 1.0104421377182007, 'learning_rate': 2.0435522320914886e-05, 'epoch': 25.41}
- 51%|█████     | 2948/5800 [8:13:26<5:27:46,  6.90s/it]score1 tensor([[0.4922],
-        [0.4727],
-        [0.5820],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4961, 0.6016, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:03,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 17:00:03,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.06 | bwd_microstep: 4611.34 | bwd_inner_microstep: 4606.35 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.38
-[2025-01-25 17:00:03,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.03 | bwd: 4611.37 | bwd_inner: 4606.35 | bwd_allreduce: 4.95 | step: 43.38
- 51%|█████     | 2949/5800 [8:13:33<5:27:35,  6.89s/it]                                                       {'loss': 0.0161, 'grad_norm': 8.136398315429688, 'learning_rate': 2.0424356780095908e-05, 'epoch': 25.42}
- 51%|█████     | 2949/5800 [8:13:33<5:27:35,  6.89s/it]score1 tensor([[0.4316],
-        [0.4590],
-        [0.5547],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4531, 0.5625, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:10,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 17:00:10,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.76 | bwd_microstep: 4614.47 | bwd_inner_microstep: 4608.91 | bwd_allreduce_microstep: 5.48 | step_microstep: 41.92
-[2025-01-25 17:00:10,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.72 | bwd: 4614.49 | bwd_inner: 4608.91 | bwd_allreduce: 5.52 | step: 41.92
- 51%|█████     | 2950/5800 [8:13:40<5:27:24,  6.89s/it]                                                       {'loss': 0.0107, 'grad_norm': 0.7744698524475098, 'learning_rate': 2.0413191106955123e-05, 'epoch': 25.43}
- 51%|█████     | 2950/5800 [8:13:40<5:27:24,  6.89s/it]score1 tensor([[0.3398],
-        [0.4180],
-        [0.5078],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.4277, 0.5078, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:17,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 17:00:17,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.23 | bwd_microstep: 4569.17 | bwd_inner_microstep: 4563.95 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.18
-[2025-01-25 17:00:17,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.20 | bwd: 4569.19 | bwd_inner: 4563.95 | bwd_allreduce: 5.17 | step: 43.19
- 51%|█████     | 2951/5800 [8:13:47<5:26:41,  6.88s/it]                                                       {'loss': 0.0083, 'grad_norm': 5.527121543884277, 'learning_rate': 2.040202530497418e-05, 'epoch': 25.44}
- 51%|█████     | 2951/5800 [8:13:47<5:26:41,  6.88s/it]score1 tensor([[0.6172],
-        [0.3379],
-        [0.5625],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4043, 0.5625, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:24,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 17:00:24,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.45 | bwd_microstep: 4575.57 | bwd_inner_microstep: 4570.50 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.22
-[2025-01-25 17:00:24,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.42 | bwd: 4575.60 | bwd_inner: 4570.50 | bwd_allreduce: 5.02 | step: 43.24
- 51%|█████     | 2952/5800 [8:13:54<5:26:23,  6.88s/it]                                                       {'loss': 0.0254, 'grad_norm': 1.829777717590332, 'learning_rate': 2.039085937763478e-05, 'epoch': 25.45}
- 51%|█████     | 2952/5800 [8:13:54<5:26:23,  6.88s/it]score1 tensor([[0.4629],
-        [0.3398],
-        [0.6328],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3691, 0.6289, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:31,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:00:31,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4615.45 | bwd_inner_microstep: 4609.89 | bwd_allreduce_microstep: 5.42 | step_microstep: 41.72
-[2025-01-25 17:00:31,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.70 | bwd: 4615.48 | bwd_inner: 4609.89 | bwd_allreduce: 5.50 | step: 41.73
- 51%|█████     | 2953/5800 [8:14:01<5:26:33,  6.88s/it]                                                       {'loss': 0.0151, 'grad_norm': 3.4196081161499023, 'learning_rate': 2.037969332841864e-05, 'epoch': 25.46}
- 51%|█████     | 2953/5800 [8:14:01<5:26:33,  6.88s/it]score1 tensor([[0.5859],
-        [0.6094],
-        [0.5938],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6133, 0.5664, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:38,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 17:00:38,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.88 | bwd_microstep: 4618.33 | bwd_inner_microstep: 4613.32 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.68
-[2025-01-25 17:00:38,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.83 | bwd: 4618.35 | bwd_inner: 4613.32 | bwd_allreduce: 4.96 | step: 42.69
- 51%|█████     | 2954/5800 [8:14:08<5:26:45,  6.89s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.8249691128730774, 'learning_rate': 2.036852716080753e-05, 'epoch': 25.47}
- 51%|█████     | 2954/5800 [8:14:08<5:26:45,  6.89s/it]score1 tensor([[0.5430],
-        [0.5312],
-        [0.5000],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5391, 0.4980, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:44,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 17:00:44,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.14 | bwd_microstep: 4610.61 | bwd_inner_microstep: 4605.72 | bwd_allreduce_microstep: 4.78 | step_microstep: 42.98
-[2025-01-25 17:00:44,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.11 | bwd: 4610.63 | bwd_inner: 4605.72 | bwd_allreduce: 4.83 | step: 42.98
- 51%|█████     | 2955/5800 [8:14:14<5:26:33,  6.89s/it]                                                       {'loss': 0.0239, 'grad_norm': 4.158675670623779, 'learning_rate': 2.0357360878283252e-05, 'epoch': 25.47}
- 51%|█████     | 2955/5800 [8:14:14<5:26:33,  6.89s/it]score1 tensor([[0.5195],
-        [0.4727],
-        [0.4688],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4902, 0.4395, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:51,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 17:00:51,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.59 | bwd_microstep: 4613.45 | bwd_inner_microstep: 4608.23 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.35
-[2025-01-25 17:00:51,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.56 | bwd: 4613.47 | bwd_inner: 4608.23 | bwd_allreduce: 5.17 | step: 43.36
- 51%|█████     | 2956/5800 [8:14:21<5:26:28,  6.89s/it]                                                       {'loss': 0.0298, 'grad_norm': 4.0042853355407715, 'learning_rate': 2.0346194484327658e-05, 'epoch': 25.48}
- 51%|█████     | 2956/5800 [8:14:21<5:26:28,  6.89s/it]score1 tensor([[0.6406],
-        [0.6016],
-        [0.4668],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5625, 0.4844, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:00:58,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 17:00:58,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4616.99 | bwd_inner_microstep: 4611.86 | bwd_allreduce_microstep: 5.00 | step_microstep: 47.47
-[2025-01-25 17:00:58,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4617.01 | bwd_inner: 4611.86 | bwd_allreduce: 5.08 | step: 47.48
- 51%|█████     | 2957/5800 [8:14:28<5:26:32,  6.89s/it]                                                       {'loss': 0.0234, 'grad_norm': 1.2290302515029907, 'learning_rate': 2.0335027982422614e-05, 'epoch': 25.49}
- 51%|█████     | 2957/5800 [8:14:28<5:26:32,  6.89s/it]score1 tensor([[0.5625],
-        [0.4551],
-        [0.5898],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4570, 0.6094, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:05,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 17:01:05,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.09 | bwd_microstep: 4620.84 | bwd_inner_microstep: 4615.66 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.00
-[2025-01-25 17:01:05,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4620.87 | bwd_inner: 4615.66 | bwd_allreduce: 5.12 | step: 44.00
- 51%|█████     | 2958/5800 [8:14:35<5:26:35,  6.90s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.5548405051231384, 'learning_rate': 2.0323861376050035e-05, 'epoch': 25.5}
- 51%|█████     | 2958/5800 [8:14:35<5:26:35,  6.90s/it]score1 tensor([[0.7344],
-        [0.5078],
-        [0.4395],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5156, 0.4180, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:12,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 17:01:12,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.70 | bwd_microstep: 4633.68 | bwd_inner_microstep: 4628.68 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.92
-[2025-01-25 17:01:12,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.66 | bwd: 4633.70 | bwd_inner: 4628.68 | bwd_allreduce: 4.95 | step: 41.92
- 51%|█████     | 2959/5800 [8:14:42<5:26:44,  6.90s/it]                                                       {'loss': 0.02, 'grad_norm': 4.451251983642578, 'learning_rate': 2.0312694668691857e-05, 'epoch': 25.51}
- 51%|█████     | 2959/5800 [8:14:42<5:26:44,  6.90s/it]score1 tensor([[0.4688],
-        [0.5312],
-        [0.5508],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5391, 0.5195, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:19,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 17:01:19,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.03 | bwd_microstep: 4633.43 | bwd_inner_microstep: 4628.42 | bwd_allreduce_microstep: 4.94 | step_microstep: 50.39
-[2025-01-25 17:01:19,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.00 | bwd: 4633.46 | bwd_inner: 4628.42 | bwd_allreduce: 4.98 | step: 50.40
- 51%|█████     | 2960/5800 [8:14:49<5:26:50,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 3.788355827331543, 'learning_rate': 2.0301527863830054e-05, 'epoch': 25.52}
- 51%|█████     | 2960/5800 [8:14:49<5:26:50,  6.91s/it]score1 tensor([[0.5312],
-        [0.4141],
-        [0.6172],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4043, 0.6055, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:26,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 17:01:26,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.12 | bwd_microstep: 4639.15 | bwd_inner_microstep: 4634.10 | bwd_allreduce_microstep: 4.94 | step_microstep: 41.44
-[2025-01-25 17:01:26,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.09 | bwd: 4639.18 | bwd_inner: 4634.10 | bwd_allreduce: 5.00 | step: 41.45
- 51%|█████     | 2961/5800 [8:14:56<5:26:57,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 8.810965538024902, 'learning_rate': 2.029036096494663e-05, 'epoch': 25.53}
- 51%|█████     | 2961/5800 [8:14:56<5:26:57,  6.91s/it]score1 tensor([[0.4863],
-        [0.6328],
-        [0.5234],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5352, 0.6055, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0483, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:33,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 17:01:33,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.36 | bwd_microstep: 4642.52 | bwd_inner_microstep: 4637.73 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.68
-[2025-01-25 17:01:33,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.31 | bwd: 4642.54 | bwd_inner: 4637.73 | bwd_allreduce: 4.74 | step: 42.69
- 51%|█████     | 2962/5800 [8:15:03<5:27:05,  6.92s/it]                                                       {'loss': 0.0483, 'grad_norm': 3.671034812927246, 'learning_rate': 2.0279193975523625e-05, 'epoch': 25.53}
- 51%|█████     | 2962/5800 [8:15:03<5:27:05,  6.92s/it]score1 tensor([[0.4414],
-        [0.4941],
-        [0.5664],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5195, 0.6211, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:40,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 17:01:40,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.73 | bwd_microstep: 4633.86 | bwd_inner_microstep: 4628.80 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.60
-[2025-01-25 17:01:40,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.70 | bwd: 4633.89 | bwd_inner: 4628.80 | bwd_allreduce: 4.99 | step: 42.61
- 51%|█████     | 2963/5800 [8:15:10<5:27:06,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 4.083914756774902, 'learning_rate': 2.026802689904309e-05, 'epoch': 25.54}
- 51%|█████     | 2963/5800 [8:15:10<5:27:06,  6.92s/it]score1 tensor([[0.5898],
-        [0.4277],
-        [0.4199],
-        [0.7266]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4023, 0.4121, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:47,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 17:01:47,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4643.75 | bwd_inner_microstep: 4638.41 | bwd_allreduce_microstep: 5.23 | step_microstep: 47.51
-[2025-01-25 17:01:47,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4643.78 | bwd_inner: 4638.41 | bwd_allreduce: 5.28 | step: 47.51
- 51%|█████     | 2964/5800 [8:15:17<5:27:06,  6.92s/it]                                                       {'loss': 0.0181, 'grad_norm': 4.053958415985107, 'learning_rate': 2.025685973898712e-05, 'epoch': 25.55}
- 51%|█████     | 2964/5800 [8:15:17<5:27:06,  6.92s/it]score1 tensor([[0.3496],
-        [0.3652],
-        [0.4746],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.3672, 0.4980, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:01:54,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:01:54,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.53 | bwd_microstep: 4643.18 | bwd_inner_microstep: 4636.87 | bwd_allreduce_microstep: 6.18 | step_microstep: 42.48
-[2025-01-25 17:01:54,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.50 | bwd: 4643.21 | bwd_inner: 4636.87 | bwd_allreduce: 6.25 | step: 42.48
- 51%|█████     | 2965/5800 [8:15:24<5:27:10,  6.92s/it]                                                       {'loss': 0.0083, 'grad_norm': 3.51613450050354, 'learning_rate': 2.0245692498837825e-05, 'epoch': 25.56}
- 51%|█████     | 2965/5800 [8:15:24<5:27:10,  6.92s/it]score1 tensor([[0.6250],
-        [0.5156],
-        [0.5273],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4805, 0.5117, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:00,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:02:01,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.84 | bwd_microstep: 4639.38 | bwd_inner_microstep: 4634.43 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.19
-[2025-01-25 17:02:01,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.82 | bwd: 4639.41 | bwd_inner: 4634.43 | bwd_allreduce: 4.90 | step: 42.20
- 51%|█████     | 2966/5800 [8:15:30<5:27:03,  6.92s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.726541042327881, 'learning_rate': 2.0234525182077344e-05, 'epoch': 25.57}
- 51%|█████     | 2966/5800 [8:15:30<5:27:03,  6.92s/it]score1 tensor([[0.5625],
-        [0.6758],
-        [0.5391],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6445, 0.5078, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:07,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 17:02:07,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.98 | bwd_microstep: 4634.05 | bwd_inner_microstep: 4629.09 | bwd_allreduce_microstep: 4.88 | step_microstep: 42.13
-[2025-01-25 17:02:07,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.94 | bwd: 4634.08 | bwd_inner: 4629.09 | bwd_allreduce: 4.92 | step: 42.15
- 51%|█████     | 2967/5800 [8:15:37<5:26:48,  6.92s/it]                                                       {'loss': 0.0322, 'grad_norm': 9.104907989501953, 'learning_rate': 2.0223357792187844e-05, 'epoch': 25.58}
- 51%|█████     | 2967/5800 [8:15:37<5:26:48,  6.92s/it]score1 tensor([[0.4863],
-        [0.5742],
-        [0.4590],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5508, 0.4395, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:14,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 17:02:14,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.39 | bwd_microstep: 4642.64 | bwd_inner_microstep: 4634.36 | bwd_allreduce_microstep: 8.19 | step_microstep: 42.33
-[2025-01-25 17:02:14,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.36 | bwd: 4642.66 | bwd_inner: 4634.36 | bwd_allreduce: 8.23 | step: 42.34
- 51%|█████     | 2968/5800 [8:15:44<5:26:43,  6.92s/it]                                                       {'loss': 0.0146, 'grad_norm': 7.965229511260986, 'learning_rate': 2.0212190332651508e-05, 'epoch': 25.59}
- 51%|█████     | 2968/5800 [8:15:44<5:26:43,  6.92s/it]score1 tensor([[0.4043],
-        [0.4316],
-        [0.5000],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.4277, 0.5156, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:21,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 17:02:21,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.93 | bwd_microstep: 4639.29 | bwd_inner_microstep: 4634.49 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.04
-[2025-01-25 17:02:21,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4639.31 | bwd_inner: 4634.49 | bwd_allreduce: 4.75 | step: 43.04
- 51%|█████     | 2969/5800 [8:15:51<5:26:41,  6.92s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.4149737060070038, 'learning_rate': 2.0201022806950544e-05, 'epoch': 25.59}
- 51%|█████     | 2969/5800 [8:15:51<5:26:41,  6.92s/it]score1 tensor([[0.5078],
-        [0.5039],
-        [0.4375],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4707, 0.4473, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:28,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 7.76
-[2025-01-25 17:02:28,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.75 | bwd_microstep: 4633.87 | bwd_inner_microstep: 4628.51 | bwd_allreduce_microstep: 5.24 | step_microstep: 46.14
-[2025-01-25 17:02:28,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4633.89 | bwd_inner: 4628.51 | bwd_allreduce: 5.30 | step: 46.15
- 51%|█████     | 2970/5800 [8:15:58<5:26:28,  6.92s/it]                                                       {'loss': 0.0151, 'grad_norm': 3.814573049545288, 'learning_rate': 2.0189855218567184e-05, 'epoch': 25.6}
- 51%|█████     | 2970/5800 [8:15:58<5:26:28,  6.92s/it]score1 tensor([[0.6172],
-        [0.5977],
-        [0.4531],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5547, 0.4375, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:35,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 17:02:35,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4575.27 | bwd_inner_microstep: 4570.31 | bwd_allreduce_microstep: 4.85 | step_microstep: 41.44
-[2025-01-25 17:02:35,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.63 | bwd: 4575.29 | bwd_inner: 4570.31 | bwd_allreduce: 4.90 | step: 41.45
- 51%|█████     | 2971/5800 [8:16:05<5:25:21,  6.90s/it]                                                       {'loss': 0.0186, 'grad_norm': 6.044720649719238, 'learning_rate': 2.0178687570983668e-05, 'epoch': 25.61}
- 51%|█████     | 2971/5800 [8:16:05<5:25:21,  6.90s/it]score1 tensor([[0.4746],
-        [0.4473],
-        [0.6680],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4121, 0.6953, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:42,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 17:02:42,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.14 | bwd_microstep: 4645.30 | bwd_inner_microstep: 4640.10 | bwd_allreduce_microstep: 5.11 | step_microstep: 42.77
-[2025-01-25 17:02:42,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.10 | bwd: 4645.32 | bwd_inner: 4640.10 | bwd_allreduce: 5.15 | step: 42.77
- 51%|█████     | 2972/5800 [8:16:12<5:25:39,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 4.468846797943115, 'learning_rate': 2.016751986768227e-05, 'epoch': 25.62}
- 51%|█████     | 2972/5800 [8:16:12<5:25:39,  6.91s/it]score1 tensor([[0.5781],
-        [0.6133],
-        [0.4922],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6211, 0.5039, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:49,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.33 | optimizer_step: 4.36
-[2025-01-25 17:02:49,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.19 | bwd_microstep: 4630.34 | bwd_inner_microstep: 4625.32 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.60
-[2025-01-25 17:02:49,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4630.37 | bwd_inner: 4625.32 | bwd_allreduce: 4.98 | step: 43.60
- 51%|█████▏    | 2973/5800 [8:16:19<5:25:35,  6.91s/it]                                                       {'loss': 0.0215, 'grad_norm': 4.1070237159729, 'learning_rate': 2.015635211214527e-05, 'epoch': 25.63}
- 51%|█████▏    | 2973/5800 [8:16:19<5:25:35,  6.91s/it]score1 tensor([[0.4668],
-        [0.3848],
-        [0.5234],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.3789, 0.4766, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:02:56,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 17:02:56,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.73 | bwd_microstep: 4631.65 | bwd_inner_microstep: 4626.47 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.77
-[2025-01-25 17:02:56,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.69 | bwd: 4631.67 | bwd_inner: 4626.47 | bwd_allreduce: 5.11 | step: 45.77
- 51%|█████▏    | 2974/5800 [8:16:26<5:25:41,  6.91s/it]                                                       {'loss': 0.0215, 'grad_norm': 3.687870740890503, 'learning_rate': 2.0145184307854966e-05, 'epoch': 25.64}
- 51%|█████▏    | 2974/5800 [8:16:26<5:25:41,  6.91s/it]score1 tensor([[0.5586],
-        [0.4160],
-        [0.4551],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4238, 0.4375, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:03,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 17:03:03,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.86 | bwd_microstep: 4630.07 | bwd_inner_microstep: 4625.13 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.05
-[2025-01-25 17:03:03,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.83 | bwd: 4630.10 | bwd_inner: 4625.13 | bwd_allreduce: 4.90 | step: 43.05
- 51%|█████▏    | 2975/5800 [8:16:33<5:25:33,  6.91s/it]                                                       {'loss': 0.0083, 'grad_norm': 4.472708225250244, 'learning_rate': 2.013401645829368e-05, 'epoch': 25.65}
- 51%|█████▏    | 2975/5800 [8:16:33<5:25:33,  6.91s/it]score1 tensor([[0.4219],
-        [0.5859],
-        [0.5156],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6172, 0.5312, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:10,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 17:03:10,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4632.35 | bwd_inner_microstep: 4627.57 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.64
-[2025-01-25 17:03:10,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.32 | bwd: 4632.38 | bwd_inner: 4627.57 | bwd_allreduce: 4.75 | step: 43.64
- 51%|█████▏    | 2976/5800 [8:16:40<5:25:29,  6.92s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.7317002415657043, 'learning_rate': 2.012284856694373e-05, 'epoch': 25.66}
- 51%|█████▏    | 2976/5800 [8:16:40<5:25:29,  6.92s/it]score1 tensor([[0.5195],
-        [0.4922],
-        [0.3613],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4863, 0.3926, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:17,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 17:03:17,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4633.86 | bwd_inner_microstep: 4629.04 | bwd_allreduce_microstep: 4.73 | step_microstep: 45.07
-[2025-01-25 17:03:17,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.79 | bwd: 4633.89 | bwd_inner: 4629.04 | bwd_allreduce: 4.77 | step: 45.08
- 51%|█████▏    | 2977/5800 [8:16:47<5:25:31,  6.92s/it]                                                       {'loss': 0.0239, 'grad_norm': 3.858121871948242, 'learning_rate': 2.011168063728747e-05, 'epoch': 25.66}
- 51%|█████▏    | 2977/5800 [8:16:47<5:25:31,  6.92s/it]score1 tensor([[0.3965],
-        [0.4648],
-        [0.3594],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4863, 0.3340, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:23,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 17:03:23,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4633.64 | bwd_inner_microstep: 4628.60 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.13
-[2025-01-25 17:03:23,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.14 | bwd: 4633.67 | bwd_inner: 4628.60 | bwd_allreduce: 5.00 | step: 43.14
- 51%|█████▏    | 2978/5800 [8:16:53<5:25:22,  6.92s/it]                                                       {'loss': 0.0229, 'grad_norm': 1.0422836542129517, 'learning_rate': 2.010051267280725e-05, 'epoch': 25.67}
- 51%|█████▏    | 2978/5800 [8:16:53<5:25:22,  6.92s/it]score1 tensor([[0.4688],
-        [0.3887],
-        [0.5039],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4004, 0.5312, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:30,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 17:03:30,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.16 | bwd_microstep: 4594.73 | bwd_inner_microstep: 4589.54 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.29
-[2025-01-25 17:03:30,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.12 | bwd: 4594.76 | bwd_inner: 4589.53 | bwd_allreduce: 5.15 | step: 43.30
- 51%|█████▏    | 2979/5800 [8:17:00<5:24:40,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 5.781589984893799, 'learning_rate': 2.0089344676985433e-05, 'epoch': 25.68}
- 51%|█████▏    | 2979/5800 [8:17:00<5:24:40,  6.91s/it]score1 tensor([[0.5703],
-        [0.5781],
-        [0.5156],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5977, 0.5117, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:37,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 17:03:37,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4645.77 | bwd_inner_microstep: 4640.60 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.12
-[2025-01-25 17:03:37,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4645.80 | bwd_inner: 4640.60 | bwd_allreduce: 5.10 | step: 43.12
- 51%|█████▏    | 2980/5800 [8:17:07<5:24:54,  6.91s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.533493995666504, 'learning_rate': 2.0078176653304394e-05, 'epoch': 25.69}
- 51%|█████▏    | 2980/5800 [8:17:07<5:24:54,  6.91s/it]score1 tensor([[0.4062],
-        [0.4824],
-        [0.4062],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4590, 0.4512, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:44,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 17:03:44,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.87 | bwd_microstep: 4631.27 | bwd_inner_microstep: 4626.20 | bwd_allreduce_microstep: 4.97 | step_microstep: 42.83
-[2025-01-25 17:03:44,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.82 | bwd: 4631.29 | bwd_inner: 4626.20 | bwd_allreduce: 5.02 | step: 42.84
- 51%|█████▏    | 2981/5800 [8:17:14<5:24:45,  6.91s/it]                                                       {'loss': 0.0215, 'grad_norm': 0.7559143304824829, 'learning_rate': 2.006700860524652e-05, 'epoch': 25.7}
- 51%|█████▏    | 2981/5800 [8:17:14<5:24:45,  6.91s/it]score1 tensor([[0.6602],
-        [0.5859],
-        [0.5547],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5508, 0.5156, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:51,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 17:03:51,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.88 | bwd_microstep: 4635.78 | bwd_inner_microstep: 4631.15 | bwd_allreduce_microstep: 4.55 | step_microstep: 42.42
-[2025-01-25 17:03:51,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.85 | bwd: 4635.81 | bwd_inner: 4631.15 | bwd_allreduce: 4.59 | step: 42.43
- 51%|█████▏    | 2982/5800 [8:17:21<5:24:39,  6.91s/it]                                                       {'loss': 0.0283, 'grad_norm': 4.773045063018799, 'learning_rate': 2.00558405362942e-05, 'epoch': 25.71}
- 51%|█████▏    | 2982/5800 [8:17:21<5:24:39,  6.91s/it]score1 tensor([[0.6094],
-        [0.5820],
-        [0.4453],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4961, 0.4297, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:03:58,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 17:03:58,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4628.95 | bwd_inner_microstep: 4624.10 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.23
-[2025-01-25 17:03:58,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4628.97 | bwd_inner: 4624.10 | bwd_allreduce: 4.80 | step: 42.23
- 51%|█████▏    | 2983/5800 [8:17:28<5:24:33,  6.91s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.324300765991211, 'learning_rate': 2.0044672449929836e-05, 'epoch': 25.72}
- 51%|█████▏    | 2983/5800 [8:17:28<5:24:33,  6.91s/it]score1 tensor([[0.5820],
-        [0.4395],
-        [0.4844],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4395, 0.4785, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:04:05,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 17:04:05,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4578.29 | bwd_inner_microstep: 4573.55 | bwd_allreduce_microstep: 4.67 | step_microstep: 41.92
-[2025-01-25 17:04:05,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.68 | bwd: 4578.32 | bwd_inner: 4573.55 | bwd_allreduce: 4.70 | step: 41.92
- 51%|█████▏    | 2984/5800 [8:17:35<5:23:42,  6.90s/it]                                                       {'loss': 0.0103, 'grad_norm': 6.691514015197754, 'learning_rate': 2.0033504349635825e-05, 'epoch': 25.72}
- 51%|█████▏    | 2984/5800 [8:17:35<5:23:42,  6.90s/it]score1 tensor([[0.4883],
-        [0.4727],
-        [0.6719],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4629, 0.6445, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:04:12,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 17:04:12,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.84 | bwd_microstep: 4641.27 | bwd_inner_microstep: 4636.43 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.97
-[2025-01-25 17:04:12,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.81 | bwd: 4641.30 | bwd_inner: 4636.42 | bwd_allreduce: 4.80 | step: 42.98
- 51%|█████▏    | 2985/5800 [8:17:42<5:23:59,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 0.9728752374649048, 'learning_rate': 2.0022336238894572e-05, 'epoch': 25.73}
- 51%|█████▏    | 2985/5800 [8:17:42<5:23:59,  6.91s/it]score1 tensor([[0.5977],
-        [0.4160],
-        [0.4648],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4141, 0.4473, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:04:19,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.36
-[2025-01-25 17:04:19,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4635.12 | bwd_inner_microstep: 4629.80 | bwd_allreduce_microstep: 5.25 | step_microstep: 47.54
-[2025-01-25 17:04:19,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.27 | bwd: 4635.15 | bwd_inner: 4629.80 | bwd_allreduce: 5.29 | step: 47.55
- 51%|█████▏    | 2986/5800 [8:17:49<5:24:09,  6.91s/it]                                                       {'loss': 0.0293, 'grad_norm': 8.301749229431152, 'learning_rate': 2.0011168121188492e-05, 'epoch': 25.74}
- 51%|█████▏    | 2986/5800 [8:17:49<5:24:09,  6.91s/it]score1 tensor([[0.5391],
-        [0.4785],
-        [0.5273],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4980, 0.5234, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:04:26,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 17:04:26,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.49 | bwd_microstep: 4632.67 | bwd_inner_microstep: 4627.64 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.36
-[2025-01-25 17:04:26,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.46 | bwd: 4632.69 | bwd_inner: 4627.64 | bwd_allreduce: 4.99 | step: 44.37
- 52%|█████▏    | 2987/5800 [8:17:56<5:24:00,  6.91s/it]                                                       {'loss': 0.0142, 'grad_norm': 3.7933731079101562, 'learning_rate': 2e-05, 'epoch': 25.75}
- 52%|█████▏    | 2987/5800 [8:17:56<5:24:00,  6.91s/it]score1 tensor([[0.6523],
-        [0.5352],
-        [0.4902],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.5508, 0.4922, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:04:33,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:04:33,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.34 | bwd_microstep: 4630.48 | bwd_inner_microstep: 4625.58 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.70
-[2025-01-25 17:04:33,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.30 | bwd: 4630.50 | bwd_inner: 4625.58 | bwd_allreduce: 4.85 | step: 41.70
- 52%|█████▏    | 2988/5800 [8:18:03<5:23:49,  6.91s/it]                                                       {'loss': 0.0098, 'grad_norm': 4.644787311553955, 'learning_rate': 1.9988831878811515e-05, 'epoch': 25.76}
- 52%|█████▏    | 2988/5800 [8:18:03<5:23:49,  6.91s/it]score1 tensor([[0.4395],
-        [0.4102],
-        [0.5508],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4258, 0.5508, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:04:39,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 17:04:39,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.11 | bwd_microstep: 4576.89 | bwd_inner_microstep: 4572.03 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.12
-[2025-01-25 17:04:39,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.07 | bwd: 4576.91 | bwd_inner: 4572.03 | bwd_allreduce: 4.81 | step: 43.12
- 52%|█████▏    | 2989/5800 [8:18:09<5:22:58,  6.89s/it]                                                       {'loss': 0.0093, 'grad_norm': 5.643258571624756, 'learning_rate': 1.997766376110543e-05, 'epoch': 25.77}
- 52%|█████▏    | 2989/5800 [8:18:09<5:22:58,  6.89s/it]score1 tensor([[0.5469],
-        [0.6172],
-        [0.3848],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6094, 0.3750, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:04:46,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 17:04:46,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.43 | bwd_microstep: 4637.24 | bwd_inner_microstep: 4632.57 | bwd_allreduce_microstep: 4.59 | step_microstep: 40.95
-[2025-01-25 17:04:46,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.38 | bwd: 4637.26 | bwd_inner: 4632.57 | bwd_allreduce: 4.63 | step: 40.96
- 52%|█████▏    | 2990/5800 [8:18:16<5:23:10,  6.90s/it]                                                       {'loss': 0.0132, 'grad_norm': 4.048091888427734, 'learning_rate': 1.9966495650364185e-05, 'epoch': 25.78}
- 52%|█████▏    | 2990/5800 [8:18:16<5:23:10,  6.90s/it]evaluate!
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6758]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6172]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1582, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6211]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3926]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6562]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6562967433396514
-PLCC_score: 0.6582402957055535
-KRCC_score: 0.4643197475188998
-SRCC_level: 0.6562967433396514
-PLCC_level: 0.6582402957055535
-KRCC_level: 0.4643197475188998
-score1 tensor([[0.4336],
-        [0.6328],
-        [0.3926],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.6406, 0.3867, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:03,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 17:15:03,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.07 | bwd_microstep: 4599.88 | bwd_inner_microstep: 4594.88 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.49
-[2025-01-25 17:15:03,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.02 | bwd: 4599.90 | bwd_inner: 4594.88 | bwd_allreduce: 4.95 | step: 43.50
- 52%|█████▏    | 2991/5800 [8:28:33<148:04:34, 189.77s/it]                                                          {'loss': 0.0068, 'grad_norm': 3.273188352584839, 'learning_rate': 1.9955327550070168e-05, 'epoch': 25.78}
- 52%|█████▏    | 2991/5800 [8:28:33<148:04:34, 189.77s/it]score1 tensor([[0.4551],
-        [0.5742],
-        [0.4707],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5586, 0.4844, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:10,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 17:15:10,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.41 | bwd_microstep: 4568.79 | bwd_inner_microstep: 4564.22 | bwd_allreduce_microstep: 4.46 | step_microstep: 47.06
-[2025-01-25 17:15:10,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.38 | bwd: 4568.81 | bwd_inner: 4564.22 | bwd_allreduce: 4.52 | step: 47.07
- 52%|█████▏    | 2992/5800 [8:28:40<105:12:51, 134.89s/it]                                                          {'loss': 0.0195, 'grad_norm': 0.8277460932731628, 'learning_rate': 1.9944159463705804e-05, 'epoch': 25.79}
- 52%|█████▏    | 2992/5800 [8:28:40<105:12:51, 134.89s/it]score1 tensor([[0.4512],
-        [0.6719],
-        [0.5625],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.6836, 0.5430, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:16,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 17:15:16,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.22 | bwd_microstep: 4577.98 | bwd_inner_microstep: 4573.15 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.82
-[2025-01-25 17:15:16,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.19 | bwd: 4578.00 | bwd_inner: 4573.15 | bwd_allreduce: 4.79 | step: 43.82
- 52%|█████▏    | 2993/5800 [8:28:46<75:13:14, 96.47s/it]                                                          {'loss': 0.0146, 'grad_norm': 4.572220325469971, 'learning_rate': 1.993299139475348e-05, 'epoch': 25.8}
- 52%|█████▏    | 2993/5800 [8:28:46<75:13:14, 96.47s/it]score1 tensor([[0.4727],
-        [0.4434],
-        [0.4609],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4355, 0.4688, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:23,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:15:23,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.09 | bwd_microstep: 4590.47 | bwd_inner_microstep: 4585.30 | bwd_allreduce_microstep: 5.06 | step_microstep: 44.22
-[2025-01-25 17:15:23,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.02 | bwd: 4590.50 | bwd_inner: 4585.30 | bwd_allreduce: 5.12 | step: 44.23
- 52%|█████▏    | 2994/5800 [8:28:53<54:14:08, 69.58s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.5276705026626587, 'learning_rate': 1.992182334669561e-05, 'epoch': 25.81}
- 52%|█████▏    | 2994/5800 [8:28:53<54:14:08, 69.58s/it]score1 tensor([[0.4473],
-        [0.6836],
-        [0.4395],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.7031, 0.3262, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:30,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 17:15:30,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.30 | bwd_microstep: 4599.61 | bwd_inner_microstep: 4594.67 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.45
-[2025-01-25 17:15:30,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.26 | bwd: 4599.63 | bwd_inner: 4594.67 | bwd_allreduce: 4.90 | step: 43.45
- 52%|█████▏    | 2995/5800 [8:29:00<39:33:16, 50.77s/it]                                                        {'loss': 0.0425, 'grad_norm': 4.2213134765625, 'learning_rate': 1.9910655323014574e-05, 'epoch': 25.82}
- 52%|█████▏    | 2995/5800 [8:29:00<39:33:16, 50.77s/it]score1 tensor([[0.4297],
-        [0.5742],
-        [0.4434],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5625, 0.4707, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:37,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 17:15:37,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.06 | bwd_microstep: 4597.43 | bwd_inner_microstep: 4592.41 | bwd_allreduce_microstep: 4.92 | step_microstep: 48.52
-[2025-01-25 17:15:37,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.04 | bwd: 4597.46 | bwd_inner: 4592.41 | bwd_allreduce: 4.97 | step: 48.53
- 52%|█████▏    | 2996/5800 [8:29:07<29:17:01, 37.60s/it]                                                        {'loss': 0.0171, 'grad_norm': 4.10106086730957, 'learning_rate': 1.9899487327192757e-05, 'epoch': 25.83}
- 52%|█████▏    | 2996/5800 [8:29:07<29:17:01, 37.60s/it]score1 tensor([[0.4531],
-        [0.4844],
-        [0.4902],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5156, 0.5273, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:44,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 17:15:44,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.42 | bwd_microstep: 4604.46 | bwd_inner_microstep: 4599.36 | bwd_allreduce_microstep: 5.01 | step_microstep: 45.72
-[2025-01-25 17:15:44,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.37 | bwd: 4604.49 | bwd_inner: 4599.36 | bwd_allreduce: 5.05 | step: 45.71
- 52%|█████▏    | 2997/5800 [8:29:14<22:05:49, 28.38s/it]                                                        {'loss': 0.0273, 'grad_norm': 3.8265514373779297, 'learning_rate': 1.9888319362712535e-05, 'epoch': 25.84}
- 52%|█████▏    | 2997/5800 [8:29:14<22:05:49, 28.38s/it]score1 tensor([[0.4395],
-        [0.5586],
-        [0.5156],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5781, 0.5156, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:51,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 17:15:51,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.52 | bwd_microstep: 4550.74 | bwd_inner_microstep: 4546.10 | bwd_allreduce_microstep: 4.53 | step_microstep: 45.51
-[2025-01-25 17:15:51,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.48 | bwd: 4550.76 | bwd_inner: 4546.10 | bwd_allreduce: 4.59 | step: 45.52
- 52%|█████▏    | 2998/5800 [8:29:21<17:03:13, 21.91s/it]                                                        {'loss': 0.0098, 'grad_norm': 2.077080726623535, 'learning_rate': 1.9877151433056273e-05, 'epoch': 25.84}
- 52%|█████▏    | 2998/5800 [8:29:21<17:03:13, 21.91s/it]score1 tensor([[0.5664],
-        [0.5859],
-        [0.4648],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6094, 0.4551, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:15:58,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 17:15:58,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.68 | bwd_microstep: 4606.88 | bwd_inner_microstep: 4601.46 | bwd_allreduce_microstep: 5.32 | step_microstep: 47.71
-[2025-01-25 17:15:58,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.62 | bwd: 4606.91 | bwd_inner: 4601.46 | bwd_allreduce: 5.36 | step: 47.71
- 52%|█████▏    | 2999/5800 [8:29:28<13:34:27, 17.45s/it]                                                        {'loss': 0.0156, 'grad_norm': 3.688410997390747, 'learning_rate': 1.9865983541706328e-05, 'epoch': 25.85}
- 52%|█████▏    | 2999/5800 [8:29:28<13:34:27, 17.45s/it]score1 tensor([[0.5742],
-        [0.5156],
-        [0.4902],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4844, 0.4590, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:05,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 17:16:05,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.71 | bwd_microstep: 4602.71 | bwd_inner_microstep: 4598.05 | bwd_allreduce_microstep: 4.56 | step_microstep: 44.05
-[2025-01-25 17:16:05,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.65 | bwd: 4602.73 | bwd_inner: 4598.05 | bwd_allreduce: 4.61 | step: 44.05
- 52%|█████▏    | 3000/5800 [8:29:35<11:06:12, 14.28s/it]                                                        {'loss': 0.0254, 'grad_norm': 8.312097549438477, 'learning_rate': 1.985481569214504e-05, 'epoch': 25.86}
- 52%|█████▏    | 3000/5800 [8:29:35<11:06:12, 14.28s/it]score1 tensor([[0.3887],
-        [0.5234],
-        [0.6172],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5391, 0.6172, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:11,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 17:16:11,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.85 | bwd_microstep: 4564.62 | bwd_inner_microstep: 4559.67 | bwd_allreduce_microstep: 4.86 | step_microstep: 46.12
-[2025-01-25 17:16:11,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.82 | bwd: 4564.65 | bwd_inner: 4559.67 | bwd_allreduce: 4.91 | step: 46.13
- 52%|█████▏    | 3001/5800 [8:29:41<9:21:48, 12.04s/it]                                                        {'loss': 0.0171, 'grad_norm': 1.8583979606628418, 'learning_rate': 1.9843647887854734e-05, 'epoch': 25.87}
- 52%|█████▏    | 3001/5800 [8:29:41<9:21:48, 12.04s/it]score1 tensor([[0.6445],
-        [0.5078],
-        [0.4434],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4941, 0.4355, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:18,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 17:16:18,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.07 | bwd_microstep: 4566.82 | bwd_inner_microstep: 4562.11 | bwd_allreduce_microstep: 4.63 | step_microstep: 44.25
-[2025-01-25 17:16:18,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.02 | bwd: 4566.85 | bwd_inner: 4562.11 | bwd_allreduce: 4.67 | step: 44.25
- 52%|█████▏    | 3002/5800 [8:29:48<8:08:45, 10.48s/it]                                                       {'loss': 0.0103, 'grad_norm': 6.314230918884277, 'learning_rate': 1.983248013231774e-05, 'epoch': 25.88}
- 52%|█████▏    | 3002/5800 [8:29:48<8:08:45, 10.48s/it]score1 tensor([[0.3867],
-        [0.5703],
-        [0.3965],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.6133, 0.3750, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:25,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 17:16:25,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.53 | bwd_microstep: 4620.57 | bwd_inner_microstep: 4615.27 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.88
-[2025-01-25 17:16:25,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.46 | bwd: 4620.59 | bwd_inner: 4615.27 | bwd_allreduce: 5.26 | step: 43.88
- 52%|█████▏    | 3003/5800 [8:29:55<7:18:24,  9.40s/it]                                                       {'loss': 0.0264, 'grad_norm': 3.392733097076416, 'learning_rate': 1.9821312429016335e-05, 'epoch': 25.89}
- 52%|█████▏    | 3003/5800 [8:29:55<7:18:24,  9.40s/it]score1 tensor([[0.5039],
-        [0.4277],
-        [0.4570],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4512, 0.4941, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:32,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.37
-[2025-01-25 17:16:32,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.64 | bwd_microstep: 4622.25 | bwd_inner_microstep: 4617.03 | bwd_allreduce_microstep: 5.09 | step_microstep: 46.52
-[2025-01-25 17:16:32,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.61 | bwd: 4622.29 | bwd_inner: 4617.03 | bwd_allreduce: 5.17 | step: 46.53
- 52%|█████▏    | 3004/5800 [8:30:02<6:43:09,  8.65s/it]                                                       {'loss': 0.0317, 'grad_norm': 3.7106359004974365, 'learning_rate': 1.9810144781432826e-05, 'epoch': 25.9}
- 52%|█████▏    | 3004/5800 [8:30:02<6:43:09,  8.65s/it]score1 tensor([[0.5469],
-        [0.4648],
-        [0.4141],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4727, 0.4551, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0356, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:39,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 17:16:39,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.29 | bwd_microstep: 4615.05 | bwd_inner_microstep: 4610.05 | bwd_allreduce_microstep: 4.89 | step_microstep: 45.38
-[2025-01-25 17:16:39,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.25 | bwd: 4615.07 | bwd_inner: 4610.05 | bwd_allreduce: 4.95 | step: 45.39
- 52%|█████▏    | 3005/5800 [8:30:09<6:18:30,  8.13s/it]                                                       {'loss': 0.0356, 'grad_norm': 8.011798858642578, 'learning_rate': 1.979897719304946e-05, 'epoch': 25.91}
- 52%|█████▏    | 3005/5800 [8:30:09<6:18:30,  8.13s/it]score1 tensor([[0.4453],
-        [0.6406],
-        [0.4492],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.6562, 0.4453, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:46,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 17:16:46,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.56 | bwd_microstep: 4640.45 | bwd_inner_microstep: 4635.48 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.17
-[2025-01-25 17:16:46,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4640.47 | bwd_inner: 4635.48 | bwd_allreduce: 4.91 | step: 43.18
- 52%|█████▏    | 3006/5800 [8:30:16<6:01:31,  7.76s/it]                                                       {'loss': 0.0283, 'grad_norm': 4.387711524963379, 'learning_rate': 1.9787809667348496e-05, 'epoch': 25.91}
- 52%|█████▏    | 3006/5800 [8:30:16<6:01:31,  7.76s/it]score1 tensor([[0.6016],
-        [0.3691],
-        [0.5547],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4043, 0.5742, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:16:53,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 17:16:53,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4640.22 | bwd_inner_microstep: 4635.27 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.47
-[2025-01-25 17:16:53,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.67 | bwd: 4640.25 | bwd_inner: 4635.27 | bwd_allreduce: 4.91 | step: 45.48
- 52%|█████▏    | 3007/5800 [8:30:23<5:49:37,  7.51s/it]                                                       {'loss': 0.0181, 'grad_norm': 4.378654479980469, 'learning_rate': 1.9776642207812166e-05, 'epoch': 25.92}
- 52%|█████▏    | 3007/5800 [8:30:23<5:49:37,  7.51s/it]score1 tensor([[0.6172],
-        [0.5156],
-        [0.3926],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5273, 0.4473, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:00,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:17:00,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.97 | bwd_microstep: 4633.15 | bwd_inner_microstep: 4628.22 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.11
-[2025-01-25 17:17:00,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.93 | bwd: 4633.17 | bwd_inner: 4628.22 | bwd_allreduce: 4.87 | step: 44.12
- 52%|█████▏    | 3008/5800 [8:30:30<5:41:12,  7.33s/it]                                                       {'loss': 0.0264, 'grad_norm': 8.409396171569824, 'learning_rate': 1.9765474817922662e-05, 'epoch': 25.93}
- 52%|█████▏    | 3008/5800 [8:30:30<5:41:12,  7.33s/it]score1 tensor([[0.3906],
-        [0.5547],
-        [0.4082],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5820, 0.3906, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:07,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 17:17:07,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.50 | bwd_microstep: 4631.56 | bwd_inner_microstep: 4627.19 | bwd_allreduce_microstep: 4.29 | step_microstep: 44.00
-[2025-01-25 17:17:07,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.45 | bwd: 4631.59 | bwd_inner: 4627.19 | bwd_allreduce: 4.33 | step: 44.00
- 52%|█████▏    | 3009/5800 [8:30:37<5:35:12,  7.21s/it]                                                       {'loss': 0.019, 'grad_norm': 4.394618511199951, 'learning_rate': 1.9754307501162186e-05, 'epoch': 25.94}
- 52%|█████▏    | 3009/5800 [8:30:37<5:35:12,  7.21s/it]score1 tensor([[0.4492],
-        [0.3262],
-        [0.4492],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.3086, 0.4238, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:14,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 17:17:14,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.45 | bwd_microstep: 4635.46 | bwd_inner_microstep: 4629.82 | bwd_allreduce_microstep: 5.54 | step_microstep: 47.08
-[2025-01-25 17:17:14,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.42 | bwd: 4635.48 | bwd_inner: 4629.82 | bwd_allreduce: 5.59 | step: 47.09
- 52%|█████▏    | 3010/5800 [8:30:44<5:31:09,  7.12s/it]                                                       {'loss': 0.0142, 'grad_norm': 3.4122252464294434, 'learning_rate': 1.9743140261012884e-05, 'epoch': 25.95}
- 52%|█████▏    | 3010/5800 [8:30:44<5:31:09,  7.12s/it]score1 tensor([[0.5781],
-        [0.4297],
-        [0.5625],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4141, 0.5781, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:20,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 17:17:20,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.17 | bwd_microstep: 4636.61 | bwd_inner_microstep: 4631.62 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.71
-[2025-01-25 17:17:20,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.13 | bwd: 4636.63 | bwd_inner: 4631.62 | bwd_allreduce: 4.94 | step: 43.72
- 52%|█████▏    | 3011/5800 [8:30:50<5:28:12,  7.06s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.3846585750579834, 'learning_rate': 1.9731973100956915e-05, 'epoch': 25.96}
- 52%|█████▏    | 3011/5800 [8:30:50<5:28:12,  7.06s/it]score1 tensor([[0.5039],
-        [0.5195],
-        [0.5117],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4863, 0.4844, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:27,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 17:17:27,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.55 | bwd_microstep: 4644.51 | bwd_inner_microstep: 4639.60 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.78
-[2025-01-25 17:17:27,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.52 | bwd: 4644.53 | bwd_inner: 4639.60 | bwd_allreduce: 4.85 | step: 44.78
- 52%|█████▏    | 3012/5800 [8:30:57<5:26:08,  7.02s/it]                                                       {'loss': 0.0273, 'grad_norm': 0.5187074542045593, 'learning_rate': 1.972080602447638e-05, 'epoch': 25.97}
- 52%|█████▏    | 3012/5800 [8:30:57<5:26:08,  7.02s/it]score1 tensor([[0.5312],
-        [0.3945],
-        [0.5586],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.3398, 0.5664, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:34,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 17:17:34,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.29 | bwd_microstep: 4647.97 | bwd_inner_microstep: 4642.43 | bwd_allreduce_microstep: 5.36 | step_microstep: 46.71
-[2025-01-25 17:17:34,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.26 | bwd: 4647.99 | bwd_inner: 4642.43 | bwd_allreduce: 5.48 | step: 46.71
- 52%|█████▏    | 3013/5800 [8:31:04<5:24:49,  6.99s/it]                                                       {'loss': 0.0234, 'grad_norm': 3.7091503143310547, 'learning_rate': 1.9709639035053373e-05, 'epoch': 25.97}
- 52%|█████▏    | 3013/5800 [8:31:04<5:24:49,  6.99s/it]score1 tensor([[0.4883],
-        [0.4746],
-        [0.4648],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4473, 0.4570, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:41,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 17:17:41,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.22 | bwd_microstep: 4635.70 | bwd_inner_microstep: 4630.47 | bwd_allreduce_microstep: 5.13 | step_microstep: 46.35
-[2025-01-25 17:17:41,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.19 | bwd: 4635.73 | bwd_inner: 4630.47 | bwd_allreduce: 5.18 | step: 46.36
- 52%|█████▏    | 3014/5800 [8:31:11<5:23:45,  6.97s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.125685691833496, 'learning_rate': 1.9698472136169953e-05, 'epoch': 25.98}
- 52%|█████▏    | 3014/5800 [8:31:11<5:23:45,  6.97s/it]score1 tensor([[0.4688],
-        [0.4805],
-        [0.5508],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4922, 0.5664, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:48,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 17:17:48,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.87 | bwd_microstep: 4647.16 | bwd_inner_microstep: 4642.29 | bwd_allreduce_microstep: 4.78 | step_microstep: 44.42
-[2025-01-25 17:17:48,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.83 | bwd: 4647.18 | bwd_inner: 4642.29 | bwd_allreduce: 4.83 | step: 44.43
- 52%|█████▏    | 3015/5800 [8:31:18<5:22:59,  6.96s/it]                                                       {'loss': 0.0181, 'grad_norm': 4.244984149932861, 'learning_rate': 1.968730533130815e-05, 'epoch': 25.99}
- 52%|█████▏    | 3015/5800 [8:31:18<5:22:59,  6.96s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:17:53,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 17:17:53,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 576.92 | bwd_microstep: 1222.48 | bwd_inner_microstep: 1217.63 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.98
-[2025-01-25 17:17:53,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 576.87 | bwd: 1222.51 | bwd_inner: 1217.63 | bwd_allreduce: 4.81 | step: 44.99
- 52%|█████▏    | 3016/5800 [8:31:22<4:46:19,  6.17s/it]                                                       {'loss': 0.0098, 'grad_norm': 7.4328436851501465, 'learning_rate': 1.967613862394997e-05, 'epoch': 26.0}
- 52%|█████▏    | 3016/5800 [8:31:23<4:46:19,  6.17s/it][2025-01-25 17:17:57,429] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:18:07,499] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:18:17,539] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:18:27,295] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5391],
-        [0.5977],
-        [0.5312],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6094, 0.5273, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:18:44,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 17:18:44,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.80 | bwd_microstep: 4583.42 | bwd_inner_microstep: 4577.02 | bwd_allreduce_microstep: 6.28 | step_microstep: 49.67
-[2025-01-25 17:18:44,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.75 | bwd: 4583.44 | bwd_inner: 4577.02 | bwd_allreduce: 6.34 | step: 49.68
- 52%|█████▏    | 3017/5800 [8:32:14<15:13:19, 19.69s/it]                                                        {'loss': 0.0322, 'grad_norm': 0.42041516304016113, 'learning_rate': 1.9664972017577386e-05, 'epoch': 26.01}
- 52%|█████▏    | 3017/5800 [8:32:14<15:13:19, 19.69s/it]score1 tensor([[0.5625],
-        [0.6914],
-        [0.5234],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6953, 0.4961, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:18:51,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:18:51,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.44 | bwd_microstep: 4581.22 | bwd_inner_microstep: 4576.05 | bwd_allreduce_microstep: 5.05 | step_microstep: 45.72
-[2025-01-25 17:18:51,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.40 | bwd: 4581.24 | bwd_inner: 4576.05 | bwd_allreduce: 5.11 | step: 45.73
- 52%|█████▏    | 3018/5800 [8:32:21<12:14:22, 15.84s/it]                                                        {'loss': 0.0225, 'grad_norm': 3.9772369861602783, 'learning_rate': 1.965380551567235e-05, 'epoch': 26.02}
- 52%|█████▏    | 3018/5800 [8:32:21<12:14:22, 15.84s/it]score1 tensor([[0.5430],
-        [0.4512],
-        [0.5312],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4277, 0.4922, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:18:57,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 17:18:57,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.58 | bwd_microstep: 4591.84 | bwd_inner_microstep: 4586.60 | bwd_allreduce_microstep: 5.14 | step_microstep: 46.81
-[2025-01-25 17:18:57,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.55 | bwd: 4591.87 | bwd_inner: 4586.60 | bwd_allreduce: 5.19 | step: 46.82
- 52%|█████▏    | 3019/5800 [8:32:27<10:09:17, 13.15s/it]                                                        {'loss': 0.0176, 'grad_norm': 8.300670623779297, 'learning_rate': 1.9642639121716755e-05, 'epoch': 26.03}
- 52%|█████▏    | 3019/5800 [8:32:27<10:09:17, 13.15s/it]score1 tensor([[0.5234],
-        [0.4375],
-        [0.5547],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4297, 0.5547, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:04,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 17:19:04,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.06 | bwd_microstep: 4517.57 | bwd_inner_microstep: 4512.42 | bwd_allreduce_microstep: 5.03 | step_microstep: 44.41
-[2025-01-25 17:19:04,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.03 | bwd: 4517.60 | bwd_inner: 4512.42 | bwd_allreduce: 5.09 | step: 44.42
- 52%|█████▏    | 3020/5800 [8:32:34<8:40:34, 11.24s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.956268787384033, 'learning_rate': 1.963147283919248e-05, 'epoch': 26.03}
- 52%|█████▏    | 3020/5800 [8:32:34<8:40:34, 11.24s/it]score1 tensor([[0.5664],
-        [0.4922],
-        [0.3711],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4863, 0.3555, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:11,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 17:19:11,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.43 | bwd_microstep: 4566.64 | bwd_inner_microstep: 4561.26 | bwd_allreduce_microstep: 5.27 | step_microstep: 50.64
-[2025-01-25 17:19:11,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.39 | bwd: 4566.66 | bwd_inner: 4561.26 | bwd_allreduce: 5.32 | step: 50.65
- 52%|█████▏    | 3021/5800 [8:32:41<7:39:16,  9.92s/it]                                                       {'loss': 0.0063, 'grad_norm': 5.933192729949951, 'learning_rate': 1.9620306671581374e-05, 'epoch': 26.04}
- 52%|█████▏    | 3021/5800 [8:32:41<7:39:16,  9.92s/it]score1 tensor([[0.4160],
-        [0.6445],
-        [0.5039],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.6562, 0.5156, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:18,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:19:18,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.16 | bwd_microstep: 4596.08 | bwd_inner_microstep: 4590.56 | bwd_allreduce_microstep: 5.42 | step_microstep: 45.75
-[2025-01-25 17:19:18,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.14 | bwd: 4596.10 | bwd_inner: 4590.56 | bwd_allreduce: 5.48 | step: 45.76
- 52%|█████▏    | 3022/5800 [8:32:48<6:56:49,  9.00s/it]                                                       {'loss': 0.0176, 'grad_norm': 8.137880325317383, 'learning_rate': 1.9609140622365225e-05, 'epoch': 26.05}
- 52%|█████▏    | 3022/5800 [8:32:48<6:56:49,  9.00s/it]score1 tensor([[0.4414],
-        [0.6055],
-        [0.5430],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6133, 0.5508, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:25,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 17:19:25,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.96 | bwd_microstep: 4610.56 | bwd_inner_microstep: 4605.35 | bwd_allreduce_microstep: 5.12 | step_microstep: 44.94
-[2025-01-25 17:19:25,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.93 | bwd: 4610.59 | bwd_inner: 4605.35 | bwd_allreduce: 5.17 | step: 44.94
- 52%|█████▏    | 3023/5800 [8:32:55<6:27:15,  8.37s/it]                                                       {'loss': 0.0146, 'grad_norm': 8.710915565490723, 'learning_rate': 1.9597974695025824e-05, 'epoch': 26.06}
- 52%|█████▏    | 3023/5800 [8:32:55<6:27:15,  8.37s/it]score1 tensor([[0.3770],
-        [0.5586],
-        [0.5234],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5664, 0.5547, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:32,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:19:32,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.28 | bwd_microstep: 4608.86 | bwd_inner_microstep: 4603.08 | bwd_allreduce_microstep: 5.62 | step_microstep: 49.56
-[2025-01-25 17:19:32,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.24 | bwd: 4608.89 | bwd_inner: 4603.08 | bwd_allreduce: 5.70 | step: 49.56
- 52%|█████▏    | 3024/5800 [8:33:02<6:06:37,  7.92s/it]                                                       {'loss': 0.0117, 'grad_norm': 1.003050446510315, 'learning_rate': 1.958680889304488e-05, 'epoch': 26.07}
- 52%|█████▏    | 3024/5800 [8:33:02<6:06:37,  7.92s/it]score1 tensor([[0.5078],
-        [0.6250],
-        [0.4102],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6445, 0.4238, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:39,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 17:19:39,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.12 | bwd_microstep: 4605.08 | bwd_inner_microstep: 4599.28 | bwd_allreduce_microstep: 5.68 | step_microstep: 46.71
-[2025-01-25 17:19:39,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.08 | bwd: 4605.12 | bwd_inner: 4599.28 | bwd_allreduce: 5.74 | step: 46.75
- 52%|█████▏    | 3025/5800 [8:33:09<5:52:07,  7.61s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.508299827575684, 'learning_rate': 1.95756432199041e-05, 'epoch': 26.08}
- 52%|█████▏    | 3025/5800 [8:33:09<5:52:07,  7.61s/it]score1 tensor([[0.4297],
-        [0.4883],
-        [0.4258],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4707, 0.4258, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:45,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 17:19:45,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.69 | bwd_microstep: 4557.23 | bwd_inner_microstep: 4552.46 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.11
-[2025-01-25 17:19:45,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.65 | bwd: 4557.25 | bwd_inner: 4552.46 | bwd_allreduce: 4.73 | step: 43.11
- 52%|█████▏    | 3026/5800 [8:33:15<5:41:13,  7.38s/it]                                                       {'loss': 0.0127, 'grad_norm': 1.8398398160934448, 'learning_rate': 1.9564477679085127e-05, 'epoch': 26.09}
- 52%|█████▏    | 3026/5800 [8:33:15<5:41:13,  7.38s/it]score1 tensor([[0.4492],
-        [0.5703],
-        [0.5859],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5508, 0.5742, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:52,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.38
-[2025-01-25 17:19:52,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.61 | bwd_microstep: 4608.08 | bwd_inner_microstep: 4602.67 | bwd_allreduce_microstep: 5.31 | step_microstep: 51.56
-[2025-01-25 17:19:52,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.57 | bwd: 4608.11 | bwd_inner: 4602.67 | bwd_allreduce: 5.36 | step: 51.58
- 52%|█████▏    | 3027/5800 [8:33:22<5:34:22,  7.23s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.503227233886719, 'learning_rate': 1.9553312274069562e-05, 'epoch': 26.09}
- 52%|█████▏    | 3027/5800 [8:33:22<5:34:22,  7.23s/it]score1 tensor([[0.4570],
-        [0.5039],
-        [0.4922],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4805, 0.5078, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:19:59,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:19:59,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.70 | bwd_microstep: 4607.31 | bwd_inner_microstep: 4602.05 | bwd_allreduce_microstep: 5.16 | step_microstep: 45.46
-[2025-01-25 17:19:59,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.65 | bwd: 4607.33 | bwd_inner: 4602.05 | bwd_allreduce: 5.21 | step: 45.47
- 52%|█████▏    | 3028/5800 [8:33:29<5:29:27,  7.13s/it]                                                       {'loss': 0.0132, 'grad_norm': 4.437765598297119, 'learning_rate': 1.954214700833899e-05, 'epoch': 26.1}
- 52%|█████▏    | 3028/5800 [8:33:29<5:29:27,  7.13s/it]score1 tensor([[0.4629],
-        [0.4980],
-        [0.5195],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4707, 0.5391, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:06,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 17:20:06,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.39 | bwd_microstep: 4609.97 | bwd_inner_microstep: 4605.10 | bwd_allreduce_microstep: 4.79 | step_microstep: 44.57
-[2025-01-25 17:20:06,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.36 | bwd: 4609.99 | bwd_inner: 4605.10 | bwd_allreduce: 4.83 | step: 44.59
- 52%|█████▏    | 3029/5800 [8:33:36<5:25:52,  7.06s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.075150489807129, 'learning_rate': 1.9530981885374918e-05, 'epoch': 26.11}
- 52%|█████▏    | 3029/5800 [8:33:36<5:25:52,  7.06s/it]score1 tensor([[0.5938],
-        [0.5469],
-        [0.5625],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5586, 0.5469, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:13,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 17:20:13,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.85 | bwd_microstep: 4611.49 | bwd_inner_microstep: 4606.35 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.92
-[2025-01-25 17:20:13,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.80 | bwd: 4611.51 | bwd_inner: 4606.35 | bwd_allreduce: 5.09 | step: 45.93
- 52%|█████▏    | 3030/5800 [8:33:43<5:23:28,  7.01s/it]                                                       {'loss': 0.0293, 'grad_norm': 0.5756887197494507, 'learning_rate': 1.951981690865884e-05, 'epoch': 26.12}
- 52%|█████▏    | 3030/5800 [8:33:43<5:23:28,  7.01s/it]score1 tensor([[0.4746],
-        [0.4023],
-        [0.5469],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4004, 0.4961, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:20,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 17:20:20,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.23 | bwd_microstep: 4607.27 | bwd_inner_microstep: 4602.26 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.66
-[2025-01-25 17:20:20,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.18 | bwd: 4607.29 | bwd_inner: 4602.26 | bwd_allreduce: 4.96 | step: 45.67
- 52%|█████▏    | 3031/5800 [8:33:50<5:21:43,  6.97s/it]                                                       {'loss': 0.02, 'grad_norm': 3.7545082569122314, 'learning_rate': 1.9508652081672178e-05, 'epoch': 26.13}
- 52%|█████▏    | 3031/5800 [8:33:50<5:21:43,  6.97s/it]score1 tensor([[0.6133],
-        [0.5039],
-        [0.4922],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5508, 0.5195, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:27,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 17:20:27,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4608.28 | bwd_inner_microstep: 4602.35 | bwd_allreduce_microstep: 5.81 | step_microstep: 48.71
-[2025-01-25 17:20:27,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.09 | bwd: 4608.30 | bwd_inner: 4602.35 | bwd_allreduce: 5.87 | step: 48.71
- 52%|█████▏    | 3032/5800 [8:33:57<5:20:39,  6.95s/it]                                                       {'loss': 0.0234, 'grad_norm': 3.903940439224243, 'learning_rate': 1.9497487407896317e-05, 'epoch': 26.14}
- 52%|█████▏    | 3032/5800 [8:33:57<5:20:39,  6.95s/it]score1 tensor([[0.6211],
-        [0.3496],
-        [0.4336],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3398, 0.4531, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:34,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:20:34,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.61 | bwd_microstep: 4616.33 | bwd_inner_microstep: 4610.98 | bwd_allreduce_microstep: 5.25 | step_microstep: 43.75
-[2025-01-25 17:20:34,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.58 | bwd: 4616.36 | bwd_inner: 4610.98 | bwd_allreduce: 5.30 | step: 43.75
- 52%|█████▏    | 3033/5800 [8:34:04<5:19:55,  6.94s/it]                                                       {'loss': 0.02, 'grad_norm': 4.425839424133301, 'learning_rate': 1.948632289081261e-05, 'epoch': 26.15}
- 52%|█████▏    | 3033/5800 [8:34:04<5:19:55,  6.94s/it]score1 tensor([[0.4258],
-        [0.4004],
-        [0.5625],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4141, 0.5430, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:41,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 17:20:41,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.28 | bwd_microstep: 4607.22 | bwd_inner_microstep: 4601.85 | bwd_allreduce_microstep: 5.28 | step_microstep: 48.47
-[2025-01-25 17:20:41,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.23 | bwd: 4607.24 | bwd_inner: 4601.85 | bwd_allreduce: 5.33 | step: 48.48
- 52%|█████▏    | 3034/5800 [8:34:11<5:19:19,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 3.3962268829345703, 'learning_rate': 1.947515853390233e-05, 'epoch': 26.16}
- 52%|█████▏    | 3034/5800 [8:34:11<5:19:19,  6.93s/it]score1 tensor([[0.4844],
-        [0.5312],
-        [0.5352],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4941, 0.5391, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:48,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 17:20:48,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.90 | bwd_microstep: 4630.06 | bwd_inner_microstep: 4624.51 | bwd_allreduce_microstep: 5.44 | step_microstep: 48.34
-[2025-01-25 17:20:48,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.87 | bwd: 4630.09 | bwd_inner: 4624.51 | bwd_allreduce: 5.50 | step: 48.35
- 52%|█████▏    | 3035/5800 [8:34:18<5:19:03,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.3997008502483368, 'learning_rate': 1.9463994340646733e-05, 'epoch': 26.16}
- 52%|█████▏    | 3035/5800 [8:34:18<5:19:03,  6.92s/it]score1 tensor([[0.4121],
-        [0.5547],
-        [0.6055],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5312, 0.6016, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:20:54,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 17:20:54,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.37 | bwd_microstep: 4629.67 | bwd_inner_microstep: 4624.74 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.67
-[2025-01-25 17:20:54,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.33 | bwd: 4629.70 | bwd_inner: 4624.74 | bwd_allreduce: 4.89 | step: 43.67
- 52%|█████▏    | 3036/5800 [8:34:24<5:18:54,  6.92s/it]                                                       {'loss': 0.0088, 'grad_norm': 0.5751334428787231, 'learning_rate': 1.9452830314526996e-05, 'epoch': 26.17}
- 52%|█████▏    | 3036/5800 [8:34:24<5:18:54,  6.92s/it]score1 tensor([[0.5625],
-        [0.5195],
-        [0.4492],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5352, 0.4316, 0.6602], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:01,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 17:21:01,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.53 | bwd_microstep: 4645.20 | bwd_inner_microstep: 4635.80 | bwd_allreduce_microstep: 9.27 | step_microstep: 45.23
-[2025-01-25 17:21:01,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.49 | bwd: 4645.22 | bwd_inner: 4635.80 | bwd_allreduce: 9.33 | step: 45.23
- 52%|█████▏    | 3037/5800 [8:34:31<5:18:59,  6.93s/it]                                                       {'loss': 0.019, 'grad_norm': 4.493648052215576, 'learning_rate': 1.944166645902427e-05, 'epoch': 26.18}
- 52%|█████▏    | 3037/5800 [8:34:31<5:18:59,  6.93s/it]score1 tensor([[0.4863],
-        [0.4922],
-        [0.4258],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4980, 0.4297, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:08,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 17:21:08,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.03 | bwd_microstep: 4634.83 | bwd_inner_microstep: 4629.80 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.54
-[2025-01-25 17:21:08,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4634.86 | bwd_inner: 4629.79 | bwd_allreduce: 4.97 | step: 44.53
- 52%|█████▏    | 3038/5800 [8:34:38<5:18:52,  6.93s/it]                                                       {'loss': 0.0088, 'grad_norm': 3.9043071269989014, 'learning_rate': 1.943050277761963e-05, 'epoch': 26.19}
- 52%|█████▏    | 3038/5800 [8:34:38<5:18:52,  6.93s/it]score1 tensor([[0.3594],
-        [0.4902],
-        [0.5703],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.4941, 0.5820, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:15,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 17:21:15,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.98 | bwd_microstep: 4628.72 | bwd_inner_microstep: 4623.13 | bwd_allreduce_microstep: 5.48 | step_microstep: 49.76
-[2025-01-25 17:21:15,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.94 | bwd: 4628.74 | bwd_inner: 4623.13 | bwd_allreduce: 5.54 | step: 49.77
- 52%|█████▏    | 3039/5800 [8:34:45<5:18:47,  6.93s/it]                                                       {'loss': 0.0137, 'grad_norm': 3.602799892425537, 'learning_rate': 1.9419339273794092e-05, 'epoch': 26.2}
- 52%|█████▏    | 3039/5800 [8:34:45<5:18:47,  6.93s/it]score1 tensor([[0.5781],
-        [0.5469],
-        [0.5234],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5742, 0.5156, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:22,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 17:21:22,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.98 | bwd_microstep: 4629.36 | bwd_inner_microstep: 4624.03 | bwd_allreduce_microstep: 5.24 | step_microstep: 50.56
-[2025-01-25 17:21:22,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.95 | bwd: 4629.39 | bwd_inner: 4624.03 | bwd_allreduce: 5.29 | step: 50.57
- 52%|█████▏    | 3040/5800 [8:34:52<5:18:35,  6.93s/it]                                                       {'loss': 0.0249, 'grad_norm': 0.6278514266014099, 'learning_rate': 1.9408175951028657e-05, 'epoch': 26.21}
- 52%|█████▏    | 3040/5800 [8:34:52<5:18:35,  6.93s/it]score1 tensor([[0.6562],
-        [0.3926],
-        [0.5352],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3652, 0.5391, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0405, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:29,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 17:21:29,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.78 | bwd_microstep: 4633.02 | bwd_inner_microstep: 4628.32 | bwd_allreduce_microstep: 4.59 | step_microstep: 41.71
-[2025-01-25 17:21:29,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.72 | bwd: 4633.04 | bwd_inner: 4628.32 | bwd_allreduce: 4.66 | step: 41.72
- 52%|█████▏    | 3041/5800 [8:34:59<5:18:22,  6.92s/it]                                                       {'loss': 0.0405, 'grad_norm': 4.004605293273926, 'learning_rate': 1.9397012812804217e-05, 'epoch': 26.22}
- 52%|█████▏    | 3041/5800 [8:34:59<5:18:22,  6.92s/it]score1 tensor([[0.4844],
-        [0.6641],
-        [0.5430],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6406, 0.5547, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:36,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 17:21:36,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4633.50 | bwd_inner_microstep: 4628.65 | bwd_allreduce_microstep: 4.77 | step_microstep: 47.34
-[2025-01-25 17:21:36,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.60 | bwd: 4633.52 | bwd_inner: 4628.65 | bwd_allreduce: 4.80 | step: 47.35
- 52%|█████▏    | 3042/5800 [8:35:06<5:18:25,  6.93s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.152174949645996, 'learning_rate': 1.9385849862601657e-05, 'epoch': 26.22}
- 52%|█████▏    | 3042/5800 [8:35:06<5:18:25,  6.93s/it]score1 tensor([[0.4180],
-        [0.5117],
-        [0.4258],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.5078, 0.4609, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:43,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 17:21:43,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.83 | bwd_microstep: 4596.72 | bwd_inner_microstep: 4590.94 | bwd_allreduce_microstep: 5.69 | step_microstep: 49.38
-[2025-01-25 17:21:43,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.79 | bwd: 4596.75 | bwd_inner: 4590.94 | bwd_allreduce: 5.74 | step: 49.39
- 52%|█████▏    | 3043/5800 [8:35:13<5:17:49,  6.92s/it]                                                       {'loss': 0.0327, 'grad_norm': 2.0503461360931396, 'learning_rate': 1.9374687103901764e-05, 'epoch': 26.23}
- 52%|█████▏    | 3043/5800 [8:35:13<5:17:49,  6.92s/it]score1 tensor([[0.3691],
-        [0.5664],
-        [0.5352],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5664, 0.5391, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:50,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 17:21:50,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.84 | bwd_microstep: 4547.19 | bwd_inner_microstep: 4541.87 | bwd_allreduce_microstep: 5.22 | step_microstep: 44.99
-[2025-01-25 17:21:50,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.81 | bwd: 4547.21 | bwd_inner: 4541.87 | bwd_allreduce: 5.27 | step: 44.99
- 52%|█████▏    | 3044/5800 [8:35:20<5:16:37,  6.89s/it]                                                       {'loss': 0.0034, 'grad_norm': 3.8367257118225098, 'learning_rate': 1.9363524540185278e-05, 'epoch': 26.24}
- 52%|█████▏    | 3044/5800 [8:35:20<5:16:37,  6.89s/it]score1 tensor([[0.4648],
-        [0.5664],
-        [0.4395],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5781, 0.4219, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:21:57,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 17:21:57,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.77 | bwd_microstep: 4594.51 | bwd_inner_microstep: 4589.42 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.01
-[2025-01-25 17:21:57,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.72 | bwd: 4594.54 | bwd_inner: 4589.42 | bwd_allreduce: 5.05 | step: 44.02
- 52%|█████▎    | 3045/5800 [8:35:27<5:16:18,  6.89s/it]                                                       {'loss': 0.0122, 'grad_norm': 1.6931090354919434, 'learning_rate': 1.9352362174932887e-05, 'epoch': 26.25}
- 52%|█████▎    | 3045/5800 [8:35:27<5:16:18,  6.89s/it]score1 tensor([[0.4238],
-        [0.5938],
-        [0.4902],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.5781, 0.5039, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:04,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 17:22:04,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4633.00 | bwd_inner_microstep: 4627.78 | bwd_allreduce_microstep: 5.13 | step_microstep: 54.18
-[2025-01-25 17:22:04,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4633.02 | bwd_inner: 4627.78 | bwd_allreduce: 5.17 | step: 54.18
- 53%|█████▎    | 3046/5800 [8:35:34<5:16:45,  6.90s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.5455746650695801, 'learning_rate': 1.9341200011625202e-05, 'epoch': 26.26}
- 53%|█████▎    | 3046/5800 [8:35:34<5:16:45,  6.90s/it]score1 tensor([[0.4238],
-        [0.5117],
-        [0.4707],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4805, 0.4844, 0.3613], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:10,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 17:22:10,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.40 | bwd_microstep: 4641.75 | bwd_inner_microstep: 4636.15 | bwd_allreduce_microstep: 5.51 | step_microstep: 45.56
-[2025-01-25 17:22:10,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.37 | bwd: 4641.78 | bwd_inner: 4636.15 | bwd_allreduce: 5.55 | step: 45.56
- 53%|█████▎    | 3047/5800 [8:35:40<5:17:04,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 3.7413477897644043, 'learning_rate': 1.9330038053742794e-05, 'epoch': 26.27}
- 53%|█████▎    | 3047/5800 [8:35:40<5:17:04,  6.91s/it]score1 tensor([[0.5547],
-        [0.4551],
-        [0.4727],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4570, 0.4473, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:17,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 17:22:17,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4633.11 | bwd_inner_microstep: 4628.09 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.26
-[2025-01-25 17:22:17,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.67 | bwd: 4633.13 | bwd_inner: 4628.09 | bwd_allreduce: 4.98 | step: 45.28
- 53%|█████▎    | 3048/5800 [8:35:47<5:17:08,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.108235836029053, 'learning_rate': 1.9318876304766134e-05, 'epoch': 26.28}
- 53%|█████▎    | 3048/5800 [8:35:47<5:17:08,  6.91s/it]score1 tensor([[0.6172],
-        [0.4551],
-        [0.5430],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4863, 0.5469, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:24,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 17:22:24,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.18 | bwd_microstep: 4631.95 | bwd_inner_microstep: 4626.46 | bwd_allreduce_microstep: 5.41 | step_microstep: 50.61
-[2025-01-25 17:22:24,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.12 | bwd: 4631.98 | bwd_inner: 4626.46 | bwd_allreduce: 5.45 | step: 50.61
- 53%|█████▎    | 3049/5800 [8:35:54<5:17:11,  6.92s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.602235794067383, 'learning_rate': 1.9307714768175667e-05, 'epoch': 26.28}
- 53%|█████▎    | 3049/5800 [8:35:54<5:17:11,  6.92s/it]score1 tensor([[0.4941],
-        [0.4473],
-        [0.4766],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4434, 0.4785, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:31,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 17:22:31,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.08 | bwd_microstep: 4639.31 | bwd_inner_microstep: 4633.99 | bwd_allreduce_microstep: 5.24 | step_microstep: 44.50
-[2025-01-25 17:22:31,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.03 | bwd: 4639.34 | bwd_inner: 4633.99 | bwd_allreduce: 5.28 | step: 44.51
- 53%|█████▎    | 3050/5800 [8:36:01<5:17:17,  6.92s/it]                                                       {'loss': 0.0078, 'grad_norm': 0.7121790647506714, 'learning_rate': 1.929655344745175e-05, 'epoch': 26.29}
- 53%|█████▎    | 3050/5800 [8:36:01<5:17:17,  6.92s/it]score1 tensor([[0.6055],
-        [0.4277],
-        [0.4727],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4219, 0.4512, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:38,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 17:22:38,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.39 | bwd_microstep: 4633.12 | bwd_inner_microstep: 4627.88 | bwd_allreduce_microstep: 5.15 | step_microstep: 45.12
-[2025-01-25 17:22:38,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.36 | bwd: 4633.15 | bwd_inner: 4627.88 | bwd_allreduce: 5.20 | step: 45.13
- 53%|█████▎    | 3051/5800 [8:36:08<5:17:16,  6.92s/it]                                                       {'loss': 0.0225, 'grad_norm': 1.1083831787109375, 'learning_rate': 1.928539234607467e-05, 'epoch': 26.3}
- 53%|█████▎    | 3051/5800 [8:36:08<5:17:16,  6.92s/it]score1 tensor([[0.5625],
-        [0.5938],
-        [0.3555],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6055, 0.3516, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:45,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 17:22:45,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.05 | bwd_microstep: 4637.40 | bwd_inner_microstep: 4632.21 | bwd_allreduce_microstep: 5.11 | step_microstep: 54.96
-[2025-01-25 17:22:45,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.00 | bwd: 4637.42 | bwd_inner: 4632.21 | bwd_allreduce: 5.15 | step: 54.97
- 53%|█████▎    | 3052/5800 [8:36:15<5:17:19,  6.93s/it]                                                       {'loss': 0.0098, 'grad_norm': 4.98502254486084, 'learning_rate': 1.927423146752466e-05, 'epoch': 26.31}
- 53%|█████▎    | 3052/5800 [8:36:15<5:17:19,  6.93s/it]score1 tensor([[0.6055],
-        [0.4590],
-        [0.4648],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4902, 0.4766, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:52,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 17:22:52,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.28 | bwd_microstep: 4634.22 | bwd_inner_microstep: 4628.84 | bwd_allreduce_microstep: 5.27 | step_microstep: 52.07
-[2025-01-25 17:22:52,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.24 | bwd: 4634.25 | bwd_inner: 4628.84 | bwd_allreduce: 5.33 | step: 52.03
- 53%|█████▎    | 3053/5800 [8:36:22<5:17:09,  6.93s/it]                                                       {'loss': 0.0254, 'grad_norm': 8.34235668182373, 'learning_rate': 1.9263070815281863e-05, 'epoch': 26.32}
- 53%|█████▎    | 3053/5800 [8:36:22<5:17:09,  6.93s/it]score1 tensor([[0.3652],
-        [0.5898],
-        [0.7266],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.5352, 0.7031, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:22:59,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 17:22:59,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.72 | bwd_microstep: 4631.72 | bwd_inner_microstep: 4626.24 | bwd_allreduce_microstep: 5.35 | step_microstep: 50.14
-[2025-01-25 17:22:59,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.68 | bwd: 4631.75 | bwd_inner: 4626.24 | bwd_allreduce: 5.42 | step: 50.15
- 53%|█████▎    | 3054/5800 [8:36:29<5:16:57,  6.93s/it]                                                       {'loss': 0.0273, 'grad_norm': 5.440890312194824, 'learning_rate': 1.925191039282638e-05, 'epoch': 26.33}
- 53%|█████▎    | 3054/5800 [8:36:29<5:16:57,  6.93s/it]score1 tensor([[0.5742],
-        [0.3789],
-        [0.5234],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4180, 0.5000, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:06,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 17:23:06,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.30 | bwd_microstep: 4635.91 | bwd_inner_microstep: 4630.25 | bwd_allreduce_microstep: 5.55 | step_microstep: 47.82
-[2025-01-25 17:23:06,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.26 | bwd: 4635.94 | bwd_inner: 4630.25 | bwd_allreduce: 5.61 | step: 47.83
- 53%|█████▎    | 3055/5800 [8:36:36<5:16:50,  6.93s/it]                                                       {'loss': 0.041, 'grad_norm': 0.7658831477165222, 'learning_rate': 1.9240750203638213e-05, 'epoch': 26.34}
- 53%|█████▎    | 3055/5800 [8:36:36<5:16:50,  6.93s/it]score1 tensor([[0.6172],
-        [0.5312],
-        [0.3145],
-        [0.3516]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5234, 0.3086, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:13,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 17:23:13,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4648.17 | bwd_inner_microstep: 4641.86 | bwd_allreduce_microstep: 6.22 | step_microstep: 49.56
-[2025-01-25 17:23:13,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4648.19 | bwd_inner: 4641.86 | bwd_allreduce: 6.27 | step: 49.57
- 53%|█████▎    | 3056/5800 [8:36:43<5:16:57,  6.93s/it]                                                       {'loss': 0.0107, 'grad_norm': 3.2078492641448975, 'learning_rate': 1.9229590251197313e-05, 'epoch': 26.34}
- 53%|█████▎    | 3056/5800 [8:36:43<5:16:57,  6.93s/it]score1 tensor([[0.4844],
-        [0.6367],
-        [0.4883],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6211, 0.4863, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:20,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 17:23:20,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.99 | bwd_microstep: 4638.00 | bwd_inner_microstep: 4632.80 | bwd_allreduce_microstep: 5.11 | step_microstep: 48.62
-[2025-01-25 17:23:20,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.95 | bwd: 4638.04 | bwd_inner: 4632.80 | bwd_allreduce: 5.15 | step: 48.63
- 53%|█████▎    | 3057/5800 [8:36:50<5:16:49,  6.93s/it]                                                       {'loss': 0.02, 'grad_norm': 4.079143524169922, 'learning_rate': 1.9218430538983545e-05, 'epoch': 26.35}
- 53%|█████▎    | 3057/5800 [8:36:50<5:16:49,  6.93s/it]score1 tensor([[0.4688],
-        [0.4082],
-        [0.5508],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4141, 0.5703, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:27,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 17:23:27,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.00 | bwd_microstep: 4634.64 | bwd_inner_microstep: 4628.74 | bwd_allreduce_microstep: 5.76 | step_microstep: 49.95
-[2025-01-25 17:23:27,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.93 | bwd: 4634.67 | bwd_inner: 4628.74 | bwd_allreduce: 5.83 | step: 49.97
- 53%|█████▎    | 3058/5800 [8:36:57<5:16:43,  6.93s/it]                                                       {'loss': 0.0117, 'grad_norm': 4.031035423278809, 'learning_rate': 1.920727107047668e-05, 'epoch': 26.36}
- 53%|█████▎    | 3058/5800 [8:36:57<5:16:43,  6.93s/it]score1 tensor([[0.4277],
-        [0.3809],
-        [0.4805],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.3750, 0.4648, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:34,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 17:23:34,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.56 | bwd_microstep: 4641.43 | bwd_inner_microstep: 4635.56 | bwd_allreduce_microstep: 5.75 | step_microstep: 50.22
-[2025-01-25 17:23:34,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.52 | bwd: 4641.46 | bwd_inner: 4635.56 | bwd_allreduce: 5.82 | step: 50.23
- 53%|█████▎    | 3059/5800 [8:37:04<5:16:41,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.6407144665718079, 'learning_rate': 1.9196111849156463e-05, 'epoch': 26.37}
- 53%|█████▎    | 3059/5800 [8:37:04<5:16:41,  6.93s/it]score1 tensor([[0.6562],
-        [0.5273],
-        [0.2988],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.5078, 0.3457, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:41,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:23:41,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4641.76 | bwd_inner_microstep: 4636.15 | bwd_allreduce_microstep: 5.52 | step_microstep: 46.63
-[2025-01-25 17:23:41,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.20 | bwd: 4641.78 | bwd_inner: 4636.15 | bwd_allreduce: 5.57 | step: 46.64
- 53%|█████▎    | 3060/5800 [8:37:11<5:16:38,  6.93s/it]                                                       {'loss': 0.02, 'grad_norm': 4.649057865142822, 'learning_rate': 1.918495287850251e-05, 'epoch': 26.38}
- 53%|█████▎    | 3060/5800 [8:37:11<5:16:38,  6.93s/it]score1 tensor([[0.6250],
-        [0.3926],
-        [0.4375],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4062, 0.4512, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:48,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 17:23:48,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.16 | bwd_microstep: 4644.63 | bwd_inner_microstep: 4639.10 | bwd_allreduce_microstep: 5.44 | step_microstep: 50.15
-[2025-01-25 17:23:48,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.12 | bwd: 4644.66 | bwd_inner: 4639.10 | bwd_allreduce: 5.49 | step: 50.16
- 53%|█████▎    | 3061/5800 [8:37:18<5:16:38,  6.94s/it]                                                       {'loss': 0.0176, 'grad_norm': 1.1418448686599731, 'learning_rate': 1.91737941619944e-05, 'epoch': 26.39}
- 53%|█████▎    | 3061/5800 [8:37:18<5:16:38,  6.94s/it]score1 tensor([[0.5586],
-        [0.6172],
-        [0.3828],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6133, 0.3906, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:23:54,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 17:23:54,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4644.18 | bwd_inner_microstep: 4638.25 | bwd_allreduce_microstep: 5.81 | step_microstep: 47.33
-[2025-01-25 17:23:54,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.72 | bwd: 4644.20 | bwd_inner: 4638.25 | bwd_allreduce: 5.88 | step: 47.35
- 53%|█████▎    | 3062/5800 [8:37:24<5:16:30,  6.94s/it]                                                       {'loss': 0.0103, 'grad_norm': 1.0670692920684814, 'learning_rate': 1.9162635703111608e-05, 'epoch': 26.4}
- 53%|█████▎    | 3062/5800 [8:37:24<5:16:30,  6.94s/it]score1 tensor([[0.6367],
-        [0.5703],
-        [0.5039],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5820, 0.5312, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:01,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.20 | optimizer_step: 4.37
-[2025-01-25 17:24:01,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.49 | bwd_microstep: 4634.75 | bwd_inner_microstep: 4629.22 | bwd_allreduce_microstep: 5.44 | step_microstep: 51.69
-[2025-01-25 17:24:01,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.44 | bwd: 4634.77 | bwd_inner: 4629.22 | bwd_allreduce: 5.49 | step: 51.70
- 53%|█████▎    | 3063/5800 [8:37:31<5:16:27,  6.94s/it]                                                       {'loss': 0.0146, 'grad_norm': 4.3014020919799805, 'learning_rate': 1.9151477505333526e-05, 'epoch': 26.41}
- 53%|█████▎    | 3063/5800 [8:37:31<5:16:27,  6.94s/it]score1 tensor([[0.4746],
-        [0.5273],
-        [0.6250],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5703, 0.6445, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:08,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 17:24:08,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.18 | bwd_microstep: 4634.71 | bwd_inner_microstep: 4629.25 | bwd_allreduce_microstep: 5.35 | step_microstep: 47.09
-[2025-01-25 17:24:08,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.14 | bwd: 4634.75 | bwd_inner: 4629.25 | bwd_allreduce: 5.40 | step: 47.09
- 53%|█████▎    | 3064/5800 [8:37:38<5:16:09,  6.93s/it]                                                       {'loss': 0.0239, 'grad_norm': 8.237204551696777, 'learning_rate': 1.9140319572139482e-05, 'epoch': 26.41}
- 53%|█████▎    | 3064/5800 [8:37:38<5:16:09,  6.93s/it]score1 tensor([[0.4824],
-        [0.4082],
-        [0.5195],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.3945, 0.5469, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:15,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 17:24:15,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.97 | bwd_microstep: 4634.90 | bwd_inner_microstep: 4629.18 | bwd_allreduce_microstep: 5.59 | step_microstep: 54.23
-[2025-01-25 17:24:15,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4634.95 | bwd_inner: 4629.18 | bwd_allreduce: 5.66 | step: 54.24
- 53%|█████▎    | 3065/5800 [8:37:45<5:15:57,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.4587191641330719, 'learning_rate': 1.9129161907008714e-05, 'epoch': 26.42}
- 53%|█████▎    | 3065/5800 [8:37:45<5:15:57,  6.93s/it]score1 tensor([[0.4258],
-        [0.4434],
-        [0.5703],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4512, 0.5898, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:22,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 17:24:22,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.33 | bwd_microstep: 4637.41 | bwd_inner_microstep: 4631.40 | bwd_allreduce_microstep: 5.91 | step_microstep: 50.47
-[2025-01-25 17:24:22,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.28 | bwd: 4637.43 | bwd_inner: 4631.40 | bwd_allreduce: 5.96 | step: 50.48
- 53%|█████▎    | 3066/5800 [8:37:52<5:16:06,  6.94s/it]                                                       {'loss': 0.0107, 'grad_norm': 8.117746353149414, 'learning_rate': 1.9118004513420374e-05, 'epoch': 26.43}
- 53%|█████▎    | 3066/5800 [8:37:52<5:16:06,  6.94s/it]score1 tensor([[0.5977],
-        [0.5117],
-        [0.4238],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4961, 0.4121, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:29,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:24:29,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.99 | bwd_microstep: 4642.35 | bwd_inner_microstep: 4636.76 | bwd_allreduce_microstep: 5.47 | step_microstep: 48.60
-[2025-01-25 17:24:29,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.95 | bwd: 4642.37 | bwd_inner: 4636.76 | bwd_allreduce: 5.53 | step: 48.61
- 53%|█████▎    | 3067/5800 [8:37:59<5:15:55,  6.94s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.873887836933136, 'learning_rate': 1.910684739485353e-05, 'epoch': 26.44}
- 53%|█████▎    | 3067/5800 [8:37:59<5:15:55,  6.94s/it]score1 tensor([[0.5352],
-        [0.6484],
-        [0.4863],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6484, 0.4766, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:36,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 17:24:36,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.65 | bwd_microstep: 4586.81 | bwd_inner_microstep: 4581.25 | bwd_allreduce_microstep: 5.43 | step_microstep: 49.57
-[2025-01-25 17:24:36,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.61 | bwd: 4586.83 | bwd_inner: 4581.25 | bwd_allreduce: 5.51 | step: 49.58
- 53%|█████▎    | 3068/5800 [8:38:06<5:15:07,  6.92s/it]                                                       {'loss': 0.0078, 'grad_norm': 2.0437796115875244, 'learning_rate': 1.9095690554787167e-05, 'epoch': 26.45}
- 53%|█████▎    | 3068/5800 [8:38:06<5:15:07,  6.92s/it]score1 tensor([[0.5312],
-        [0.3867],
-        [0.5078],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4023, 0.5078, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:43,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 17:24:43,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.22 | bwd_microstep: 4584.82 | bwd_inner_microstep: 4580.04 | bwd_allreduce_microstep: 4.69 | step_microstep: 47.80
-[2025-01-25 17:24:43,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.19 | bwd: 4584.84 | bwd_inner: 4580.04 | bwd_allreduce: 4.74 | step: 47.80
- 53%|█████▎    | 3069/5800 [8:38:13<5:14:24,  6.91s/it]                                                       {'loss': 0.0083, 'grad_norm': 2.074857473373413, 'learning_rate': 1.908453399670018e-05, 'epoch': 26.46}
- 53%|█████▎    | 3069/5800 [8:38:13<5:14:24,  6.91s/it]score1 tensor([[0.4570],
-        [0.4453],
-        [0.4941],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4277, 0.4688, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:50,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 17:24:50,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4635.49 | bwd_inner_microstep: 4630.03 | bwd_allreduce_microstep: 5.35 | step_microstep: 46.56
-[2025-01-25 17:24:50,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4635.51 | bwd_inner: 4630.03 | bwd_allreduce: 5.41 | step: 46.58
- 53%|█████▎    | 3070/5800 [8:38:20<5:14:33,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 7.703393936157227, 'learning_rate': 1.9073377724071373e-05, 'epoch': 26.47}
- 53%|█████▎    | 3070/5800 [8:38:20<5:14:33,  6.91s/it]score1 tensor([[0.4629],
-        [0.4766],
-        [0.5117],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4785, 0.4844, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:24:57,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:24:57,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.79 | bwd_microstep: 4641.14 | bwd_inner_microstep: 4635.84 | bwd_allreduce_microstep: 5.22 | step_microstep: 48.31
-[2025-01-25 17:24:57,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.75 | bwd: 4641.16 | bwd_inner: 4635.84 | bwd_allreduce: 5.26 | step: 48.32
- 53%|█████▎    | 3071/5800 [8:38:27<5:14:47,  6.92s/it]                                                       {'loss': 0.0151, 'grad_norm': 4.036501407623291, 'learning_rate': 1.9062221740379477e-05, 'epoch': 26.47}
- 53%|█████▎    | 3071/5800 [8:38:27<5:14:47,  6.92s/it]score1 tensor([[0.5977],
-        [0.4727],
-        [0.5391],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4512, 0.5273, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:04,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 17:25:04,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.21 | bwd_microstep: 4539.66 | bwd_inner_microstep: 4533.53 | bwd_allreduce_microstep: 6.02 | step_microstep: 48.26
-[2025-01-25 17:25:04,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.15 | bwd: 4539.69 | bwd_inner: 4533.53 | bwd_allreduce: 6.07 | step: 48.26
- 53%|█████▎    | 3072/5800 [8:38:34<5:13:25,  6.89s/it]                                                       {'loss': 0.0083, 'grad_norm': 4.071112155914307, 'learning_rate': 1.9051066049103105e-05, 'epoch': 26.48}
- 53%|█████▎    | 3072/5800 [8:38:34<5:13:25,  6.89s/it]score1 tensor([[0.4805],
-        [0.6328],
-        [0.3672],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.6133, 0.4043, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:11,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 17:25:11,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4639.75 | bwd_inner_microstep: 4634.58 | bwd_allreduce_microstep: 5.09 | step_microstep: 46.38
-[2025-01-25 17:25:11,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4639.78 | bwd_inner: 4634.58 | bwd_allreduce: 5.12 | step: 46.38
- 53%|█████▎    | 3073/5800 [8:38:41<5:13:50,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 1.011380672454834, 'learning_rate': 1.9039910653720816e-05, 'epoch': 26.49}
- 53%|█████▎    | 3073/5800 [8:38:41<5:13:50,  6.91s/it]score1 tensor([[0.4180],
-        [0.3887],
-        [0.4355],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.3750, 0.4375, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:17,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 17:25:17,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.18 | bwd_microstep: 4632.51 | bwd_inner_microstep: 4626.85 | bwd_allreduce_microstep: 5.55 | step_microstep: 45.25
-[2025-01-25 17:25:17,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.15 | bwd: 4632.53 | bwd_inner: 4626.85 | bwd_allreduce: 5.61 | step: 45.25
- 53%|█████▎    | 3074/5800 [8:38:47<5:13:54,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 3.724846124649048, 'learning_rate': 1.9028755557711043e-05, 'epoch': 26.5}
- 53%|█████▎    | 3074/5800 [8:38:47<5:13:54,  6.91s/it]score1 tensor([[0.5312],
-        [0.1494],
-        [0.6367],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.1787, 0.6211, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:24,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 17:25:24,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.16 | bwd_microstep: 4631.11 | bwd_inner_microstep: 4625.88 | bwd_allreduce_microstep: 5.14 | step_microstep: 43.54
-[2025-01-25 17:25:24,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.13 | bwd: 4631.13 | bwd_inner: 4625.88 | bwd_allreduce: 5.18 | step: 43.55
- 53%|█████▎    | 3075/5800 [8:38:54<5:14:01,  6.91s/it]                                                       {'loss': 0.0264, 'grad_norm': 5.683568954467773, 'learning_rate': 1.901760076455213e-05, 'epoch': 26.51}
- 53%|█████▎    | 3075/5800 [8:38:54<5:14:01,  6.91s/it]score1 tensor([[0.5469],
-        [0.6211],
-        [0.3633],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6484, 0.3691, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:31,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 17:25:31,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.01 | bwd_microstep: 4642.76 | bwd_inner_microstep: 4637.39 | bwd_allreduce_microstep: 5.29 | step_microstep: 52.66
-[2025-01-25 17:25:31,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.97 | bwd: 4642.79 | bwd_inner: 4637.39 | bwd_allreduce: 5.33 | step: 52.67
- 53%|█████▎    | 3076/5800 [8:39:01<5:14:24,  6.93s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.40725326538086, 'learning_rate': 1.9006446277722355e-05, 'epoch': 26.52}
- 53%|█████▎    | 3076/5800 [8:39:01<5:14:24,  6.93s/it]score1 tensor([[0.5312],
-        [0.4961],
-        [0.4297],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4629, 0.4375, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:38,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 17:25:38,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.23 | bwd_microstep: 4642.29 | bwd_inner_microstep: 4636.76 | bwd_allreduce_microstep: 5.40 | step_microstep: 48.72
-[2025-01-25 17:25:38,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.20 | bwd: 4642.31 | bwd_inner: 4636.76 | bwd_allreduce: 5.47 | step: 48.72
- 53%|█████▎    | 3077/5800 [8:39:08<5:14:29,  6.93s/it]                                                       {'loss': 0.0195, 'grad_norm': 3.8081917762756348, 'learning_rate': 1.8995292100699857e-05, 'epoch': 26.53}
- 53%|█████▎    | 3077/5800 [8:39:08<5:14:29,  6.93s/it]score1 tensor([[0.5742],
-        [0.4492],
-        [0.4688],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4414, 0.4980, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:45,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 17:25:45,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.60 | bwd_microstep: 4643.37 | bwd_inner_microstep: 4637.95 | bwd_allreduce_microstep: 5.33 | step_microstep: 46.16
-[2025-01-25 17:25:45,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.56 | bwd: 4643.40 | bwd_inner: 4637.95 | bwd_allreduce: 5.37 | step: 46.17
- 53%|█████▎    | 3078/5800 [8:39:15<5:14:29,  6.93s/it]                                                       {'loss': 0.0195, 'grad_norm': 3.96366810798645, 'learning_rate': 1.898413823696273e-05, 'epoch': 26.53}
- 53%|█████▎    | 3078/5800 [8:39:15<5:14:29,  6.93s/it]score1 tensor([[0.5234],
-        [0.4297],
-        [0.4355],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4336, 0.4551, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:52,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:25:52,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.97 | bwd_microstep: 4642.30 | bwd_inner_microstep: 4636.35 | bwd_allreduce_microstep: 5.87 | step_microstep: 47.76
-[2025-01-25 17:25:52,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.93 | bwd: 4642.32 | bwd_inner: 4636.35 | bwd_allreduce: 5.91 | step: 47.77
- 53%|█████▎    | 3079/5800 [8:39:22<5:14:23,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 7.60098123550415, 'learning_rate': 1.8972984689988914e-05, 'epoch': 26.54}
- 53%|█████▎    | 3079/5800 [8:39:22<5:14:23,  6.93s/it]score1 tensor([[0.5547],
-        [0.5078],
-        [0.4922],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4941, 0.5039, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:25:59,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:25:59,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.12 | bwd_microstep: 4642.09 | bwd_inner_microstep: 4636.56 | bwd_allreduce_microstep: 5.44 | step_microstep: 45.29
-[2025-01-25 17:25:59,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.09 | bwd: 4642.12 | bwd_inner: 4636.56 | bwd_allreduce: 5.49 | step: 45.30
- 53%|█████▎    | 3080/5800 [8:39:29<5:14:20,  6.93s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.5630002617835999, 'learning_rate': 1.8961831463256305e-05, 'epoch': 26.55}
- 53%|█████▎    | 3080/5800 [8:39:29<5:14:20,  6.93s/it]score1 tensor([[0.3555],
-        [0.4453],
-        [0.4629],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4492, 0.4941, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:06,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 17:26:06,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.70 | bwd_microstep: 4633.80 | bwd_inner_microstep: 4628.74 | bwd_allreduce_microstep: 4.95 | step_microstep: 50.81
-[2025-01-25 17:26:06,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.66 | bwd: 4633.82 | bwd_inner: 4628.74 | bwd_allreduce: 5.00 | step: 50.82
- 53%|█████▎    | 3081/5800 [8:39:36<5:14:09,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 7.61208438873291, 'learning_rate': 1.8950678560242653e-05, 'epoch': 26.56}
- 53%|█████▎    | 3081/5800 [8:39:36<5:14:09,  6.93s/it]score1 tensor([[0.4062],
-        [0.4766],
-        [0.5820],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4648, 0.5781, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:13,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.24 | optimizer_step: 4.36
-[2025-01-25 17:26:13,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.89 | bwd_microstep: 4632.53 | bwd_inner_microstep: 4628.36 | bwd_allreduce_microstep: 4.10 | step_microstep: 37.78
-[2025-01-25 17:26:13,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.84 | bwd: 4632.55 | bwd_inner: 4628.36 | bwd_allreduce: 4.14 | step: 37.78
- 53%|█████▎    | 3082/5800 [8:39:43<5:13:44,  6.93s/it]                                                       {'loss': 0.0078, 'grad_norm': 4.268441200256348, 'learning_rate': 1.893952598442563e-05, 'epoch': 26.57}
- 53%|█████▎    | 3082/5800 [8:39:43<5:13:44,  6.93s/it]score1 tensor([[0.6055],
-        [0.5625],
-        [0.4453],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5977, 0.4609, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:20,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 17:26:20,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4633.31 | bwd_inner_microstep: 4628.19 | bwd_allreduce_microstep: 5.00 | step_microstep: 40.48
-[2025-01-25 17:26:20,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.97 | bwd: 4633.34 | bwd_inner: 4628.19 | bwd_allreduce: 5.06 | step: 40.49
- 53%|█████▎    | 3083/5800 [8:39:50<5:13:28,  6.92s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.308757781982422, 'learning_rate': 1.892837373928281e-05, 'epoch': 26.58}
- 53%|█████▎    | 3083/5800 [8:39:50<5:13:28,  6.92s/it]score1 tensor([[0.5039],
-        [0.4082],
-        [0.5859],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4043, 0.6094, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:27,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 17:26:27,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4587.02 | bwd_inner_microstep: 4581.45 | bwd_allreduce_microstep: 5.44 | step_microstep: 44.34
-[2025-01-25 17:26:27,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4587.05 | bwd_inner: 4581.45 | bwd_allreduce: 5.51 | step: 44.34
- 53%|█████▎    | 3084/5800 [8:39:57<5:12:52,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 2.5010194778442383, 'learning_rate': 1.8917221828291652e-05, 'epoch': 26.59}
- 53%|█████▎    | 3084/5800 [8:39:57<5:12:52,  6.91s/it]score1 tensor([[0.5000],
-        [0.4922],
-        [0.5977],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4902, 0.5664, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:34,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 17:26:34,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.63 | bwd_microstep: 4640.12 | bwd_inner_microstep: 4634.48 | bwd_allreduce_microstep: 5.52 | step_microstep: 49.22
-[2025-01-25 17:26:34,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.60 | bwd: 4640.15 | bwd_inner: 4634.48 | bwd_allreduce: 5.58 | step: 49.22
- 53%|█████▎    | 3085/5800 [8:40:04<5:13:11,  6.92s/it]                                                       {'loss': 0.0127, 'grad_norm': 4.295276641845703, 'learning_rate': 1.8906070254929517e-05, 'epoch': 26.59}
- 53%|█████▎    | 3085/5800 [8:40:04<5:13:11,  6.92s/it]score1 tensor([[0.5156],
-        [0.4297],
-        [0.6367],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3984, 0.6211, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:41,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:26:41,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.28 | bwd_microstep: 4636.27 | bwd_inner_microstep: 4630.94 | bwd_allreduce_microstep: 5.25 | step_microstep: 48.61
-[2025-01-25 17:26:41,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.23 | bwd: 4636.29 | bwd_inner: 4630.94 | bwd_allreduce: 5.29 | step: 48.62
- 53%|█████▎    | 3086/5800 [8:40:11<5:13:14,  6.92s/it]                                                       {'loss': 0.0156, 'grad_norm': 4.10537052154541, 'learning_rate': 1.8894919022673655e-05, 'epoch': 26.6}
- 53%|█████▎    | 3086/5800 [8:40:11<5:13:14,  6.92s/it]score1 tensor([[0.5234],
-        [0.5195],
-        [0.4668],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5273, 0.4395, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:48,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 17:26:48,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.34 | bwd_microstep: 4634.35 | bwd_inner_microstep: 4628.49 | bwd_allreduce_microstep: 5.75 | step_microstep: 47.08
-[2025-01-25 17:26:48,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.29 | bwd: 4634.38 | bwd_inner: 4628.49 | bwd_allreduce: 5.82 | step: 47.11
- 53%|█████▎    | 3087/5800 [8:40:17<5:13:08,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 4.212502956390381, 'learning_rate': 1.888376813500122e-05, 'epoch': 26.61}
- 53%|█████▎    | 3087/5800 [8:40:18<5:13:08,  6.93s/it]score1 tensor([[0.5625],
-        [0.4727],
-        [0.3848],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4551, 0.3809, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:26:54,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.16 | optimizer_step: 4.37
-[2025-01-25 17:26:54,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.22 | bwd_microstep: 4637.60 | bwd_inner_microstep: 4632.43 | bwd_allreduce_microstep: 5.09 | step_microstep: 42.10
-[2025-01-25 17:26:54,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.18 | bwd: 4637.62 | bwd_inner: 4632.43 | bwd_allreduce: 5.13 | step: 42.10
- 53%|█████▎    | 3088/5800 [8:40:24<5:12:58,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 7.79277229309082, 'learning_rate': 1.8872617595389246e-05, 'epoch': 26.62}
- 53%|█████▎    | 3088/5800 [8:40:24<5:12:58,  6.92s/it]score1 tensor([[0.6680],
-        [0.4980],
-        [0.5742],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4688, 0.5781, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:01,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 17:27:01,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.87 | bwd_microstep: 4635.61 | bwd_inner_microstep: 4630.34 | bwd_allreduce_microstep: 5.17 | step_microstep: 44.32
-[2025-01-25 17:27:01,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.83 | bwd: 4635.63 | bwd_inner: 4630.33 | bwd_allreduce: 5.22 | step: 44.32
- 53%|█████▎    | 3089/5800 [8:40:31<5:12:56,  6.93s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.74575138092041, 'learning_rate': 1.8861467407314663e-05, 'epoch': 26.63}
- 53%|█████▎    | 3089/5800 [8:40:31<5:12:56,  6.93s/it]score1 tensor([[0.5703],
-        [0.5352],
-        [0.4883],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5156, 0.4453, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:08,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 17:27:08,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.37 | bwd_microstep: 4639.87 | bwd_inner_microstep: 4634.27 | bwd_allreduce_microstep: 5.50 | step_microstep: 47.69
-[2025-01-25 17:27:08,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.33 | bwd: 4639.90 | bwd_inner: 4634.27 | bwd_allreduce: 5.55 | step: 47.69
- 53%|█████▎    | 3090/5800 [8:40:38<5:13:01,  6.93s/it]                                                       {'loss': 0.0234, 'grad_norm': 4.01207160949707, 'learning_rate': 1.885031757425431e-05, 'epoch': 26.64}
- 53%|█████▎    | 3090/5800 [8:40:38<5:13:01,  6.93s/it]score1 tensor([[0.6016],
-        [0.4590],
-        [0.6211],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4590, 0.6289, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:15,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 17:27:15,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.45 | bwd_microstep: 4547.55 | bwd_inner_microstep: 4542.62 | bwd_allreduce_microstep: 4.83 | step_microstep: 41.81
-[2025-01-25 17:27:15,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.40 | bwd: 4547.57 | bwd_inner: 4542.62 | bwd_allreduce: 4.88 | step: 41.82
- 53%|█████▎    | 3091/5800 [8:40:45<5:11:50,  6.91s/it]                                                       {'loss': 0.0117, 'grad_norm': 0.3333401381969452, 'learning_rate': 1.883916809968487e-05, 'epoch': 26.65}
- 53%|█████▎    | 3091/5800 [8:40:45<5:11:50,  6.91s/it]score1 tensor([[0.3867],
-        [0.7148],
-        [0.6875],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.6797, 0.6172, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:22,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 17:27:22,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.51 | bwd_microstep: 4640.71 | bwd_inner_microstep: 4635.74 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.47
-[2025-01-25 17:27:22,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.48 | bwd: 4640.73 | bwd_inner: 4635.74 | bwd_allreduce: 4.93 | step: 42.48
- 53%|█████▎    | 3092/5800 [8:40:52<5:12:06,  6.92s/it]                                                       {'loss': 0.0322, 'grad_norm': 4.714727401733398, 'learning_rate': 1.8828018987082973e-05, 'epoch': 26.66}
- 53%|█████▎    | 3092/5800 [8:40:52<5:12:06,  6.92s/it]score1 tensor([[0.4258],
-        [0.4062],
-        [0.5352],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4023, 0.5469, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:29,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 17:27:29,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.05 | bwd_microstep: 4635.50 | bwd_inner_microstep: 4630.34 | bwd_allreduce_microstep: 5.07 | step_microstep: 47.46
-[2025-01-25 17:27:29,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.01 | bwd: 4635.52 | bwd_inner: 4630.34 | bwd_allreduce: 5.11 | step: 47.47
- 53%|█████▎    | 3093/5800 [8:40:59<5:12:09,  6.92s/it]                                                       {'loss': 0.0117, 'grad_norm': 3.387941837310791, 'learning_rate': 1.881687023992509e-05, 'epoch': 26.66}
- 53%|█████▎    | 3093/5800 [8:40:59<5:12:09,  6.92s/it]score1 tensor([[0.6992],
-        [0.5625],
-        [0.5273],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5664, 0.5469, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:36,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.36
-[2025-01-25 17:27:36,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.82 | bwd_microstep: 4595.76 | bwd_inner_microstep: 4590.67 | bwd_allreduce_microstep: 4.98 | step_microstep: 47.51
-[2025-01-25 17:27:36,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.78 | bwd: 4595.79 | bwd_inner: 4590.67 | bwd_allreduce: 5.04 | step: 47.52
- 53%|█████▎    | 3094/5800 [8:41:06<5:11:40,  6.91s/it]                                                       {'loss': 0.0098, 'grad_norm': 1.8626009225845337, 'learning_rate': 1.880572186168759e-05, 'epoch': 26.67}
- 53%|█████▎    | 3094/5800 [8:41:06<5:11:40,  6.91s/it]score1 tensor([[0.5078],
-        [0.4863],
-        [0.3770],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5156, 0.3750, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:43,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 17:27:43,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.64 | bwd_microstep: 4638.54 | bwd_inner_microstep: 4633.36 | bwd_allreduce_microstep: 5.08 | step_microstep: 45.83
-[2025-01-25 17:27:43,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.60 | bwd: 4638.57 | bwd_inner: 4633.36 | bwd_allreduce: 5.13 | step: 45.84
- 53%|█████▎    | 3095/5800 [8:41:13<5:11:59,  6.92s/it]                                                       {'loss': 0.0137, 'grad_norm': 3.9671573638916016, 'learning_rate': 1.879457385584674e-05, 'epoch': 26.68}
- 53%|█████▎    | 3095/5800 [8:41:13<5:11:59,  6.92s/it]score1 tensor([[0.4102],
-        [0.5703],
-        [0.4746],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.6133, 0.4980, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:50,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 17:27:50,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.39 | bwd_microstep: 4636.58 | bwd_inner_microstep: 4631.21 | bwd_allreduce_microstep: 5.27 | step_microstep: 43.21
-[2025-01-25 17:27:50,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.36 | bwd: 4636.61 | bwd_inner: 4631.21 | bwd_allreduce: 5.32 | step: 43.22
- 53%|█████▎    | 3096/5800 [8:41:20<5:12:07,  6.93s/it]                                                       {'loss': 0.019, 'grad_norm': 3.6294808387756348, 'learning_rate': 1.878342622587867e-05, 'epoch': 26.69}
- 53%|█████▎    | 3096/5800 [8:41:20<5:12:07,  6.93s/it]score1 tensor([[0.3672],
-        [0.4922],
-        [0.6094],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.4844, 0.6484, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:27:57,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 17:27:57,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.29 | bwd_microstep: 4633.45 | bwd_inner_microstep: 4628.09 | bwd_allreduce_microstep: 5.28 | step_microstep: 51.54
-[2025-01-25 17:27:57,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.26 | bwd: 4633.47 | bwd_inner: 4628.09 | bwd_allreduce: 5.32 | step: 51.55
- 53%|█████▎    | 3097/5800 [8:41:27<5:11:57,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 3.8884129524230957, 'learning_rate': 1.877227897525942e-05, 'epoch': 26.7}
- 53%|█████▎    | 3097/5800 [8:41:27<5:11:57,  6.92s/it]score1 tensor([[0.3906],
-        [0.4238],
-        [0.6172],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4473, 0.6094, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:04,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 17:28:04,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.08 | bwd_microstep: 4634.43 | bwd_inner_microstep: 4629.68 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.17
-[2025-01-25 17:28:04,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.04 | bwd: 4634.46 | bwd_inner: 4629.68 | bwd_allreduce: 4.72 | step: 42.18
- 53%|█████▎    | 3098/5800 [8:41:34<5:11:48,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 3.632336378097534, 'learning_rate': 1.8761132107464883e-05, 'epoch': 26.71}
- 53%|█████▎    | 3098/5800 [8:41:34<5:11:48,  6.92s/it]score1 tensor([[0.4375],
-        [0.4688],
-        [0.5469],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4707, 0.5508, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:11,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 17:28:11,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.29 | bwd_microstep: 4596.74 | bwd_inner_microstep: 4591.63 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.30
-[2025-01-25 17:28:11,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.25 | bwd: 4596.76 | bwd_inner: 4591.63 | bwd_allreduce: 5.05 | step: 43.30
- 53%|█████▎    | 3099/5800 [8:41:41<5:11:15,  6.91s/it]                                                       {'loss': 0.0044, 'grad_norm': 6.015540599822998, 'learning_rate': 1.8749985625970857e-05, 'epoch': 26.72}
- 53%|█████▎    | 3099/5800 [8:41:41<5:11:15,  6.91s/it]score1 tensor([[0.5625],
-        [0.5430],
-        [0.6211],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.4883, 0.6172, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:17,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:28:17,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4589.93 | bwd_inner_microstep: 4584.51 | bwd_allreduce_microstep: 5.33 | step_microstep: 45.64
-[2025-01-25 17:28:17,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.45 | bwd: 4589.96 | bwd_inner: 4584.51 | bwd_allreduce: 5.38 | step: 45.64
- 53%|█████▎    | 3100/5800 [8:41:47<5:10:34,  6.90s/it]                                                       {'loss': 0.0225, 'grad_norm': 2.333014965057373, 'learning_rate': 1.8738839534252998e-05, 'epoch': 26.72}
- 53%|█████▎    | 3100/5800 [8:41:47<5:10:34,  6.90s/it]score1 tensor([[0.4277],
-        [0.5547],
-        [0.6055],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5508, 0.6172, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:24,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.28 | optimizer_step: 4.36
-[2025-01-25 17:28:24,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.86 | bwd_microstep: 4642.38 | bwd_inner_microstep: 4636.99 | bwd_allreduce_microstep: 5.27 | step_microstep: 46.86
-[2025-01-25 17:28:24,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.81 | bwd: 4642.40 | bwd_inner: 4636.99 | bwd_allreduce: 5.33 | step: 46.87
- 53%|█████▎    | 3101/5800 [8:41:54<5:10:52,  6.91s/it]                                                       {'loss': 0.0156, 'grad_norm': 0.4559441804885864, 'learning_rate': 1.8727693835786858e-05, 'epoch': 26.73}
- 53%|█████▎    | 3101/5800 [8:41:54<5:10:52,  6.91s/it]score1 tensor([[0.3906],
-        [0.4961],
-        [0.5352],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4727, 0.5195, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:31,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 17:28:31,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.33 | bwd_microstep: 4633.04 | bwd_inner_microstep: 4627.88 | bwd_allreduce_microstep: 5.08 | step_microstep: 48.85
-[2025-01-25 17:28:31,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.30 | bwd: 4633.06 | bwd_inner: 4627.88 | bwd_allreduce: 5.13 | step: 48.86
- 53%|█████▎    | 3102/5800 [8:42:01<5:11:05,  6.92s/it]                                                       {'loss': 0.0166, 'grad_norm': 7.766653537750244, 'learning_rate': 1.8716548534047853e-05, 'epoch': 26.74}
- 53%|█████▎    | 3102/5800 [8:42:01<5:11:05,  6.92s/it]score1 tensor([[0.4961],
-        [0.6289],
-        [0.6758],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5938, 0.6641, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:38,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.37
-[2025-01-25 17:28:38,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.09 | bwd_microstep: 4595.24 | bwd_inner_microstep: 4590.27 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.70
-[2025-01-25 17:28:38,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.04 | bwd: 4595.27 | bwd_inner: 4590.27 | bwd_allreduce: 4.93 | step: 44.71
- 54%|█████▎    | 3103/5800 [8:42:08<5:10:39,  6.91s/it]                                                       {'loss': 0.0137, 'grad_norm': 6.801600933074951, 'learning_rate': 1.8705403632511286e-05, 'epoch': 26.75}
- 54%|█████▎    | 3103/5800 [8:42:08<5:10:39,  6.91s/it]score1 tensor([[0.4961],
-        [0.4648],
-        [0.5195],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4648, 0.4844, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:45,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 17:28:45,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.46 | bwd_microstep: 4582.48 | bwd_inner_microstep: 4577.46 | bwd_allreduce_microstep: 4.93 | step_microstep: 48.75
-[2025-01-25 17:28:45,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.43 | bwd: 4582.50 | bwd_inner: 4577.46 | bwd_allreduce: 4.97 | step: 48.77
- 54%|█████▎    | 3104/5800 [8:42:15<5:10:09,  6.90s/it]                                                       {'loss': 0.0166, 'grad_norm': 2.0536932945251465, 'learning_rate': 1.869425913465233e-05, 'epoch': 26.76}
- 54%|█████▎    | 3104/5800 [8:42:15<5:10:09,  6.90s/it]score1 tensor([[0.5469],
-        [0.4512],
-        [0.6719],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4258, 0.6562, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:52,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:28:52,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.42 | bwd_microstep: 4640.50 | bwd_inner_microstep: 4635.20 | bwd_allreduce_microstep: 5.18 | step_microstep: 43.90
-[2025-01-25 17:28:52,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.38 | bwd: 4640.53 | bwd_inner: 4635.20 | bwd_allreduce: 5.25 | step: 43.91
- 54%|█████▎    | 3105/5800 [8:42:22<5:10:18,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.6760956645011902, 'learning_rate': 1.868311504394604e-05, 'epoch': 26.77}
- 54%|█████▎    | 3105/5800 [8:42:22<5:10:18,  6.91s/it]score1 tensor([[0.4062],
-        [0.4883],
-        [0.4609],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4980, 0.4570, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:28:59,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 17:28:59,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.58 | bwd_microstep: 4645.37 | bwd_inner_microstep: 4639.88 | bwd_allreduce_microstep: 5.41 | step_microstep: 44.52
-[2025-01-25 17:28:59,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.55 | bwd: 4645.40 | bwd_inner: 4639.88 | bwd_allreduce: 5.45 | step: 44.53
- 54%|█████▎    | 3106/5800 [8:42:29<5:10:31,  6.92s/it]                                                       {'loss': 0.0347, 'grad_norm': 3.5865118503570557, 'learning_rate': 1.8671971363867317e-05, 'epoch': 26.78}
- 54%|█████▎    | 3106/5800 [8:42:29<5:10:31,  6.92s/it]score1 tensor([[0.4961],
-        [0.5781],
-        [0.4688],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5664, 0.4590, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:06,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 17:29:06,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.07 | bwd_microstep: 4633.44 | bwd_inner_microstep: 4628.03 | bwd_allreduce_microstep: 5.32 | step_microstep: 43.38
-[2025-01-25 17:29:06,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.03 | bwd: 4633.47 | bwd_inner: 4628.02 | bwd_allreduce: 5.37 | step: 43.38
- 54%|█████▎    | 3107/5800 [8:42:36<5:10:40,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.4052250385284424, 'learning_rate': 1.8660828097890974e-05, 'epoch': 26.78}
- 54%|█████▎    | 3107/5800 [8:42:36<5:10:40,  6.92s/it]score1 tensor([[0.3379],
-        [0.5312],
-        [0.4336],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.5703, 0.4336, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:13,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.16 | optimizer_step: 4.37
-[2025-01-25 17:29:13,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.32 | bwd_microstep: 4592.30 | bwd_inner_microstep: 4587.15 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.32
-[2025-01-25 17:29:13,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.27 | bwd: 4592.32 | bwd_inner: 4587.15 | bwd_allreduce: 5.09 | step: 44.33
- 54%|█████▎    | 3108/5800 [8:42:43<5:10:10,  6.91s/it]                                                       {'loss': 0.0264, 'grad_norm': 2.4108543395996094, 'learning_rate': 1.8649685249491658e-05, 'epoch': 26.79}
- 54%|█████▎    | 3108/5800 [8:42:43<5:10:10,  6.91s/it]score1 tensor([[0.4922],
-        [0.4727],
-        [0.4395],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5117, 0.4531, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:20,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 17:29:20,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.12 | bwd_microstep: 4642.58 | bwd_inner_microstep: 4637.67 | bwd_allreduce_microstep: 4.81 | step_microstep: 41.39
-[2025-01-25 17:29:20,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.08 | bwd: 4642.61 | bwd_inner: 4637.67 | bwd_allreduce: 4.87 | step: 41.40
- 54%|█████▎    | 3109/5800 [8:42:50<5:10:18,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 3.8196332454681396, 'learning_rate': 1.863854282214392e-05, 'epoch': 26.8}
- 54%|█████▎    | 3109/5800 [8:42:50<5:10:18,  6.92s/it]score1 tensor([[0.4375],
-        [0.3828],
-        [0.4277],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.3652, 0.3887, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:27,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 17:29:27,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.15 | bwd_microstep: 4636.45 | bwd_inner_microstep: 4631.02 | bwd_allreduce_microstep: 5.30 | step_microstep: 42.35
-[2025-01-25 17:29:27,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.11 | bwd: 4636.49 | bwd_inner: 4631.02 | bwd_allreduce: 5.36 | step: 42.36
- 54%|█████▎    | 3110/5800 [8:42:57<5:10:25,  6.92s/it]                                                       {'loss': 0.02, 'grad_norm': 3.6797540187835693, 'learning_rate': 1.8627400819322146e-05, 'epoch': 26.81}
- 54%|█████▎    | 3110/5800 [8:42:57<5:10:25,  6.92s/it]score1 tensor([[0.4766],
-        [0.5156],
-        [0.5117],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5586, 0.5273, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0308, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:34,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 17:29:34,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.75 | bwd_microstep: 4636.48 | bwd_inner_microstep: 4631.42 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.56
-[2025-01-25 17:29:34,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.69 | bwd: 4636.51 | bwd_inner: 4631.42 | bwd_allreduce: 5.02 | step: 43.57
- 54%|█████▎    | 3111/5800 [8:43:04<5:10:24,  6.93s/it]                                                       {'loss': 0.0308, 'grad_norm': 8.110686302185059, 'learning_rate': 1.8616259244500622e-05, 'epoch': 26.82}
- 54%|█████▎    | 3111/5800 [8:43:04<5:10:24,  6.93s/it]score1 tensor([[0.3242],
-        [0.5508],
-        [0.4395],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.5547, 0.4668, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:40,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 17:29:40,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.12 | bwd_microstep: 4637.70 | bwd_inner_microstep: 4632.76 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.33
-[2025-01-25 17:29:40,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.07 | bwd: 4637.72 | bwd_inner: 4632.76 | bwd_allreduce: 4.90 | step: 43.34
- 54%|█████▎    | 3112/5800 [8:43:10<5:10:23,  6.93s/it]                                                       {'loss': 0.0195, 'grad_norm': 4.370243072509766, 'learning_rate': 1.8605118101153476e-05, 'epoch': 26.83}
- 54%|█████▎    | 3112/5800 [8:43:10<5:10:23,  6.93s/it]score1 tensor([[0.5078],
-        [0.4805],
-        [0.5469],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4863, 0.5664, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:47,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 17:29:47,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.16 | bwd_microstep: 4646.23 | bwd_inner_microstep: 4640.90 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.40
-[2025-01-25 17:29:47,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.12 | bwd: 4646.26 | bwd_inner: 4640.90 | bwd_allreduce: 5.29 | step: 45.41
- 54%|█████▎    | 3113/5800 [8:43:17<5:10:24,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 8.071949005126953, 'learning_rate': 1.85939773927547e-05, 'epoch': 26.84}
- 54%|█████▎    | 3113/5800 [8:43:17<5:10:24,  6.93s/it]score1 tensor([[0.4668],
-        [0.4961],
-        [0.4805],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5039, 0.5117, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:29:54,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 17:29:54,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.83 | bwd_microstep: 4635.09 | bwd_inner_microstep: 4629.70 | bwd_allreduce_microstep: 5.28 | step_microstep: 45.52
-[2025-01-25 17:29:54,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.80 | bwd: 4635.13 | bwd_inner: 4629.70 | bwd_allreduce: 5.34 | step: 45.53
- 54%|█████▎    | 3114/5800 [8:43:24<5:10:18,  6.93s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.033081531524658, 'learning_rate': 1.858283712277818e-05, 'epoch': 26.84}
- 54%|█████▎    | 3114/5800 [8:43:24<5:10:18,  6.93s/it]score1 tensor([[0.4395],
-        [0.5117],
-        [0.6641],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5195, 0.7070, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:01,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 17:30:01,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.57 | bwd_microstep: 4637.73 | bwd_inner_microstep: 4632.69 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.31
-[2025-01-25 17:30:01,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.54 | bwd: 4637.76 | bwd_inner: 4632.69 | bwd_allreduce: 4.99 | step: 43.32
- 54%|█████▎    | 3115/5800 [8:43:31<5:10:14,  6.93s/it]                                                       {'loss': 0.0322, 'grad_norm': 4.790304183959961, 'learning_rate': 1.8571697294697625e-05, 'epoch': 26.85}
- 54%|█████▎    | 3115/5800 [8:43:31<5:10:14,  6.93s/it]score1 tensor([[0.5586],
-        [0.5508],
-        [0.6094],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5742, 0.6094, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:08,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:30:08,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.87 | bwd_microstep: 4591.24 | bwd_inner_microstep: 4586.54 | bwd_allreduce_microstep: 4.62 | step_microstep: 49.43
-[2025-01-25 17:30:08,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.84 | bwd: 4591.27 | bwd_inner: 4586.54 | bwd_allreduce: 4.66 | step: 49.44
- 54%|█████▎    | 3116/5800 [8:43:38<5:09:28,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 1.9464210271835327, 'learning_rate': 1.8560557911986644e-05, 'epoch': 26.86}
- 54%|█████▎    | 3116/5800 [8:43:38<5:09:28,  6.92s/it]score1 tensor([[0.5742],
-        [0.3730],
-        [0.6094],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3340, 0.5781, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:15,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 17:30:15,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4642.37 | bwd_inner_microstep: 4636.30 | bwd_allreduce_microstep: 5.90 | step_microstep: 48.20
-[2025-01-25 17:30:15,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4642.40 | bwd_inner: 4636.30 | bwd_allreduce: 6.00 | step: 48.20
- 54%|█████▎    | 3117/5800 [8:43:45<5:09:34,  6.92s/it]                                                       {'loss': 0.0283, 'grad_norm': 8.253931999206543, 'learning_rate': 1.8549418978118673e-05, 'epoch': 26.87}
- 54%|█████▎    | 3117/5800 [8:43:45<5:09:34,  6.92s/it]score1 tensor([[0.5273],
-        [0.6602],
-        [0.6211],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.6328, 0.6094, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:22,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.57 | optimizer_step: 4.36
-[2025-01-25 17:30:22,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.18 | bwd_microstep: 4591.54 | bwd_inner_microstep: 4586.46 | bwd_allreduce_microstep: 4.97 | step_microstep: 44.86
-[2025-01-25 17:30:22,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.15 | bwd: 4591.57 | bwd_inner: 4586.46 | bwd_allreduce: 5.03 | step: 44.86
- 54%|█████▍    | 3118/5800 [8:43:52<5:08:57,  6.91s/it]                                                       {'loss': 0.0166, 'grad_norm': 6.845578670501709, 'learning_rate': 1.8538280496567045e-05, 'epoch': 26.88}
- 54%|█████▍    | 3118/5800 [8:43:52<5:08:57,  6.91s/it]score1 tensor([[0.5000],
-        [0.5312],
-        [0.3711],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5312, 0.3457, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:29,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 17:30:29,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.42 | bwd_microstep: 4581.15 | bwd_inner_microstep: 4575.85 | bwd_allreduce_microstep: 5.15 | step_microstep: 43.56
-[2025-01-25 17:30:29,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.37 | bwd: 4581.17 | bwd_inner: 4575.86 | bwd_allreduce: 5.23 | step: 43.56
- 54%|█████▍    | 3119/5800 [8:43:59<5:08:14,  6.90s/it]                                                       {'loss': 0.0122, 'grad_norm': 5.595898628234863, 'learning_rate': 1.8527142470804916e-05, 'epoch': 26.89}
- 54%|█████▍    | 3119/5800 [8:43:59<5:08:14,  6.90s/it]score1 tensor([[0.5156],
-        [0.5234],
-        [0.5898],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5039, 0.5742, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:36,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 17:30:36,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.87 | bwd_microstep: 4642.83 | bwd_inner_microstep: 4637.01 | bwd_allreduce_microstep: 5.71 | step_microstep: 46.30
-[2025-01-25 17:30:36,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.82 | bwd: 4642.85 | bwd_inner: 4637.01 | bwd_allreduce: 5.77 | step: 46.31
- 54%|█████▍    | 3120/5800 [8:44:06<5:08:33,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 8.219095230102539, 'learning_rate': 1.8516004904305322e-05, 'epoch': 26.9}
- 54%|█████▍    | 3120/5800 [8:44:06<5:08:33,  6.91s/it]score1 tensor([[0.5078],
-        [0.4961],
-        [0.4238],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4941, 0.4004, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:43,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 17:30:43,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.04 | bwd_microstep: 4634.06 | bwd_inner_microstep: 4628.55 | bwd_allreduce_microstep: 5.41 | step_microstep: 44.21
-[2025-01-25 17:30:43,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.00 | bwd: 4634.08 | bwd_inner: 4628.55 | bwd_allreduce: 5.46 | step: 44.22
- 54%|█████▍    | 3121/5800 [8:44:13<5:08:42,  6.91s/it]                                                       {'loss': 0.0166, 'grad_norm': 8.017367362976074, 'learning_rate': 1.8504867800541148e-05, 'epoch': 26.91}
- 54%|█████▍    | 3121/5800 [8:44:13<5:08:42,  6.91s/it]score1 tensor([[0.3848],
-        [0.5859],
-        [0.5742],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.5508, 0.5508, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:50,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 17:30:50,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.11 | bwd_microstep: 4639.47 | bwd_inner_microstep: 4634.15 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.42
-[2025-01-25 17:30:50,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.05 | bwd: 4639.49 | bwd_inner: 4634.15 | bwd_allreduce: 5.27 | step: 43.43
- 54%|█████▍    | 3122/5800 [8:44:20<5:08:47,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 4.26318359375, 'learning_rate': 1.8493731162985135e-05, 'epoch': 26.91}
- 54%|█████▍    | 3122/5800 [8:44:20<5:08:47,  6.92s/it]score1 tensor([[0.5547],
-        [0.4707],
-        [0.6016],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4648, 0.5625, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:30:57,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 17:30:57,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.40 | bwd_microstep: 4640.24 | bwd_inner_microstep: 4635.03 | bwd_allreduce_microstep: 5.12 | step_microstep: 44.79
-[2025-01-25 17:30:57,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.37 | bwd: 4640.27 | bwd_inner: 4635.03 | bwd_allreduce: 5.16 | step: 44.80
- 54%|█████▍    | 3123/5800 [8:44:27<5:08:45,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.04071044921875, 'learning_rate': 1.8482594995109887e-05, 'epoch': 26.92}
- 54%|█████▍    | 3123/5800 [8:44:27<5:08:45,  6.92s/it]score1 tensor([[0.5391],
-        [0.5625],
-        [0.4922],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5625, 0.5000, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:03,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 17:31:03,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.52 | bwd_microstep: 4580.73 | bwd_inner_microstep: 4575.48 | bwd_allreduce_microstep: 5.12 | step_microstep: 45.67
-[2025-01-25 17:31:03,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.49 | bwd: 4580.77 | bwd_inner: 4575.48 | bwd_allreduce: 5.19 | step: 45.68
- 54%|█████▍    | 3124/5800 [8:44:33<5:07:55,  6.90s/it]                                                       {'loss': 0.0098, 'grad_norm': 1.907314419746399, 'learning_rate': 1.8471459300387846e-05, 'epoch': 26.93}
- 54%|█████▍    | 3124/5800 [8:44:33<5:07:55,  6.90s/it]score1 tensor([[0.4863],
-        [0.4375],
-        [0.4727],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4551, 0.4961, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:10,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 17:31:10,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.05 | bwd_microstep: 4640.23 | bwd_inner_microstep: 4635.03 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.45
-[2025-01-25 17:31:10,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.02 | bwd: 4640.26 | bwd_inner: 4635.03 | bwd_allreduce: 5.15 | step: 43.45
- 54%|█████▍    | 3125/5800 [8:44:40<5:08:03,  6.91s/it]                                                       {'loss': 0.0151, 'grad_norm': 3.8530843257904053, 'learning_rate': 1.8460324082291313e-05, 'epoch': 26.94}
- 54%|█████▍    | 3125/5800 [8:44:40<5:08:03,  6.91s/it]score1 tensor([[0.4902],
-        [0.5586],
-        [0.4160],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5391, 0.4062, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:17,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:31:17,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.00 | bwd_microstep: 4637.73 | bwd_inner_microstep: 4632.62 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.44
-[2025-01-25 17:31:17,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.96 | bwd: 4637.76 | bwd_inner: 4632.62 | bwd_allreduce: 5.06 | step: 43.45
- 54%|█████▍    | 3126/5800 [8:44:47<5:08:04,  6.91s/it]                                                       {'loss': 0.0098, 'grad_norm': 3.8050522804260254, 'learning_rate': 1.8449189344292455e-05, 'epoch': 26.95}
- 54%|█████▍    | 3126/5800 [8:44:47<5:08:04,  6.91s/it]score1 tensor([[0.5898],
-        [0.5078],
-        [0.4473],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4844, 0.4473, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:24,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.36
-[2025-01-25 17:31:24,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.94 | bwd_microstep: 4586.62 | bwd_inner_microstep: 4581.49 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.74
-[2025-01-25 17:31:24,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.91 | bwd: 4586.64 | bwd_inner: 4581.49 | bwd_allreduce: 5.07 | step: 46.75
- 54%|█████▍    | 3127/5800 [8:44:54<5:07:24,  6.90s/it]                                                       {'loss': 0.0137, 'grad_norm': 1.9282282590866089, 'learning_rate': 1.843805508986326e-05, 'epoch': 26.96}
- 54%|█████▍    | 3127/5800 [8:44:54<5:07:24,  6.90s/it]score1 tensor([[0.6055],
-        [0.3887],
-        [0.4688],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4004, 0.4453, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:31,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 17:31:31,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.72 | bwd_microstep: 4637.02 | bwd_inner_microstep: 4631.67 | bwd_allreduce_microstep: 5.24 | step_microstep: 44.41
-[2025-01-25 17:31:31,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.68 | bwd: 4637.04 | bwd_inner: 4631.67 | bwd_allreduce: 5.29 | step: 44.42
- 54%|█████▍    | 3128/5800 [8:45:01<5:07:35,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 4.57619047164917, 'learning_rate': 1.8426921322475596e-05, 'epoch': 26.97}
- 54%|█████▍    | 3128/5800 [8:45:01<5:07:35,  6.91s/it]score1 tensor([[0.4492],
-        [0.6523],
-        [0.4043],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.6797, 0.4453, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:38,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.03 | optimizer_step: 4.37
-[2025-01-25 17:31:38,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.54 | bwd_microstep: 4628.73 | bwd_inner_microstep: 4623.77 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.82
-[2025-01-25 17:31:38,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.50 | bwd: 4628.76 | bwd_inner: 4623.77 | bwd_allreduce: 4.92 | step: 44.83
- 54%|█████▍    | 3129/5800 [8:45:08<5:07:29,  6.91s/it]                                                       {'loss': 0.0239, 'grad_norm': 4.309577941894531, 'learning_rate': 1.841578804560115e-05, 'epoch': 26.97}
- 54%|█████▍    | 3129/5800 [8:45:08<5:07:29,  6.91s/it]score1 tensor([[0.4727],
-        [0.3301],
-        [0.4004],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3477, 0.4180, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:45,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 17:31:45,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.86 | bwd_microstep: 4640.70 | bwd_inner_microstep: 4635.77 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.92
-[2025-01-25 17:31:45,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.82 | bwd: 4640.72 | bwd_inner: 4635.77 | bwd_allreduce: 4.88 | step: 43.93
- 54%|█████▍    | 3130/5800 [8:45:15<5:07:38,  6.91s/it]                                                       {'loss': 0.0288, 'grad_norm': 7.258059501647949, 'learning_rate': 1.8404655262711494e-05, 'epoch': 26.98}
- 54%|█████▍    | 3130/5800 [8:45:15<5:07:38,  6.91s/it]score1 tensor([[0.5391],
-        [0.4316],
-        [0.4414],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4570, 0.4492, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:52,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:31:52,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.68 | bwd_microstep: 4631.77 | bwd_inner_microstep: 4626.70 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.87
-[2025-01-25 17:31:52,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.65 | bwd: 4631.80 | bwd_inner: 4626.70 | bwd_allreduce: 5.02 | step: 43.88
- 54%|█████▍    | 3131/5800 [8:45:22<5:07:30,  6.91s/it]                                                       {'loss': 0.0176, 'grad_norm': 7.687956809997559, 'learning_rate': 1.8393522977278008e-05, 'epoch': 26.99}
- 54%|█████▍    | 3131/5800 [8:45:22<5:07:30,  6.91s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:31:56,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 17:31:56,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 573.65 | bwd_microstep: 1221.74 | bwd_inner_microstep: 1216.34 | bwd_allreduce_microstep: 5.28 | step_microstep: 45.28
-[2025-01-25 17:31:56,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 573.61 | bwd: 1221.76 | bwd_inner: 1216.34 | bwd_allreduce: 5.35 | step: 45.29
- 54%|█████▍    | 3132/5800 [8:45:26<4:36:08,  6.21s/it]                                                       {'loss': 0.0059, 'grad_norm': 7.335108757019043, 'learning_rate': 1.838239119277192e-05, 'epoch': 27.0}
- 54%|█████▍    | 3132/5800 [8:45:26<4:36:08,  6.21s/it][2025-01-25 17:32:01,353] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:32:12,111] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:32:22,968] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:32:33,753] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4648],
-        [0.6562],
-        [0.5352],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6836, 0.5664, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:32:57,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.22 | optimizer_step: 4.36
-[2025-01-25 17:32:57,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.88 | bwd_microstep: 4577.63 | bwd_inner_microstep: 4572.59 | bwd_allreduce_microstep: 4.96 | step_microstep: 47.50
-[2025-01-25 17:32:57,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.83 | bwd: 4577.66 | bwd_inner: 4572.59 | bwd_allreduce: 5.00 | step: 47.50
- 54%|█████▍    | 3133/5800 [8:46:26<16:34:52, 22.38s/it]                                                        {'loss': 0.0269, 'grad_norm': 8.806113243103027, 'learning_rate': 1.8371259912664336e-05, 'epoch': 27.01}
- 54%|█████▍    | 3133/5800 [8:46:26<16:34:52, 22.38s/it]score1 tensor([[0.3789],
-        [0.6289],
-        [0.4668],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.6289, 0.4688, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:03,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 17:33:03,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2128.12 | bwd_microstep: 4537.01 | bwd_inner_microstep: 4532.09 | bwd_allreduce_microstep: 4.85 | step_microstep: 42.87
-[2025-01-25 17:33:03,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2128.08 | bwd: 4537.03 | bwd_inner: 4532.09 | bwd_allreduce: 4.88 | step: 42.88
- 54%|█████▍    | 3134/5800 [8:46:33<13:06:40, 17.70s/it]                                                        {'loss': 0.0137, 'grad_norm': 5.961374759674072, 'learning_rate': 1.8360129140426163e-05, 'epoch': 27.02}
- 54%|█████▍    | 3134/5800 [8:46:33<13:06:40, 17.70s/it]score1 tensor([[0.5977],
-        [0.3965],
-        [0.4414],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3945, 0.4297, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:10,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:33:10,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2129.55 | bwd_microstep: 4587.27 | bwd_inner_microstep: 4582.51 | bwd_allreduce_microstep: 4.64 | step_microstep: 43.86
-[2025-01-25 17:33:10,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2129.52 | bwd: 4587.29 | bwd_inner: 4582.51 | bwd_allreduce: 4.70 | step: 43.86
- 54%|█████▍    | 3135/5800 [8:46:40<10:41:36, 14.45s/it]                                                        {'loss': 0.0332, 'grad_norm': 7.722053050994873, 'learning_rate': 1.834899887952819e-05, 'epoch': 27.03}
- 54%|█████▍    | 3135/5800 [8:46:40<10:41:36, 14.45s/it]score1 tensor([[0.4902],
-        [0.3711],
-        [0.4902],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3750, 0.4961, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:17,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:33:17,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.89 | bwd_microstep: 4597.40 | bwd_inner_microstep: 4592.89 | bwd_allreduce_microstep: 4.42 | step_microstep: 41.98
-[2025-01-25 17:33:17,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.84 | bwd: 4597.42 | bwd_inner: 4592.89 | bwd_allreduce: 4.46 | step: 41.98
- 54%|█████▍    | 3136/5800 [8:46:47<9:00:17, 12.17s/it]                                                        {'loss': 0.0088, 'grad_norm': 7.668392658233643, 'learning_rate': 1.8337869133441e-05, 'epoch': 27.03}
- 54%|█████▍    | 3136/5800 [8:46:47<9:00:17, 12.17s/it]score1 tensor([[0.4414],
-        [0.4609],
-        [0.5000],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4668, 0.4688, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:24,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.37
-[2025-01-25 17:33:24,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.76 | bwd_microstep: 4603.94 | bwd_inner_microstep: 4599.24 | bwd_allreduce_microstep: 4.62 | step_microstep: 45.45
-[2025-01-25 17:33:24,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.73 | bwd: 4603.96 | bwd_inner: 4599.24 | bwd_allreduce: 4.66 | step: 45.46
- 54%|█████▍    | 3137/5800 [8:46:54<7:49:30, 10.58s/it]                                                       {'loss': 0.0186, 'grad_norm': 3.8204033374786377, 'learning_rate': 1.8326739905635066e-05, 'epoch': 27.04}
- 54%|█████▍    | 3137/5800 [8:46:54<7:49:30, 10.58s/it]score1 tensor([[0.4395],
-        [0.4473],
-        [0.4336],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4277, 0.4453, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:31,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 17:33:31,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.25 | bwd_microstep: 4598.97 | bwd_inner_microstep: 4593.84 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.66
-[2025-01-25 17:33:31,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.21 | bwd: 4598.99 | bwd_inner: 4593.84 | bwd_allreduce: 5.07 | step: 44.66
- 54%|█████▍    | 3138/5800 [8:47:01<6:59:56,  9.47s/it]                                                       {'loss': 0.0122, 'grad_norm': 4.1034064292907715, 'learning_rate': 1.831561119958066e-05, 'epoch': 27.05}
- 54%|█████▍    | 3138/5800 [8:47:01<6:59:56,  9.47s/it]score1 tensor([[0.6367],
-        [0.4863],
-        [0.5039],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.4980, 0.4883, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:38,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 17:33:38,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.09 | bwd_microstep: 4548.91 | bwd_inner_microstep: 4543.99 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.01
-[2025-01-25 17:33:38,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.06 | bwd: 4548.94 | bwd_inner: 4543.99 | bwd_allreduce: 4.87 | step: 44.02
- 54%|█████▍    | 3139/5800 [8:47:08<6:24:32,  8.67s/it]                                                       {'loss': 0.0127, 'grad_norm': 2.344642400741577, 'learning_rate': 1.8304483018747903e-05, 'epoch': 27.06}
- 54%|█████▍    | 3139/5800 [8:47:08<6:24:32,  8.67s/it]score1 tensor([[0.6328],
-        [0.5898],
-        [0.5156],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5938, 0.4941, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:44,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 17:33:44,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.84 | bwd_microstep: 4587.06 | bwd_inner_microstep: 4582.09 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.34
-[2025-01-25 17:33:44,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.79 | bwd: 4587.09 | bwd_inner: 4582.09 | bwd_allreduce: 4.93 | step: 43.34
- 54%|█████▍    | 3140/5800 [8:47:14<6:00:19,  8.13s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.7583722472190857, 'learning_rate': 1.829335536660677e-05, 'epoch': 27.07}
- 54%|█████▍    | 3140/5800 [8:47:14<6:00:19,  8.13s/it]score1 tensor([[0.4902],
-        [0.5820],
-        [0.4551],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5469, 0.4355, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:51,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 17:33:51,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.28 | bwd_microstep: 4590.88 | bwd_inner_microstep: 4585.90 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.90
-[2025-01-25 17:33:51,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.24 | bwd: 4590.91 | bwd_inner: 4585.90 | bwd_allreduce: 4.93 | step: 43.91
- 54%|█████▍    | 3141/5800 [8:47:21<5:43:19,  7.75s/it]                                                       {'loss': 0.02, 'grad_norm': 7.935324668884277, 'learning_rate': 1.8282228246627035e-05, 'epoch': 27.08}
- 54%|█████▍    | 3141/5800 [8:47:21<5:43:19,  7.75s/it]score1 tensor([[0.5352],
-        [0.5352],
-        [0.4043],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5117, 0.3750, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:33:58,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 17:33:58,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.12 | bwd_microstep: 4592.73 | bwd_inner_microstep: 4587.54 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.84
-[2025-01-25 17:33:58,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.10 | bwd: 4592.76 | bwd_inner: 4587.54 | bwd_allreduce: 5.14 | step: 43.84
- 54%|█████▍    | 3142/5800 [8:47:28<5:31:21,  7.48s/it]                                                       {'loss': 0.0317, 'grad_norm': 8.281097412109375, 'learning_rate': 1.827110166227835e-05, 'epoch': 27.09}
- 54%|█████▍    | 3142/5800 [8:47:28<5:31:21,  7.48s/it]score1 tensor([[0.4980],
-        [0.5352],
-        [0.5547],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5352, 0.5391, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:05,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:34:05,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.44 | bwd_microstep: 4554.86 | bwd_inner_microstep: 4545.32 | bwd_allreduce_microstep: 9.40 | step_microstep: 60.29
-[2025-01-25 17:34:05,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.41 | bwd: 4554.90 | bwd_inner: 4545.32 | bwd_allreduce: 9.47 | step: 60.28
- 54%|█████▍    | 3143/5800 [8:47:35<5:22:49,  7.29s/it]                                                       {'loss': 0.0122, 'grad_norm': 6.5481085777282715, 'learning_rate': 1.8259975617030166e-05, 'epoch': 27.09}
- 54%|█████▍    | 3143/5800 [8:47:35<5:22:49,  7.29s/it]score1 tensor([[0.4062],
-        [0.3301],
-        [0.4570],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.3457, 0.4707, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:12,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 17:34:12,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.35 | bwd_microstep: 4607.06 | bwd_inner_microstep: 4601.77 | bwd_allreduce_microstep: 5.22 | step_microstep: 45.05
-[2025-01-25 17:34:12,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.31 | bwd: 4607.08 | bwd_inner: 4601.77 | bwd_allreduce: 5.25 | step: 45.06
- 54%|█████▍    | 3144/5800 [8:47:42<5:17:16,  7.17s/it]                                                       {'loss': 0.0142, 'grad_norm': 3.29769229888916, 'learning_rate': 1.8248850114351766e-05, 'epoch': 27.1}
- 54%|█████▍    | 3144/5800 [8:47:42<5:17:16,  7.17s/it]score1 tensor([[0.4746],
-        [0.4668],
-        [0.6094],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4922, 0.6094, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:19,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 17:34:19,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.61 | bwd_microstep: 4547.44 | bwd_inner_microstep: 4542.60 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.02
-[2025-01-25 17:34:19,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.56 | bwd: 4547.46 | bwd_inner: 4542.60 | bwd_allreduce: 4.81 | step: 42.03
- 54%|█████▍    | 3145/5800 [8:47:49<5:12:34,  7.06s/it]                                                       {'loss': 0.0117, 'grad_norm': 6.164815425872803, 'learning_rate': 1.8237725157712295e-05, 'epoch': 27.11}
- 54%|█████▍    | 3145/5800 [8:47:49<5:12:34,  7.06s/it]score1 tensor([[0.3887],
-        [0.4180],
-        [0.6094],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4004, 0.6055, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:26,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 17:34:26,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.67 | bwd_microstep: 4605.12 | bwd_inner_microstep: 4599.84 | bwd_allreduce_microstep: 5.20 | step_microstep: 46.50
-[2025-01-25 17:34:26,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.63 | bwd: 4605.15 | bwd_inner: 4599.84 | bwd_allreduce: 5.23 | step: 46.52
- 54%|█████▍    | 3146/5800 [8:47:56<5:10:05,  7.01s/it]                                                       {'loss': 0.0103, 'grad_norm': 8.042257308959961, 'learning_rate': 1.8226600750580688e-05, 'epoch': 27.12}
- 54%|█████▍    | 3146/5800 [8:47:56<5:10:05,  7.01s/it]score1 tensor([[0.4297],
-        [0.5469],
-        [0.5820],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5430, 0.6211, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:32,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 17:34:32,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.50 | bwd_microstep: 4604.78 | bwd_inner_microstep: 4599.69 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.13
-[2025-01-25 17:34:32,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.46 | bwd: 4604.80 | bwd_inner: 4599.69 | bwd_allreduce: 5.04 | step: 43.13
- 54%|█████▍    | 3147/5800 [8:48:02<5:08:10,  6.97s/it]                                                       {'loss': 0.0146, 'grad_norm': 3.8817710876464844, 'learning_rate': 1.821547689642575e-05, 'epoch': 27.13}
- 54%|█████▍    | 3147/5800 [8:48:02<5:08:10,  6.97s/it]score1 tensor([[0.5195],
-        [0.5195],
-        [0.4805],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5156, 0.4883, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:39,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 17:34:39,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.40 | bwd_microstep: 4604.44 | bwd_inner_microstep: 4599.22 | bwd_allreduce_microstep: 5.12 | step_microstep: 42.70
-[2025-01-25 17:34:39,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.37 | bwd: 4604.47 | bwd_inner: 4599.22 | bwd_allreduce: 5.17 | step: 42.71
- 54%|█████▍    | 3148/5800 [8:48:09<5:06:55,  6.94s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.46662086248397827, 'learning_rate': 1.820435359871607e-05, 'epoch': 27.14}
- 54%|█████▍    | 3148/5800 [8:48:09<5:06:55,  6.94s/it]score1 tensor([[0.3926],
-        [0.4902],
-        [0.5898],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4941, 0.6055, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:46,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 17:34:46,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.12 | bwd_microstep: 4608.37 | bwd_inner_microstep: 4603.41 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.92
-[2025-01-25 17:34:46,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.09 | bwd: 4608.39 | bwd_inner: 4603.41 | bwd_allreduce: 4.91 | step: 43.92
- 54%|█████▍    | 3149/5800 [8:48:16<5:06:07,  6.93s/it]                                                       {'loss': 0.0122, 'grad_norm': 8.049306869506836, 'learning_rate': 1.8193230860920103e-05, 'epoch': 27.15}
- 54%|█████▍    | 3149/5800 [8:48:16<5:06:07,  6.93s/it]score1 tensor([[0.3281],
-        [0.4004],
-        [0.4336],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.4258, 0.4590, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:34:53,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 17:34:53,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.09 | bwd_microstep: 4626.70 | bwd_inner_microstep: 4621.70 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.54
-[2025-01-25 17:34:53,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.05 | bwd: 4626.72 | bwd_inner: 4621.69 | bwd_allreduce: 4.95 | step: 43.55
- 54%|█████▍    | 3150/5800 [8:48:23<5:05:45,  6.92s/it]                                                       {'loss': 0.0249, 'grad_norm': 3.3035292625427246, 'learning_rate': 1.8182108686506104e-05, 'epoch': 27.16}
- 54%|█████▍    | 3150/5800 [8:48:23<5:05:45,  6.92s/it]score1 tensor([[0.6523],
-        [0.4414],
-        [0.5078],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4590, 0.5117, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:00,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:35:00,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.16 | bwd_microstep: 4632.82 | bwd_inner_microstep: 4627.97 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.94
-[2025-01-25 17:35:00,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.11 | bwd: 4632.85 | bwd_inner: 4627.97 | bwd_allreduce: 4.81 | step: 43.96
- 54%|█████▍    | 3151/5800 [8:48:30<5:05:32,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.6292648911476135, 'learning_rate': 1.817098707894215e-05, 'epoch': 27.16}
- 54%|█████▍    | 3151/5800 [8:48:30<5:05:32,  6.92s/it]score1 tensor([[0.4473],
-        [0.5352],
-        [0.5625],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5312, 0.5625, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:07,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:35:07,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.55 | bwd_microstep: 4580.71 | bwd_inner_microstep: 4575.90 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.45
-[2025-01-25 17:35:07,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.52 | bwd: 4580.73 | bwd_inner: 4575.90 | bwd_allreduce: 4.76 | step: 42.46
- 54%|█████▍    | 3152/5800 [8:48:37<5:04:42,  6.90s/it]                                                       {'loss': 0.0078, 'grad_norm': 2.085505485534668, 'learning_rate': 1.815986604169617e-05, 'epoch': 27.17}
- 54%|█████▍    | 3152/5800 [8:48:37<5:04:42,  6.90s/it]score1 tensor([[0.4336],
-        [0.5703],
-        [0.5078],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5664, 0.5312, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:14,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 17:35:14,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.82 | bwd_microstep: 4630.43 | bwd_inner_microstep: 4625.20 | bwd_allreduce_microstep: 5.14 | step_microstep: 39.56
-[2025-01-25 17:35:14,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.77 | bwd: 4630.46 | bwd_inner: 4625.20 | bwd_allreduce: 5.18 | step: 39.56
- 54%|█████▍    | 3153/5800 [8:48:44<5:04:41,  6.91s/it]                                                       {'loss': 0.0156, 'grad_norm': 0.5088024139404297, 'learning_rate': 1.814874557823588e-05, 'epoch': 27.18}
- 54%|█████▍    | 3153/5800 [8:48:44<5:04:41,  6.91s/it]score1 tensor([[0.4297],
-        [0.4082],
-        [0.4902],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4141, 0.4980, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:21,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 17:35:21,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.12 | bwd_microstep: 4631.95 | bwd_inner_microstep: 4627.20 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.30
-[2025-01-25 17:35:21,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.10 | bwd: 4631.97 | bwd_inner: 4627.20 | bwd_allreduce: 4.71 | step: 42.31
- 54%|█████▍    | 3154/5800 [8:48:51<5:04:35,  6.91s/it]                                                       {'loss': 0.0112, 'grad_norm': 7.643856048583984, 'learning_rate': 1.8137625692028848e-05, 'epoch': 27.19}
- 54%|█████▍    | 3154/5800 [8:48:51<5:04:35,  6.91s/it]score1 tensor([[0.5078],
-        [0.4473],
-        [0.4609],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4629, 0.4746, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:28,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 17:35:28,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.08 | bwd_microstep: 4625.25 | bwd_inner_microstep: 4620.55 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.26
-[2025-01-25 17:35:28,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.04 | bwd: 4625.28 | bwd_inner: 4620.55 | bwd_allreduce: 4.66 | step: 43.27
- 54%|█████▍    | 3155/5800 [8:48:58<5:04:30,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 3.750277280807495, 'learning_rate': 1.8126506386542438e-05, 'epoch': 27.2}
- 54%|█████▍    | 3155/5800 [8:48:58<5:04:30,  6.91s/it]score1 tensor([[0.4238],
-        [0.4961],
-        [0.6992],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4844, 0.6875, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:35,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 17:35:35,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.02 | bwd_microstep: 4628.27 | bwd_inner_microstep: 4623.12 | bwd_allreduce_microstep: 5.04 | step_microstep: 41.91
-[2025-01-25 17:35:35,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.98 | bwd: 4628.30 | bwd_inner: 4623.12 | bwd_allreduce: 5.09 | step: 41.92
- 54%|█████▍    | 3156/5800 [8:49:05<5:04:22,  6.91s/it]                                                       {'loss': 0.0117, 'grad_norm': 8.1568603515625, 'learning_rate': 1.811538766524384e-05, 'epoch': 27.21}
- 54%|█████▍    | 3156/5800 [8:49:05<5:04:22,  6.91s/it]score1 tensor([[0.5117],
-        [0.6953],
-        [0.4297],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.7070, 0.4551, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:41,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 17:35:41,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.25 | bwd_microstep: 4630.27 | bwd_inner_microstep: 4625.24 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.76
-[2025-01-25 17:35:41,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.22 | bwd: 4630.29 | bwd_inner: 4625.24 | bwd_allreduce: 4.98 | step: 43.77
- 54%|█████▍    | 3157/5800 [8:49:11<5:04:28,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 1.0784591436386108, 'learning_rate': 1.8104269531600077e-05, 'epoch': 27.22}
- 54%|█████▍    | 3157/5800 [8:49:11<5:04:28,  6.91s/it]score1 tensor([[0.6836],
-        [0.4746],
-        [0.3809],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4648, 0.3789, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:48,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 17:35:48,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4629.94 | bwd_inner_microstep: 4624.76 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.72
-[2025-01-25 17:35:48,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4629.97 | bwd_inner: 4624.76 | bwd_allreduce: 5.13 | step: 44.73
- 54%|█████▍    | 3158/5800 [8:49:18<5:04:24,  6.91s/it]                                                       {'loss': 0.0107, 'grad_norm': 1.6569875478744507, 'learning_rate': 1.8093151989077958e-05, 'epoch': 27.22}
- 54%|█████▍    | 3158/5800 [8:49:18<5:04:24,  6.91s/it]score1 tensor([[0.5391],
-        [0.5117],
-        [0.5586],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5117, 0.5391, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:35:55,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 17:35:55,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4544.79 | bwd_inner_microstep: 4539.62 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.69
-[2025-01-25 17:35:55,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4544.82 | bwd_inner: 4539.62 | bwd_allreduce: 5.14 | step: 44.70
- 54%|█████▍    | 3159/5800 [8:49:25<5:03:05,  6.89s/it]                                                       {'loss': 0.0054, 'grad_norm': 0.43990930914878845, 'learning_rate': 1.8082035041144144e-05, 'epoch': 27.23}
- 54%|█████▍    | 3159/5800 [8:49:25<5:03:05,  6.89s/it]score1 tensor([[0.6406],
-        [0.6562],
-        [0.5703],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6484, 0.5781, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:02,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 17:36:02,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.73 | bwd_microstep: 4635.15 | bwd_inner_microstep: 4629.85 | bwd_allreduce_microstep: 5.18 | step_microstep: 43.84
-[2025-01-25 17:36:02,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.70 | bwd: 4635.19 | bwd_inner: 4629.85 | bwd_allreduce: 5.23 | step: 43.85
- 54%|█████▍    | 3160/5800 [8:49:32<5:03:22,  6.90s/it]                                                       {'loss': 0.0117, 'grad_norm': 4.59368896484375, 'learning_rate': 1.8070918691265075e-05, 'epoch': 27.24}
- 54%|█████▍    | 3160/5800 [8:49:32<5:03:22,  6.90s/it]score1 tensor([[0.5273],
-        [0.6367],
-        [0.3652],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6562, 0.3438, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:09,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 17:36:09,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.82 | bwd_microstep: 4624.20 | bwd_inner_microstep: 4619.48 | bwd_allreduce_microstep: 4.65 | step_microstep: 42.57
-[2025-01-25 17:36:09,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.78 | bwd: 4624.23 | bwd_inner: 4619.48 | bwd_allreduce: 4.68 | step: 42.58
- 55%|█████▍    | 3161/5800 [8:49:39<5:03:25,  6.90s/it]                                                       {'loss': 0.0151, 'grad_norm': 1.388834834098816, 'learning_rate': 1.805980294290704e-05, 'epoch': 27.25}
- 55%|█████▍    | 3161/5800 [8:49:39<5:03:25,  6.90s/it]score1 tensor([[0.4062],
-        [0.5430],
-        [0.5469],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5039, 0.5352, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:16,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 17:36:16,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.53 | bwd_microstep: 4629.35 | bwd_inner_microstep: 4624.03 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.94
-[2025-01-25 17:36:16,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4629.38 | bwd_inner: 4624.03 | bwd_allreduce: 5.27 | step: 43.94
- 55%|█████▍    | 3162/5800 [8:49:46<5:03:29,  6.90s/it]                                                       {'loss': 0.0166, 'grad_norm': 8.385160446166992, 'learning_rate': 1.8048687799536113e-05, 'epoch': 27.26}
- 55%|█████▍    | 3162/5800 [8:49:46<5:03:29,  6.90s/it]score1 tensor([[0.5391],
-        [0.3750],
-        [0.5664],
-        [0.1846]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.3652, 0.5625, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:23,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.37
-[2025-01-25 17:36:23,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.96 | bwd_microstep: 4632.04 | bwd_inner_microstep: 4627.04 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.78
-[2025-01-25 17:36:23,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.92 | bwd: 4632.07 | bwd_inner: 4627.04 | bwd_allreduce: 4.94 | step: 44.79
- 55%|█████▍    | 3163/5800 [8:49:53<5:03:41,  6.91s/it]                                                       {'loss': 0.0059, 'grad_norm': 7.058482646942139, 'learning_rate': 1.803757326461818e-05, 'epoch': 27.27}
- 55%|█████▍    | 3163/5800 [8:49:53<5:03:41,  6.91s/it]score1 tensor([[0.6055],
-        [0.4824],
-        [0.4570],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4980, 0.4648, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:30,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.37
-[2025-01-25 17:36:30,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.02 | bwd_microstep: 4627.16 | bwd_inner_microstep: 4621.60 | bwd_allreduce_microstep: 5.48 | step_microstep: 45.24
-[2025-01-25 17:36:30,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.98 | bwd: 4627.19 | bwd_inner: 4621.60 | bwd_allreduce: 5.52 | step: 45.25
- 55%|█████▍    | 3164/5800 [8:50:00<5:03:41,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 3.4960410594940186, 'learning_rate': 1.8026459341618964e-05, 'epoch': 27.28}
- 55%|█████▍    | 3164/5800 [8:50:00<5:03:41,  6.91s/it]score1 tensor([[0.4902],
-        [0.5547],
-        [0.6133],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5664, 0.6367, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:37,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.10 | optimizer_step: 4.36
-[2025-01-25 17:36:37,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.40 | bwd_microstep: 4629.51 | bwd_inner_microstep: 4624.71 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.94
-[2025-01-25 17:36:37,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.37 | bwd: 4629.53 | bwd_inner: 4624.71 | bwd_allreduce: 4.75 | step: 41.95
- 55%|█████▍    | 3165/5800 [8:50:07<5:03:33,  6.91s/it]                                                       {'loss': 0.0107, 'grad_norm': 0.7726607918739319, 'learning_rate': 1.8015346034003957e-05, 'epoch': 27.28}
- 55%|█████▍    | 3165/5800 [8:50:07<5:03:33,  6.91s/it]score1 tensor([[0.3086],
-        [0.6055],
-        [0.5781],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.6133, 0.5977, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:44,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 17:36:44,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.10 | bwd_microstep: 4579.78 | bwd_inner_microstep: 4574.45 | bwd_allreduce_microstep: 5.25 | step_microstep: 45.99
-[2025-01-25 17:36:44,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.06 | bwd: 4579.80 | bwd_inner: 4574.45 | bwd_allreduce: 5.29 | step: 46.00
- 55%|█████▍    | 3166/5800 [8:50:14<5:02:48,  6.90s/it]                                                       {'loss': 0.0117, 'grad_norm': 2.3405649662017822, 'learning_rate': 1.8004233345238503e-05, 'epoch': 27.29}
- 55%|█████▍    | 3166/5800 [8:50:14<5:02:48,  6.90s/it]score1 tensor([[0.4199],
-        [0.5508],
-        [0.5117],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5273, 0.4980, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:50,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 17:36:50,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.03 | bwd_microstep: 4630.49 | bwd_inner_microstep: 4625.23 | bwd_allreduce_microstep: 5.15 | step_microstep: 45.18
-[2025-01-25 17:36:50,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.99 | bwd: 4630.51 | bwd_inner: 4625.23 | bwd_allreduce: 5.21 | step: 45.18
- 55%|█████▍    | 3167/5800 [8:50:20<5:02:54,  6.90s/it]                                                       {'loss': 0.0127, 'grad_norm': 4.211531162261963, 'learning_rate': 1.7993121278787714e-05, 'epoch': 27.3}
- 55%|█████▍    | 3167/5800 [8:50:20<5:02:54,  6.90s/it]score1 tensor([[0.3652],
-        [0.3828],
-        [0.5586],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.3750, 0.5586, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:36:57,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 17:36:57,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.26 | bwd_microstep: 4585.89 | bwd_inner_microstep: 4580.94 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.44
-[2025-01-25 17:36:57,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.20 | bwd: 4585.91 | bwd_inner: 4580.94 | bwd_allreduce: 4.90 | step: 42.44
- 55%|█████▍    | 3168/5800 [8:50:27<5:02:24,  6.89s/it]                                                       {'loss': 0.0088, 'grad_norm': 1.8442294597625732, 'learning_rate': 1.798200983811654e-05, 'epoch': 27.31}
- 55%|█████▍    | 3168/5800 [8:50:27<5:02:24,  6.89s/it]score1 tensor([[0.5430],
-        [0.6484],
-        [0.5625],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6367, 0.5312, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:04,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 17:37:04,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.15 | bwd_microstep: 4628.46 | bwd_inner_microstep: 4623.55 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.01
-[2025-01-25 17:37:04,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.11 | bwd: 4628.52 | bwd_inner: 4623.55 | bwd_allreduce: 4.86 | step: 43.01
- 55%|█████▍    | 3169/5800 [8:50:34<5:02:29,  6.90s/it]                                                       {'loss': 0.0205, 'grad_norm': 8.683903694152832, 'learning_rate': 1.797089902668971e-05, 'epoch': 27.32}
- 55%|█████▍    | 3169/5800 [8:50:34<5:02:29,  6.90s/it]score1 tensor([[0.5469],
-        [0.5000],
-        [0.5078],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4980, 0.5273, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:11,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 17:37:11,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4627.92 | bwd_inner_microstep: 4622.88 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.71
-[2025-01-25 17:37:11,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4627.94 | bwd_inner: 4622.89 | bwd_allreduce: 4.98 | step: 44.71
- 55%|█████���    | 3170/5800 [8:50:41<5:02:39,  6.90s/it]                                                       {'loss': 0.0073, 'grad_norm': 0.4160698652267456, 'learning_rate': 1.7959788847971772e-05, 'epoch': 27.33}
- 55%|█████▍    | 3170/5800 [8:50:41<5:02:39,  6.90s/it]score1 tensor([[0.5234],
-        [0.4375],
-        [0.4531],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4375, 0.4395, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:18,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 17:37:18,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.36 | bwd_microstep: 4580.72 | bwd_inner_microstep: 4575.43 | bwd_allreduce_microstep: 5.20 | step_microstep: 44.51
-[2025-01-25 17:37:18,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.28 | bwd: 4580.74 | bwd_inner: 4575.43 | bwd_allreduce: 5.25 | step: 44.52
- 55%|█████▍    | 3171/5800 [8:50:48<5:01:54,  6.89s/it]                                                       {'loss': 0.0132, 'grad_norm': 1.9213720560073853, 'learning_rate': 1.7948679305427073e-05, 'epoch': 27.34}
- 55%|█████▍    | 3171/5800 [8:50:48<5:01:54,  6.89s/it]score1 tensor([[0.6445],
-        [0.5078],
-        [0.5977],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5039, 0.6016, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:25,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 17:37:25,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.14 | bwd_microstep: 4624.53 | bwd_inner_microstep: 4619.68 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.25
-[2025-01-25 17:37:25,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.11 | bwd: 4624.55 | bwd_inner: 4619.68 | bwd_allreduce: 4.81 | step: 43.26
- 55%|█████▍    | 3172/5800 [8:50:55<5:01:57,  6.89s/it]                                                       {'loss': 0.0107, 'grad_norm': 4.403805732727051, 'learning_rate': 1.793757040251976e-05, 'epoch': 27.34}
- 55%|█████▍    | 3172/5800 [8:50:55<5:01:57,  6.89s/it]score1 tensor([[0.6172],
-        [0.6562],
-        [0.4941],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6445, 0.5156, 0.3613], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:32,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:37:32,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4629.87 | bwd_inner_microstep: 4624.98 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.61
-[2025-01-25 17:37:32,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4629.90 | bwd_inner: 4624.98 | bwd_allreduce: 4.85 | step: 44.61
- 55%|█████▍    | 3173/5800 [8:51:02<5:02:06,  6.90s/it]                                                       {'loss': 0.0161, 'grad_norm': 1.4193484783172607, 'learning_rate': 1.7926462142713777e-05, 'epoch': 27.35}
- 55%|█████▍    | 3173/5800 [8:51:02<5:02:06,  6.90s/it]score1 tensor([[0.5273],
-        [0.3555],
-        [0.4004],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.3926, 0.4238, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:39,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 17:37:39,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.76 | bwd_microstep: 4629.83 | bwd_inner_microstep: 4624.88 | bwd_allreduce_microstep: 4.87 | step_microstep: 46.41
-[2025-01-25 17:37:39,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.72 | bwd: 4629.85 | bwd_inner: 4624.88 | bwd_allreduce: 4.91 | step: 46.42
- 55%|█████▍    | 3174/5800 [8:51:09<5:02:08,  6.90s/it]                                                       {'loss': 0.0269, 'grad_norm': 3.64786696434021, 'learning_rate': 1.7915354529472884e-05, 'epoch': 27.36}
- 55%|█████▍    | 3174/5800 [8:51:09<5:02:08,  6.90s/it]score1 tensor([[0.4180],
-        [0.6172],
-        [0.4512],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.6445, 0.4648, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:46,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 17:37:46,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.33 | bwd_microstep: 4635.20 | bwd_inner_microstep: 4630.32 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.18
-[2025-01-25 17:37:46,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.29 | bwd: 4635.23 | bwd_inner: 4630.32 | bwd_allreduce: 4.84 | step: 46.20
- 55%|█████▍    | 3175/5800 [8:51:16<5:02:22,  6.91s/it]                                                       {'loss': 0.0151, 'grad_norm': 8.330493927001953, 'learning_rate': 1.7904247566260616e-05, 'epoch': 27.37}
- 55%|█████▍    | 3175/5800 [8:51:16<5:02:22,  6.91s/it]score1 tensor([[0.5508],
-        [0.6367],
-        [0.4785],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6445, 0.5039, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:53,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.36
-[2025-01-25 17:37:53,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4578.65 | bwd_inner_microstep: 4573.68 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.48
-[2025-01-25 17:37:53,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4578.67 | bwd_inner: 4573.68 | bwd_allreduce: 4.93 | step: 43.49
- 55%|█████▍    | 3176/5800 [8:51:23<5:01:37,  6.90s/it]                                                       {'loss': 0.019, 'grad_norm': 6.203238487243652, 'learning_rate': 1.7893141256540325e-05, 'epoch': 27.38}
- 55%|█████▍    | 3176/5800 [8:51:23<5:01:37,  6.90s/it]score1 tensor([[0.5703],
-        [0.4414],
-        [0.4668],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4512, 0.4961, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:37:59,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 17:37:59,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.19 | bwd_microstep: 4629.73 | bwd_inner_microstep: 4624.51 | bwd_allreduce_microstep: 5.11 | step_microstep: 41.75
-[2025-01-25 17:37:59,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.15 | bwd: 4629.75 | bwd_inner: 4624.51 | bwd_allreduce: 5.17 | step: 41.76
- 55%|█████▍    | 3177/5800 [8:51:29<5:01:46,  6.90s/it]                                                       {'loss': 0.0176, 'grad_norm': 3.828327178955078, 'learning_rate': 1.7882035603775142e-05, 'epoch': 27.39}
- 55%|█████▍    | 3177/5800 [8:51:29<5:01:46,  6.90s/it]score1 tensor([[0.5898],
-        [0.4629],
-        [0.3145],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4746, 0.3457, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:06,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 17:38:06,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.67 | bwd_microstep: 4591.28 | bwd_inner_microstep: 4586.52 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.94
-[2025-01-25 17:38:06,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.64 | bwd: 4591.30 | bwd_inner: 4586.52 | bwd_allreduce: 4.72 | step: 42.95
- 55%|█████▍    | 3178/5800 [8:51:36<5:01:11,  6.89s/it]                                                       {'loss': 0.0127, 'grad_norm': 1.4754905700683594, 'learning_rate': 1.7870930611428013e-05, 'epoch': 27.4}
- 55%|█████▍    | 3178/5800 [8:51:36<5:01:11,  6.89s/it]score1 tensor([[0.5117],
-        [0.4004],
-        [0.6641],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4004, 0.6328, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:13,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 17:38:13,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.64 | bwd_microstep: 4582.45 | bwd_inner_microstep: 4577.70 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.54
-[2025-01-25 17:38:13,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.61 | bwd: 4582.47 | bwd_inner: 4577.70 | bwd_allreduce: 4.68 | step: 43.55
- 55%|█████▍    | 3179/5800 [8:51:43<5:00:44,  6.88s/it]                                                       {'loss': 0.0142, 'grad_norm': 2.5636940002441406, 'learning_rate': 1.7859826282961655e-05, 'epoch': 27.41}
- 55%|█████▍    | 3179/5800 [8:51:43<5:00:44,  6.88s/it]score1 tensor([[0.6367],
-        [0.6602],
-        [0.6484],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6211, 0.6406, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:20,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 17:38:20,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.10 | bwd_microstep: 4635.05 | bwd_inner_microstep: 4630.15 | bwd_allreduce_microstep: 4.80 | step_microstep: 45.50
-[2025-01-25 17:38:20,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.06 | bwd: 4635.08 | bwd_inner: 4630.15 | bwd_allreduce: 4.86 | step: 45.51
- 55%|█████▍    | 3180/5800 [8:51:50<5:01:12,  6.90s/it]                                                       {'loss': 0.021, 'grad_norm': 9.18017578125, 'learning_rate': 1.7848722621838607e-05, 'epoch': 27.41}
- 55%|█████▍    | 3180/5800 [8:51:50<5:01:12,  6.90s/it]score1 tensor([[0.4531],
-        [0.5273],
-        [0.5859],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4941, 0.5391, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:27,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:38:27,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.83 | bwd_microstep: 4587.37 | bwd_inner_microstep: 4582.68 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.39
-[2025-01-25 17:38:27,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.78 | bwd: 4587.39 | bwd_inner: 4582.68 | bwd_allreduce: 4.65 | step: 42.40
- 55%|█████▍    | 3181/5800 [8:51:57<5:00:41,  6.89s/it]                                                       {'loss': 0.0205, 'grad_norm': 2.487717390060425, 'learning_rate': 1.783761963152117e-05, 'epoch': 27.42}
- 55%|█████▍    | 3181/5800 [8:51:57<5:00:41,  6.89s/it]score1 tensor([[0.4551],
-        [0.5547],
-        [0.4707],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5391, 0.4512, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:34,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 17:38:34,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.40 | bwd_microstep: 4592.30 | bwd_inner_microstep: 4587.37 | bwd_allreduce_microstep: 4.82 | step_microstep: 45.56
-[2025-01-25 17:38:34,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.37 | bwd: 4592.32 | bwd_inner: 4587.37 | bwd_allreduce: 4.88 | step: 45.57
- 55%|█████▍    | 3182/5800 [8:52:04<5:00:22,  6.88s/it]                                                       {'loss': 0.0132, 'grad_norm': 2.227992534637451, 'learning_rate': 1.7826517315471447e-05, 'epoch': 27.43}
- 55%|█████▍    | 3182/5800 [8:52:04<5:00:22,  6.88s/it]score1 tensor([[0.5781],
-        [0.4961],
-        [0.6367],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4863, 0.6094, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:41,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 17:38:41,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.93 | bwd_microstep: 4636.99 | bwd_inner_microstep: 4632.29 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.50
-[2025-01-25 17:38:41,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.88 | bwd: 4637.01 | bwd_inner: 4632.29 | bwd_allreduce: 4.66 | step: 43.51
- 55%|█████▍    | 3183/5800 [8:52:11<5:00:43,  6.89s/it]                                                       {'loss': 0.0195, 'grad_norm': 8.318100929260254, 'learning_rate': 1.7815415677151345e-05, 'epoch': 27.44}
- 55%|█████��    | 3183/5800 [8:52:11<5:00:43,  6.89s/it]score1 tensor([[0.3750],
-        [0.5781],
-        [0.5664],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.5898, 0.5781, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:48,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 17:38:48,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.59 | bwd_microstep: 4631.83 | bwd_inner_microstep: 4626.83 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.43
-[2025-01-25 17:38:48,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4631.85 | bwd_inner: 4626.83 | bwd_allreduce: 4.94 | step: 43.44
- 55%|█████▍    | 3184/5800 [8:52:18<5:00:51,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 1.1305036544799805, 'learning_rate': 1.7804314720022532e-05, 'epoch': 27.45}
- 55%|█████▍    | 3184/5800 [8:52:18<5:00:51,  6.90s/it]score1 tensor([[0.4297],
-        [0.4766],
-        [0.5586],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4551, 0.5742, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:38:55,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 17:38:55,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.99 | bwd_microstep: 4629.38 | bwd_inner_microstep: 4624.55 | bwd_allreduce_microstep: 4.74 | step_microstep: 49.20
-[2025-01-25 17:38:55,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.95 | bwd: 4629.41 | bwd_inner: 4624.55 | bwd_allreduce: 4.78 | step: 49.20
- 55%|█████▍    | 3185/5800 [8:52:25<5:01:01,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 4.0721516609191895, 'learning_rate': 1.77932144475465e-05, 'epoch': 27.46}
- 55%|█████▍    | 3185/5800 [8:52:25<5:01:01,  6.91s/it]score1 tensor([[0.4414],
-        [0.5117],
-        [0.4570],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4941, 0.4766, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:02,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.37
-[2025-01-25 17:39:02,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.41 | bwd_microstep: 4633.35 | bwd_inner_microstep: 4628.55 | bwd_allreduce_microstep: 4.73 | step_microstep: 41.96
-[2025-01-25 17:39:02,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.36 | bwd: 4633.37 | bwd_inner: 4628.55 | bwd_allreduce: 4.77 | step: 41.96
- 55%|█████▍    | 3186/5800 [8:52:32<5:01:03,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 3.672492265701294, 'learning_rate': 1.7782114863184485e-05, 'epoch': 27.47}
- 55%|█████▍    | 3186/5800 [8:52:32<5:01:03,  6.91s/it]score1 tensor([[0.6484],
-        [0.4531],
-        [0.4746],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4648, 0.4844, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:08,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.39 | optimizer_step: 4.37
-[2025-01-25 17:39:08,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.83 | bwd_microstep: 4630.91 | bwd_inner_microstep: 4626.33 | bwd_allreduce_microstep: 4.50 | step_microstep: 44.58
-[2025-01-25 17:39:08,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.79 | bwd: 4630.93 | bwd_inner: 4626.33 | bwd_allreduce: 4.54 | step: 44.59
- 55%|█████▍    | 3187/5800 [8:52:38<5:01:07,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 8.469779014587402, 'learning_rate': 1.7771015970397544e-05, 'epoch': 27.47}
- 55%|█████▍    | 3187/5800 [8:52:38<5:01:07,  6.91s/it]score1 tensor([[0.5430],
-        [0.4512],
-        [0.5586],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4766, 0.5469, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:15,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 17:39:15,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.56 | bwd_microstep: 4639.67 | bwd_inner_microstep: 4631.36 | bwd_allreduce_microstep: 8.17 | step_microstep: 41.55
-[2025-01-25 17:39:15,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.52 | bwd: 4639.69 | bwd_inner: 4631.36 | bwd_allreduce: 8.24 | step: 41.57
- 55%|█████▍    | 3188/5800 [8:52:45<5:01:08,  6.92s/it]                                                       {'loss': 0.019, 'grad_norm': 3.8751325607299805, 'learning_rate': 1.7759917772646496e-05, 'epoch': 27.48}
- 55%|█████▍    | 3188/5800 [8:52:45<5:01:08,  6.92s/it]score1 tensor([[0.5469],
-        [0.5312],
-        [0.4980],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5352, 0.5391, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:22,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 17:39:22,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.41 | bwd_microstep: 4570.83 | bwd_inner_microstep: 4565.87 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.87
-[2025-01-25 17:39:22,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.37 | bwd: 4570.87 | bwd_inner: 4565.87 | bwd_allreduce: 4.90 | step: 43.90
- 55%|█████▍    | 3189/5800 [8:52:52<5:00:18,  6.90s/it]                                                       {'loss': 0.0142, 'grad_norm': 6.038167476654053, 'learning_rate': 1.7748820273391956e-05, 'epoch': 27.49}
- 55%|█████▍    | 3189/5800 [8:52:52<5:00:18,  6.90s/it]score1 tensor([[0.5508],
-        [0.5508],
-        [0.4453],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5547, 0.4512, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:29,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.55 | optimizer_step: 4.37
-[2025-01-25 17:39:29,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4625.33 | bwd_inner_microstep: 4619.75 | bwd_allreduce_microstep: 5.47 | step_microstep: 45.53
-[2025-01-25 17:39:29,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4625.35 | bwd_inner: 4619.75 | bwd_allreduce: 5.53 | step: 45.54
- 55%|█████▌    | 3190/5800 [8:52:59<5:00:23,  6.91s/it]                                                       {'loss': 0.0112, 'grad_norm': 8.321187019348145, 'learning_rate': 1.7737723476094317e-05, 'epoch': 27.5}
- 55%|█████▌    | 3190/5800 [8:52:59<5:00:23,  6.91s/it]score1 tensor([[0.4395],
-        [0.3672],
-        [0.5156],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.3711, 0.5156, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:36,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 17:39:36,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.56 | bwd_microstep: 4576.33 | bwd_inner_microstep: 4571.75 | bwd_allreduce_microstep: 4.49 | step_microstep: 46.27
-[2025-01-25 17:39:36,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.51 | bwd: 4576.35 | bwd_inner: 4571.76 | bwd_allreduce: 4.53 | step: 46.28
- 55%|█████▌    | 3191/5800 [8:53:06<4:59:43,  6.89s/it]                                                       {'loss': 0.0151, 'grad_norm': 1.4686511754989624, 'learning_rate': 1.772662738421375e-05, 'epoch': 27.51}
- 55%|█████▌    | 3191/5800 [8:53:06<4:59:43,  6.89s/it]score1 tensor([[0.6914],
-        [0.5508],
-        [0.5156],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.5469, 0.5000, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:43,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 17:39:43,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.73 | bwd_microstep: 4635.78 | bwd_inner_microstep: 4631.08 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.11
-[2025-01-25 17:39:43,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.67 | bwd: 4635.81 | bwd_inner: 4631.08 | bwd_allreduce: 4.67 | step: 42.12
- 55%|████��▌    | 3192/5800 [8:53:13<4:59:58,  6.90s/it]                                                       {'loss': 0.0137, 'grad_norm': 8.824210166931152, 'learning_rate': 1.7715532001210214e-05, 'epoch': 27.52}
- 55%|█████▌    | 3192/5800 [8:53:13<4:59:58,  6.90s/it]score1 tensor([[0.5312],
-        [0.6562],
-        [0.4863],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6523, 0.4492, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:50,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 17:39:50,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.17 | bwd_microstep: 4626.73 | bwd_inner_microstep: 4621.36 | bwd_allreduce_microstep: 5.29 | step_microstep: 42.11
-[2025-01-25 17:39:50,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.14 | bwd: 4626.76 | bwd_inner: 4621.36 | bwd_allreduce: 5.33 | step: 42.12
- 55%|█████▌    | 3193/5800 [8:53:20<4:59:59,  6.90s/it]                                                       {'loss': 0.0195, 'grad_norm': 8.416336059570312, 'learning_rate': 1.770443733054344e-05, 'epoch': 27.53}
- 55%|█████▌    | 3193/5800 [8:53:20<4:59:59,  6.90s/it]score1 tensor([[0.5273],
-        [0.6055],
-        [0.6172],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6094, 0.5781, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:39:57,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 17:39:57,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.37 | bwd_microstep: 4626.06 | bwd_inner_microstep: 4621.28 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.76
-[2025-01-25 17:39:57,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.33 | bwd: 4626.08 | bwd_inner: 4621.28 | bwd_allreduce: 4.74 | step: 43.78
- 55%|█████▌    | 3194/5800 [8:53:27<5:00:00,  6.91s/it]                                                       {'loss': 0.0166, 'grad_norm': 4.064288139343262, 'learning_rate': 1.7693343375672932e-05, 'epoch': 27.53}
- 55%|█████▌    | 3194/5800 [8:53:27<5:00:00,  6.91s/it]score1 tensor([[0.3906],
-        [0.5625],
-        [0.4805],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.5586, 0.4688, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:04,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.37
-[2025-01-25 17:40:04,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.03 | bwd_microstep: 4631.18 | bwd_inner_microstep: 4626.37 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.16
-[2025-01-25 17:40:04,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.99 | bwd: 4631.20 | bwd_inner: 4626.37 | bwd_allreduce: 4.76 | step: 42.17
- 55%|█████▌    | 3195/5800 [8:53:34<5:00:03,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 7.828789234161377, 'learning_rate': 1.7682250140057994e-05, 'epoch': 27.54}
- 55%|█████▌    | 3195/5800 [8:53:34<5:00:03,  6.91s/it]score1 tensor([[0.4648],
-        [0.5391],
-        [0.4707],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5430, 0.4453, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:11,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:40:11,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.70 | bwd_microstep: 4629.87 | bwd_inner_microstep: 4625.11 | bwd_allreduce_microstep: 4.67 | step_microstep: 44.22
-[2025-01-25 17:40:11,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.68 | bwd: 4629.89 | bwd_inner: 4625.11 | bwd_allreduce: 4.71 | step: 44.23
- 55%|█████▌    | 3196/5800 [8:53:41<4:59:57,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.5294057726860046, 'learning_rate': 1.767115762715767e-05, 'epoch': 27.55}
- 55%|█████▌    | 3196/5800 [8:53:41<4:59:57,  6.91s/it]score1 tensor([[0.5625],
-        [0.6211],
-        [0.5234],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6094, 0.5156, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:18,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 17:40:18,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.33 | bwd_microstep: 4636.10 | bwd_inner_microstep: 4630.85 | bwd_allreduce_microstep: 5.15 | step_microstep: 45.57
-[2025-01-25 17:40:18,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.29 | bwd: 4636.14 | bwd_inner: 4630.85 | bwd_allreduce: 5.19 | step: 45.57
- 55%|█████▌    | 3197/5800 [8:53:47<4:59:57,  6.91s/it]                                                       {'loss': 0.0107, 'grad_norm': 8.451472282409668, 'learning_rate': 1.7660065840430808e-05, 'epoch': 27.56}
- 55%|█████▌    | 3197/5800 [8:53:48<4:59:57,  6.91s/it]score1 tensor([[0.4785],
-        [0.4707],
-        [0.3887],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4375, 0.3672, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:24,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 17:40:24,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4637.93 | bwd_inner_microstep: 4633.18 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.69
-[2025-01-25 17:40:24,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.92 | bwd: 4637.95 | bwd_inner: 4633.18 | bwd_allreduce: 4.71 | step: 43.70
- 55%|█████▌    | 3198/5800 [8:53:54<4:59:58,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 3.9313418865203857, 'learning_rate': 1.7648974783336014e-05, 'epoch': 27.57}
- 55%|█████▌    | 3198/5800 [8:53:54<4:59:58,  6.92s/it]score1 tensor([[0.5039],
-        [0.3652],
-        [0.5039],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3516, 0.4805, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:31,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 17:40:31,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.23 | bwd_microstep: 4630.76 | bwd_inner_microstep: 4625.83 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.43
-[2025-01-25 17:40:31,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.19 | bwd: 4630.78 | bwd_inner: 4625.82 | bwd_allreduce: 4.89 | step: 42.44
- 55%|█████▌    | 3199/5800 [8:54:01<4:59:47,  6.92s/it]                                                       {'loss': 0.0176, 'grad_norm': 7.838746070861816, 'learning_rate': 1.7637884459331685e-05, 'epoch': 27.58}
- 55%|█████▌    | 3199/5800 [8:54:01<4:59:47,  6.92s/it]score1 tensor([[0.5703],
-        [0.4375],
-        [0.5234],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4512, 0.5156, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:38,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 17:40:38,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.08 | bwd_microstep: 4634.97 | bwd_inner_microstep: 4630.15 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.39
-[2025-01-25 17:40:38,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.06 | bwd: 4635.00 | bwd_inner: 4630.14 | bwd_allreduce: 4.78 | step: 42.38
- 55%|█████▌    | 3200/5800 [8:54:08<4:59:44,  6.92s/it]                                                       {'loss': 0.0083, 'grad_norm': 3.890307903289795, 'learning_rate': 1.762679487187597e-05, 'epoch': 27.59}
- 55%|█████▌    | 3200/5800 [8:54:08<4:59:44,  6.92s/it]score1 tensor([[0.4824],
-        [0.4512],
-        [0.4434],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4160, 0.4551, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:45,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 17:40:45,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.17 | bwd_microstep: 4628.76 | bwd_inner_microstep: 4624.09 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.28
-[2025-01-25 17:40:45,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.14 | bwd: 4628.78 | bwd_inner: 4624.09 | bwd_allreduce: 4.62 | step: 42.30
- 55%|█████▌    | 3201/5800 [8:54:15<4:59:32,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 0.34415850043296814, 'learning_rate': 1.7615706024426785e-05, 'epoch': 27.59}
- 55%|█████▌    | 3201/5800 [8:54:15<4:59:32,  6.92s/it]score1 tensor([[0.4258],
-        [0.4336],
-        [0.4863],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4336, 0.4941, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:52,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:40:52,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.10 | bwd_microstep: 4581.68 | bwd_inner_microstep: 4576.88 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.15
-[2025-01-25 17:40:52,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4581.71 | bwd_inner: 4576.88 | bwd_allreduce: 4.76 | step: 42.16
- 55%|█████▌    | 3202/5800 [8:54:22<4:58:44,  6.90s/it]                                                       {'loss': 0.0122, 'grad_norm': 1.8004791736602783, 'learning_rate': 1.7604617920441834e-05, 'epoch': 27.6}
- 55%|█████▌    | 3202/5800 [8:54:22<4:58:44,  6.90s/it]score1 tensor([[0.5117],
-        [0.4453],
-        [0.6211],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4688, 0.6172, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:40:59,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 17:40:59,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.68 | bwd_microstep: 4630.12 | bwd_inner_microstep: 4624.89 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.39
-[2025-01-25 17:40:59,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.63 | bwd: 4630.14 | bwd_inner: 4624.89 | bwd_allreduce: 5.17 | step: 43.41
- 55%|█████▌    | 3203/5800 [8:54:29<4:58:52,  6.90s/it]                                                       {'loss': 0.0107, 'grad_norm': 0.5847815871238708, 'learning_rate': 1.7593530563378577e-05, 'epoch': 27.61}
- 55%|█████▌    | 3203/5800 [8:54:29<4:58:52,  6.90s/it]score1 tensor([[0.4648],
-        [0.4082],
-        [0.4941],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3262, 0.5273, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:06,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 17:41:06,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.03 | bwd_microstep: 4628.61 | bwd_inner_microstep: 4623.94 | bwd_allreduce_microstep: 4.57 | step_microstep: 48.06
-[2025-01-25 17:41:06,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.00 | bwd: 4628.64 | bwd_inner: 4623.94 | bwd_allreduce: 4.62 | step: 48.07
- 55%|█████▌    | 3204/5800 [8:54:36<4:58:50,  6.91s/it]                                                       {'loss': 0.0459, 'grad_norm': 4.158842086791992, 'learning_rate': 1.758244395669425e-05, 'epoch': 27.62}
- 55%|█████▌    | 3204/5800 [8:54:36<4:58:50,  6.91s/it]score1 tensor([[0.4883],
-        [0.4277],
-        [0.4141],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4473, 0.4277, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:13,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 17:41:13,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.27 | bwd_microstep: 4635.82 | bwd_inner_microstep: 4631.01 | bwd_allreduce_microstep: 4.71 | step_microstep: 45.85
-[2025-01-25 17:41:13,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.24 | bwd: 4635.84 | bwd_inner: 4631.01 | bwd_allreduce: 4.76 | step: 45.85
- 55%|█████▌    | 3205/5800 [8:54:43<4:58:50,  6.91s/it]                                                       {'loss': 0.0132, 'grad_norm': 7.639847278594971, 'learning_rate': 1.7571358103845847e-05, 'epoch': 27.63}
- 55%|█████▌    | 3205/5800 [8:54:43<4:58:50,  6.91s/it]score1 tensor([[0.4824],
-        [0.4141],
-        [0.4355],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.3887, 0.4648, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:20,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 17:41:20,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.19 | bwd_microstep: 4627.45 | bwd_inner_microstep: 4622.95 | bwd_allreduce_microstep: 4.42 | step_microstep: 45.29
-[2025-01-25 17:41:20,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.16 | bwd: 4627.47 | bwd_inner: 4622.95 | bwd_allreduce: 4.45 | step: 45.31
- 55%|█████▌    | 3206/5800 [8:54:50<4:58:39,  6.91s/it]                                                       {'loss': 0.0249, 'grad_norm': 0.43981149792671204, 'learning_rate': 1.7560273008290126e-05, 'epoch': 27.64}
- 55%|█████▌    | 3206/5800 [8:54:50<4:58:39,  6.91s/it]score1 tensor([[0.4863],
-        [0.4941],
-        [0.4961],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.5039, 0.5352, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:27,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.36
-[2025-01-25 17:41:27,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4628.51 | bwd_inner_microstep: 4623.66 | bwd_allreduce_microstep: 4.75 | step_microstep: 46.05
-[2025-01-25 17:41:27,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.70 | bwd: 4628.54 | bwd_inner: 4623.66 | bwd_allreduce: 4.78 | step: 46.05
- 55%|█████▌    | 3207/5800 [8:54:57<4:58:35,  6.91s/it]                                                       {'loss': 0.0181, 'grad_norm': 7.840244770050049, 'learning_rate': 1.7549188673483613e-05, 'epoch': 27.65}
- 55%|█████▌    | 3207/5800 [8:54:57<4:58:35,  6.91s/it]score1 tensor([[0.3496],
-        [0.4141],
-        [0.5430],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4258, 0.5352, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:34,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 17:41:34,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4633.21 | bwd_inner_microstep: 4628.62 | bwd_allreduce_microstep: 4.50 | step_microstep: 43.32
-[2025-01-25 17:41:34,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4633.23 | bwd_inner: 4628.62 | bwd_allreduce: 4.55 | step: 43.32
- 55%|█████▌    | 3208/5800 [8:55:04<4:58:29,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 3.5420382022857666, 'learning_rate': 1.7538105102882598e-05, 'epoch': 27.66}
- 55%|█████▌    | 3208/5800 [8:55:04<4:58:29,  6.91s/it]score1 tensor([[0.5117],
-        [0.4512],
-        [0.4023],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4629, 0.4141, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:40,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 17:41:40,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.30 | bwd_microstep: 4637.87 | bwd_inner_microstep: 4633.03 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.12
-[2025-01-25 17:41:40,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.23 | bwd: 4637.89 | bwd_inner: 4633.03 | bwd_allreduce: 4.80 | step: 42.13
- 55%|█████▌    | 3209/5800 [8:55:10<4:58:29,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.5898022651672363, 'learning_rate': 1.7527022299943125e-05, 'epoch': 27.66}
- 55%|█████▌    | 3209/5800 [8:55:10<4:58:29,  6.91s/it]score1 tensor([[0.5234],
-        [0.4746],
-        [0.4492],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4844, 0.4609, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:47,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 17:41:47,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.38 | bwd_microstep: 4639.15 | bwd_inner_microstep: 4634.05 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.24
-[2025-01-25 17:41:47,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.34 | bwd: 4639.18 | bwd_inner: 4634.05 | bwd_allreduce: 5.04 | step: 42.25
- 55%|█████▌    | 3210/5800 [8:55:17<4:58:28,  6.91s/it]                                                       {'loss': 0.0303, 'grad_norm': 7.949809551239014, 'learning_rate': 1.7515940268121014e-05, 'epoch': 27.67}
- 55%|█████▌    | 3210/5800 [8:55:17<4:58:28,  6.91s/it]score1 tensor([[0.6250],
-        [0.5977],
-        [0.4180],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.6172, 0.4180, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:41:54,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 17:41:54,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.83 | bwd_microstep: 4581.35 | bwd_inner_microstep: 4576.25 | bwd_allreduce_microstep: 5.02 | step_microstep: 47.04
-[2025-01-25 17:41:54,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.78 | bwd: 4581.38 | bwd_inner: 4576.25 | bwd_allreduce: 5.06 | step: 47.05
- 55%|█████▌    | 3211/5800 [8:55:24<4:57:40,  6.90s/it]                                                       {'loss': 0.0151, 'grad_norm': 6.549032211303711, 'learning_rate': 1.7504859010871827e-05, 'epoch': 27.68}
- 55%|█████▌    | 3211/5800 [8:55:24<4:57:40,  6.90s/it]score1 tensor([[0.5469],
-        [0.6250],
-        [0.5039],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6484, 0.4980, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:01,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:42:01,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4633.68 | bwd_inner_microstep: 4628.98 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.68
-[2025-01-25 17:42:01,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4633.70 | bwd_inner: 4628.98 | bwd_allreduce: 4.66 | step: 43.69
- 55%|█████▌    | 3212/5800 [8:55:31<4:57:41,  6.90s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.049030780792236, 'learning_rate': 1.74937785316509e-05, 'epoch': 27.69}
- 55%|█████▌    | 3212/5800 [8:55:31<4:57:41,  6.90s/it]score1 tensor([[0.4043],
-        [0.6250],
-        [0.5703],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.6445, 0.5977, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:08,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 17:42:08,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.05 | bwd_microstep: 4578.39 | bwd_inner_microstep: 4573.86 | bwd_allreduce_microstep: 4.46 | step_microstep: 42.12
-[2025-01-25 17:42:08,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.02 | bwd: 4578.41 | bwd_inner: 4573.86 | bwd_allreduce: 4.50 | step: 42.13
- 55%|█████▌    | 3213/5800 [8:55:38<4:57:00,  6.89s/it]                                                       {'loss': 0.0166, 'grad_norm': 2.5027363300323486, 'learning_rate': 1.7482698833913304e-05, 'epoch': 27.7}
- 55%|█████▌    | 3213/5800 [8:55:38<4:57:00,  6.89s/it]score1 tensor([[0.4102],
-        [0.5117],
-        [0.6289],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.4707, 0.6562, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0298, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:15,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.21 | optimizer_step: 4.36
-[2025-01-25 17:42:15,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.44 | bwd_microstep: 4641.16 | bwd_inner_microstep: 4636.55 | bwd_allreduce_microstep: 4.53 | step_microstep: 46.91
-[2025-01-25 17:42:15,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.40 | bwd: 4641.18 | bwd_inner: 4636.55 | bwd_allreduce: 4.56 | step: 46.92
- 55%|█████▌    | 3214/5800 [8:55:45<4:57:21,  6.90s/it]                                                       {'loss': 0.0298, 'grad_norm': 3.7064871788024902, 'learning_rate': 1.74716199211139e-05, 'epoch': 27.71}
- 55%|█████▌    | 3214/5800 [8:55:45<4:57:21,  6.90s/it]score1 tensor([[0.5781],
-        [0.4492],
-        [0.3926],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4043, 0.3730, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:22,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 17:42:22,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.93 | bwd_microstep: 4634.58 | bwd_inner_microstep: 4629.91 | bwd_allreduce_microstep: 4.59 | step_microstep: 44.08
-[2025-01-25 17:42:22,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.90 | bwd: 4634.61 | bwd_inner: 4629.91 | bwd_allreduce: 4.63 | step: 44.08
- 55%|█████▌    | 3215/5800 [8:55:52<4:57:23,  6.90s/it]                                                       {'loss': 0.0234, 'grad_norm': 3.387141227722168, 'learning_rate': 1.746054179670727e-05, 'epoch': 27.72}
- 55%|█████▌    | 3215/5800 [8:55:52<4:57:23,  6.90s/it]score1 tensor([[0.5156],
-        [0.6328],
-        [0.3770],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6250, 0.3340, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:29,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 17:42:29,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.69 | bwd_microstep: 4636.85 | bwd_inner_microstep: 4631.99 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.47
-[2025-01-25 17:42:29,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.65 | bwd: 4636.87 | bwd_inner: 4631.99 | bwd_allreduce: 4.81 | step: 43.48
- 55%|█████▌    | 3216/5800 [8:55:59<4:57:26,  6.91s/it]                                                       {'loss': 0.0215, 'grad_norm': 8.189995765686035, 'learning_rate': 1.7449464464147774e-05, 'epoch': 27.72}
- 55%|█████▌    | 3216/5800 [8:55:59<4:57:26,  6.91s/it]score1 tensor([[0.5156],
-        [0.5703],
-        [0.5938],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5586, 0.5742, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:36,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 17:42:36,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.04 | bwd_microstep: 4636.03 | bwd_inner_microstep: 4630.79 | bwd_allreduce_microstep: 5.04 | step_microstep: 42.95
-[2025-01-25 17:42:36,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4636.05 | bwd_inner: 4630.79 | bwd_allreduce: 5.16 | step: 42.96
- 55%|█████▌    | 3217/5800 [8:56:06<4:57:28,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.800103187561035, 'learning_rate': 1.7438387926889516e-05, 'epoch': 27.73}
- 55%|█████▌    | 3217/5800 [8:56:06<4:57:28,  6.91s/it]score1 tensor([[0.5781],
-        [0.5195],
-        [0.5000],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4902, 0.4863, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:43,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.99 | optimizer_step: 4.37
-[2025-01-25 17:42:43,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4638.55 | bwd_inner_microstep: 4633.94 | bwd_allreduce_microstep: 4.51 | step_microstep: 44.49
-[2025-01-25 17:42:43,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4638.58 | bwd_inner: 4633.94 | bwd_allreduce: 4.55 | step: 44.50
- 55%|█████▌    | 3218/5800 [8:56:13<4:57:28,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 8.268899917602539, 'learning_rate': 1.7427312188386346e-05, 'epoch': 27.74}
- 55%|█████▌    | 3218/5800 [8:56:13<4:57:28,  6.91s/it]score1 tensor([[0.4805],
-        [0.6367],
-        [0.5469],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6445, 0.5391, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:49,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.36
-[2025-01-25 17:42:49,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.93 | bwd_microstep: 4633.94 | bwd_inner_microstep: 4629.12 | bwd_allreduce_microstep: 4.74 | step_microstep: 45.26
-[2025-01-25 17:42:49,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.89 | bwd: 4633.97 | bwd_inner: 4629.12 | bwd_allreduce: 4.78 | step: 45.27
- 56%|█████▌    | 3219/5800 [8:56:19<4:57:26,  6.91s/it]                                                       {'loss': 0.0151, 'grad_norm': 0.9962587356567383, 'learning_rate': 1.7416237252091878e-05, 'epoch': 27.75}
- 56%|█████▌    | 3219/5800 [8:56:19<4:57:26,  6.91s/it]score1 tensor([[0.5820],
-        [0.5625],
-        [0.5352],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6172, 0.5508, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:42:56,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 17:42:56,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.51 | bwd_microstep: 4639.06 | bwd_inner_microstep: 4633.77 | bwd_allreduce_microstep: 5.19 | step_microstep: 42.92
-[2025-01-25 17:42:56,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.44 | bwd: 4639.09 | bwd_inner: 4633.77 | bwd_allreduce: 5.24 | step: 42.93
- 56%|█████▌    | 3220/5800 [8:56:26<4:57:23,  6.92s/it]                                                       {'loss': 0.0293, 'grad_norm': 4.15285062789917, 'learning_rate': 1.7405163121459462e-05, 'epoch': 27.76}
- 56%|█████▌    | 3220/5800 [8:56:26<4:57:23,  6.92s/it]evaluate!
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6523]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0605, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6562]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6172]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6094]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6367]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6485523947675013
-PLCC_score: 0.6493642784663894
-KRCC_score: 0.4706132505138726
-SRCC_level: 0.6485523947675013
-PLCC_level: 0.6493642784663894
-KRCC_level: 0.4706132505138726
-score1 tensor([[0.4473],
-        [0.5391],
-        [0.6523],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5078, 0.6641, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0317, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:05,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 17:53:05,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.68 | bwd_microstep: 4595.42 | bwd_inner_microstep: 4590.31 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.32
-[2025-01-25 17:53:05,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.62 | bwd: 4595.44 | bwd_inner: 4590.31 | bwd_allreduce: 5.05 | step: 43.33
- 56%|█████▌    | 3221/5800 [9:06:35<134:21:25, 187.55s/it]                                                          {'loss': 0.0317, 'grad_norm': 0.6484151482582092, 'learning_rate': 1.7394089799942215e-05, 'epoch': 27.77}
- 56%|█████▌    | 3221/5800 [9:06:35<134:21:25, 187.55s/it]score1 tensor([[0.5195],
-        [0.4551],
-        [0.6289],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4492, 0.6172, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:12,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 17:53:12,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.68 | bwd_microstep: 4569.11 | bwd_inner_microstep: 4564.21 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.61
-[2025-01-25 17:53:12,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.63 | bwd: 4569.14 | bwd_inner: 4564.21 | bwd_allreduce: 4.83 | step: 43.62
- 56%|█████▌    | 3222/5800 [9:06:42<95:28:54, 133.33s/it]                                                          {'loss': 0.0142, 'grad_norm': 4.339907169342041, 'learning_rate': 1.7383017290992978e-05, 'epoch': 27.78}
- 56%|█████▌    | 3222/5800 [9:06:42<95:28:54, 133.33s/it]score1 tensor([[0.5586],
-        [0.5430],
-        [0.5195],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5234, 0.5195, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:19,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 17:53:19,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.24 | bwd_microstep: 4535.90 | bwd_inner_microstep: 4531.46 | bwd_allreduce_microstep: 4.36 | step_microstep: 42.57
-[2025-01-25 17:53:19,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.21 | bwd: 4535.92 | bwd_inner: 4531.46 | bwd_allreduce: 4.40 | step: 42.58
- 56%|█████▌    | 3223/5800 [9:06:49<68:16:04, 95.37s/it]                                                         {'loss': 0.0146, 'grad_norm': 6.408192157745361, 'learning_rate': 1.737194559806437e-05, 'epoch': 27.78}
- 56%|█████▌    | 3223/5800 [9:06:49<68:16:04, 95.37s/it]score1 tensor([[0.5820],
-        [0.5039],
-        [0.4199],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4941, 0.3984, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:26,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 17:53:26,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.83 | bwd_microstep: 4593.70 | bwd_inner_microstep: 4588.90 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.32
-[2025-01-25 17:53:26,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.78 | bwd: 4593.73 | bwd_inner: 4588.90 | bwd_allreduce: 4.76 | step: 42.33
- 56%|█████▌    | 3224/5800 [9:06:56<49:14:24, 68.81s/it]                                                        {'loss': 0.0186, 'grad_norm': 8.2799072265625, 'learning_rate': 1.7360874724608715e-05, 'epoch': 27.79}
- 56%|█████▌    | 3224/5800 [9:06:56<49:14:24, 68.81s/it]score1 tensor([[0.4492],
-        [0.5352],
-        [0.3867],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5625, 0.3652, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:33,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 17:53:33,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.01 | bwd_microstep: 4590.47 | bwd_inner_microstep: 4585.51 | bwd_allreduce_microstep: 4.87 | step_microstep: 48.02
-[2025-01-25 17:53:33,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.99 | bwd: 4590.50 | bwd_inner: 4585.51 | bwd_allreduce: 4.91 | step: 48.04
- 56%|█████▌    | 3225/5800 [9:07:03<35:55:41, 50.23s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.4430064558982849, 'learning_rate': 1.7349804674078115e-05, 'epoch': 27.8}
- 56%|█████▌    | 3225/5800 [9:07:03<35:55:41, 50.23s/it]score1 tensor([[0.5898],
-        [0.5195],
-        [0.5820],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5273, 0.6133, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:40,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 17:53:40,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.72 | bwd_microstep: 4598.76 | bwd_inner_microstep: 4593.99 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.41
-[2025-01-25 17:53:40,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.67 | bwd: 4598.78 | bwd_inner: 4593.99 | bwd_allreduce: 4.73 | step: 41.42
- 56%|█████▌    | 3226/5800 [9:07:10<26:36:45, 37.22s/it]                                                        {'loss': 0.0176, 'grad_norm': 8.546854019165039, 'learning_rate': 1.7338735449924406e-05, 'epoch': 27.81}
- 56%|█████▌    | 3226/5800 [9:07:10<26:36:45, 37.22s/it]score1 tensor([[0.3574],
-        [0.6211],
-        [0.3730],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.6484, 0.3809, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:47,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 17:53:47,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.62 | bwd_microstep: 4600.29 | bwd_inner_microstep: 4595.51 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.81
-[2025-01-25 17:53:47,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.59 | bwd: 4600.31 | bwd_inner: 4595.51 | bwd_allreduce: 4.74 | step: 42.83
- 56%|█████▌    | 3227/5800 [9:07:16<20:05:34, 28.11s/it]                                                        {'loss': 0.021, 'grad_norm': 7.717751979827881, 'learning_rate': 1.7327667055599154e-05, 'epoch': 27.82}
- 56%|█████▌    | 3227/5800 [9:07:16<20:05:34, 28.11s/it]score1 tensor([[0.6094],
-        [0.4316],
-        [0.4023],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4336, 0.4180, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:53:53,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 17:53:53,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.49 | bwd_microstep: 4550.95 | bwd_inner_microstep: 4546.23 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.87
-[2025-01-25 17:53:53,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.45 | bwd: 4550.98 | bwd_inner: 4546.23 | bwd_allreduce: 4.67 | step: 42.88
- 56%|█████▌    | 3228/5800 [9:07:23<15:31:18, 21.73s/it]                                                        {'loss': 0.0063, 'grad_norm': 5.748849868774414, 'learning_rate': 1.731659949455369e-05, 'epoch': 27.83}
- 56%|█████▌    | 3228/5800 [9:07:23<15:31:18, 21.73s/it]score1 tensor([[0.5391],
-        [0.4688],
-        [0.4297],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5000, 0.4531, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:00,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 17:54:00,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.91 | bwd_microstep: 4603.99 | bwd_inner_microstep: 4599.10 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.11
-[2025-01-25 17:54:00,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.87 | bwd: 4604.01 | bwd_inner: 4599.10 | bwd_allreduce: 4.83 | step: 42.12
- 56%|█████▌    | 3229/5800 [9:07:30<12:20:03, 17.27s/it]                                                        {'loss': 0.02, 'grad_norm': 7.945118427276611, 'learning_rate': 1.730553277023906e-05, 'epoch': 27.84}
- 56%|█████▌    | 3229/5800 [9:07:30<12:20:03, 17.27s/it]score1 tensor([[0.4570],
-        [0.6445],
-        [0.4766],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6406, 0.4805, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:07,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 17:54:07,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.04 | bwd_microstep: 4604.64 | bwd_inner_microstep: 4599.20 | bwd_allreduce_microstep: 5.32 | step_microstep: 48.46
-[2025-01-25 17:54:07,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.00 | bwd: 4604.68 | bwd_inner: 4599.20 | bwd_allreduce: 5.38 | step: 48.47
- 56%|█████▌    | 3230/5800 [9:07:37<10:06:18, 14.16s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.7129862904548645, 'learning_rate': 1.729446688610607e-05, 'epoch': 27.84}
- 56%|█████▌    | 3230/5800 [9:07:37<10:06:18, 14.16s/it]score1 tensor([[0.3730],
-        [0.3418],
-        [0.5508],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.3418, 0.5508, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:14,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 17:54:14,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.56 | bwd_microstep: 4520.72 | bwd_inner_microstep: 4515.67 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.85
-[2025-01-25 17:54:14,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.52 | bwd: 4520.75 | bwd_inner: 4515.67 | bwd_allreduce: 4.99 | step: 42.86
- 56%|█████▌    | 3231/5800 [9:07:44<8:31:33, 11.95s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.9888824820518494, 'learning_rate': 1.7283401845605254e-05, 'epoch': 27.85}
- 56%|█████▌    | 3231/5800 [9:07:44<8:31:33, 11.95s/it]score1 tensor([[0.4707],
-        [0.4590],
-        [0.6523],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4512, 0.6445, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:21,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 17:54:21,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.80 | bwd_microstep: 4616.86 | bwd_inner_microstep: 4612.01 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.94
-[2025-01-25 17:54:21,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.77 | bwd: 4616.88 | bwd_inner: 4612.01 | bwd_allreduce: 4.81 | step: 41.94
- 56%|█████▌    | 3232/5800 [9:07:51<7:26:19, 10.43s/it]                                                       {'loss': 0.0161, 'grad_norm': 8.491072654724121, 'learning_rate': 1.7272337652186867e-05, 'epoch': 27.86}
- 56%|█████▌    | 3232/5800 [9:07:51<7:26:19, 10.43s/it]score1 tensor([[0.4473],
-        [0.4785],
-        [0.5273],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4570, 0.5391, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:28,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 17:54:28,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.38 | bwd_microstep: 4612.89 | bwd_inner_microstep: 4607.83 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.04
-[2025-01-25 17:54:28,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.34 | bwd: 4612.92 | bwd_inner: 4607.83 | bwd_allreduce: 5.01 | step: 43.05
- 56%|█████▌    | 3233/5800 [9:07:58<6:40:42,  9.37s/it]                                                       {'loss': 0.0181, 'grad_norm': 0.7324692010879517, 'learning_rate': 1.726127430930094e-05, 'epoch': 27.87}
- 56%|█████▌    | 3233/5800 [9:07:58<6:40:42,  9.37s/it]score1 tensor([[0.5547],
-        [0.4746],
-        [0.4629],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4844, 0.4863, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:35,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 17:54:35,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.50 | bwd_microstep: 4618.60 | bwd_inner_microstep: 4613.11 | bwd_allreduce_microstep: 5.38 | step_microstep: 44.18
-[2025-01-25 17:54:35,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.46 | bwd: 4618.63 | bwd_inner: 4613.11 | bwd_allreduce: 5.44 | step: 44.18
- 56%|█████▌    | 3234/5800 [9:08:05<6:08:47,  8.62s/it]                                                       {'loss': 0.0166, 'grad_norm': 0.4974617660045624, 'learning_rate': 1.7250211820397196e-05, 'epoch': 27.88}
- 56%|█████▌    | 3234/5800 [9:08:05<6:08:47,  8.62s/it]score1 tensor([[0.5273],
-        [0.4121],
-        [0.3340],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4219, 0.3223, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:41,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 17:54:41,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.89 | bwd_microstep: 4557.84 | bwd_inner_microstep: 4552.77 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.42
-[2025-01-25 17:54:41,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.85 | bwd: 4557.87 | bwd_inner: 4552.77 | bwd_allreduce: 5.02 | step: 44.43
- 56%|█████▌    | 3235/5800 [9:08:11<5:45:44,  8.09s/it]                                                       {'loss': 0.0063, 'grad_norm': 1.9949419498443604, 'learning_rate': 1.7239150188925128e-05, 'epoch': 27.89}
- 56%|█████▌    | 3235/5800 [9:08:11<5:45:44,  8.09s/it]score1 tensor([[0.3711],
-        [0.4863],
-        [0.4844],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5039, 0.4844, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:48,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:54:48,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.83 | bwd_microstep: 4578.87 | bwd_inner_microstep: 4574.35 | bwd_allreduce_microstep: 4.45 | step_microstep: 43.75
-[2025-01-25 17:54:48,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.79 | bwd: 4578.90 | bwd_inner: 4574.35 | bwd_allreduce: 4.49 | step: 43.77
- 56%|█████▌    | 3236/5800 [9:08:18<5:29:48,  7.72s/it]                                                       {'loss': 0.0078, 'grad_norm': 1.6902403831481934, 'learning_rate': 1.7228089418333935e-05, 'epoch': 27.9}
- 56%|█████▌    | 3236/5800 [9:08:18<5:29:48,  7.72s/it]score1 tensor([[0.5625],
-        [0.4160],
-        [0.6094],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4160, 0.5938, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:54:55,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 17:54:55,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.68 | bwd_microstep: 4562.08 | bwd_inner_microstep: 4557.14 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.17
-[2025-01-25 17:54:55,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.65 | bwd: 4562.11 | bwd_inner: 4557.14 | bwd_allreduce: 4.90 | step: 43.18
- 56%|█████▌    | 3237/5800 [9:08:25<5:18:21,  7.45s/it]                                                       {'loss': 0.0068, 'grad_norm': 2.1728267669677734, 'learning_rate': 1.7217029512072553e-05, 'epoch': 27.91}
- 56%|█████▌    | 3237/5800 [9:08:25<5:18:21,  7.45s/it]score1 tensor([[0.5820],
-        [0.4316],
-        [0.4375],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4473, 0.4316, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:02,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 17:55:02,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.94 | bwd_microstep: 4616.16 | bwd_inner_microstep: 4610.93 | bwd_allreduce_microstep: 5.14 | step_microstep: 43.15
-[2025-01-25 17:55:02,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.91 | bwd: 4616.19 | bwd_inner: 4610.93 | bwd_allreduce: 5.19 | step: 43.15
- 56%|█████▌    | 3238/5800 [9:08:32<5:10:58,  7.28s/it]                                                       {'loss': 0.0093, 'grad_norm': 4.239046096801758, 'learning_rate': 1.7205970473589662e-05, 'epoch': 27.91}
- 56%|█████▌    | 3238/5800 [9:08:32<5:10:58,  7.28s/it]score1 tensor([[0.5039],
-        [0.3516],
-        [0.4004],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3789, 0.4023, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:09,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 17:55:09,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.49 | bwd_microstep: 4613.78 | bwd_inner_microstep: 4609.06 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.23
-[2025-01-25 17:55:09,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.44 | bwd: 4613.80 | bwd_inner: 4609.06 | bwd_allreduce: 4.68 | step: 42.23
- 56%|█████▌    | 3239/5800 [9:08:39<5:05:49,  7.17s/it]                                                       {'loss': 0.0225, 'grad_norm': 3.529430866241455, 'learning_rate': 1.719491230633365e-05, 'epoch': 27.92}
- 56%|█████▌    | 3239/5800 [9:08:39<5:05:49,  7.17s/it]score1 tensor([[0.3828],
-        [0.4434],
-        [0.4219],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4531, 0.4180, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:16,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:55:16,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.09 | bwd_microstep: 4621.28 | bwd_inner_microstep: 4616.08 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.51
-[2025-01-25 17:55:16,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.05 | bwd: 4621.31 | bwd_inner: 4616.08 | bwd_allreduce: 5.14 | step: 43.52
- 56%|█████▌    | 3240/5800 [9:08:46<5:02:17,  7.08s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.220132350921631, 'learning_rate': 1.7183855013752664e-05, 'epoch': 27.93}
- 56%|█████▌    | 3240/5800 [9:08:46<5:02:17,  7.08s/it]score1 tensor([[0.6641],
-        [0.4453],
-        [0.4961],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.4492, 0.5156, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:23,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 17:55:23,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.73 | bwd_microstep: 4612.93 | bwd_inner_microstep: 4607.38 | bwd_allreduce_microstep: 5.39 | step_microstep: 46.34
-[2025-01-25 17:55:23,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.68 | bwd: 4612.96 | bwd_inner: 4607.38 | bwd_allreduce: 5.48 | step: 46.34
- 56%|█████▌    | 3241/5800 [9:08:53<4:59:43,  7.03s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.274359703063965, 'learning_rate': 1.7172798599294537e-05, 'epoch': 27.94}
- 56%|█████▌    | 3241/5800 [9:08:53<4:59:43,  7.03s/it]score1 tensor([[0.5195],
-        [0.4375],
-        [0.5469],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4473, 0.5625, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:29,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 17:55:29,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.03 | bwd_microstep: 4569.97 | bwd_inner_microstep: 4565.33 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.55
-[2025-01-25 17:55:29,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.00 | bwd: 4569.99 | bwd_inner: 4565.33 | bwd_allreduce: 4.60 | step: 43.55
- 56%|█████▌    | 3242/5800 [9:08:59<4:57:19,  6.97s/it]                                                       {'loss': 0.0073, 'grad_norm': 6.097774505615234, 'learning_rate': 1.7161743066406874e-05, 'epoch': 27.95}
- 56%|█████▌    | 3242/5800 [9:08:59<4:57:19,  6.97s/it]score1 tensor([[0.4746],
-        [0.4238],
-        [0.5742],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4238, 0.5703, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:36,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:55:36,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.90 | bwd_microstep: 4570.46 | bwd_inner_microstep: 4565.97 | bwd_allreduce_microstep: 4.41 | step_microstep: 42.21
-[2025-01-25 17:55:36,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.87 | bwd: 4570.48 | bwd_inner: 4565.97 | bwd_allreduce: 4.45 | step: 42.22
- 56%|█████▌    | 3243/5800 [9:09:06<4:55:34,  6.94s/it]                                                       {'loss': 0.0112, 'grad_norm': 6.534008979797363, 'learning_rate': 1.7150688418536966e-05, 'epoch': 27.96}
- 56%|█████▌    | 3243/5800 [9:09:06<4:55:34,  6.94s/it]score1 tensor([[0.4238],
-        [0.4492],
-        [0.5391],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4512, 0.5430, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:43,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 17:55:43,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.32 | bwd_microstep: 4619.15 | bwd_inner_microstep: 4614.06 | bwd_allreduce_microstep: 4.98 | step_microstep: 48.21
-[2025-01-25 17:55:43,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.28 | bwd: 4619.17 | bwd_inner: 4614.06 | bwd_allreduce: 5.03 | step: 48.22
- 56%|█████▌    | 3244/5800 [9:09:13<4:55:02,  6.93s/it]                                                       {'loss': 0.0063, 'grad_norm': 4.005799293518066, 'learning_rate': 1.7139634659131848e-05, 'epoch': 27.97}
- 56%|█████▌    | 3244/5800 [9:09:13<4:55:02,  6.93s/it]score1 tensor([[0.4316],
-        [0.5898],
-        [0.6602],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3906, 0.5664, 0.6367, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:50,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 17:55:50,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.48 | bwd_microstep: 4621.72 | bwd_inner_microstep: 4616.98 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.75
-[2025-01-25 17:55:50,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.45 | bwd: 4621.74 | bwd_inner: 4616.98 | bwd_allreduce: 4.69 | step: 43.76
- 56%|█████▌    | 3245/5800 [9:09:20<4:54:44,  6.92s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.737756729125977, 'learning_rate': 1.712858179163828e-05, 'epoch': 27.97}
- 56%|█████▌    | 3245/5800 [9:09:20<4:54:44,  6.92s/it]score1 tensor([[0.5352],
-        [0.5156],
-        [0.5039],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4805, 0.4922, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:55:57,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.37
-[2025-01-25 17:55:57,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.07 | bwd_microstep: 4615.25 | bwd_inner_microstep: 4610.35 | bwd_allreduce_microstep: 4.80 | step_microstep: 47.72
-[2025-01-25 17:55:57,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.03 | bwd: 4615.28 | bwd_inner: 4610.35 | bwd_allreduce: 4.86 | step: 47.73
- 56%|█████▌    | 3246/5800 [9:09:27<4:54:17,  6.91s/it]                                                       {'loss': 0.0142, 'grad_norm': 7.9864020347595215, 'learning_rate': 1.7117529819502726e-05, 'epoch': 27.98}
- 56%|█████▌    | 3246/5800 [9:09:27<4:54:17,  6.91s/it]score1 tensor([[0.6094],
-        [0.5117],
-        [0.5547],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5039, 0.5391, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:56:04,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:56:04,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.10 | bwd_microstep: 4612.21 | bwd_inner_microstep: 4607.53 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.44
-[2025-01-25 17:56:04,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.06 | bwd: 4612.23 | bwd_inner: 4607.53 | bwd_allreduce: 4.64 | step: 42.45
- 56%|█████▌    | 3247/5800 [9:09:34<4:53:54,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 3.868910074234009, 'learning_rate': 1.7106478746171394e-05, 'epoch': 27.99}
- 56%|█████▌    | 3247/5800 [9:09:34<4:53:54,  6.91s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:56:08,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 17:56:08,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 571.09 | bwd_microstep: 1220.37 | bwd_inner_microstep: 1215.98 | bwd_allreduce_microstep: 4.30 | step_microstep: 42.86
-[2025-01-25 17:56:08,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.05 | bwd: 1220.39 | bwd_inner: 1215.98 | bwd_allreduce: 4.34 | step: 42.87
- 56%|█████▌    | 3248/5800 [9:09:38<4:19:22,  6.10s/it]                                                       {'loss': 0.0078, 'grad_norm': 8.393621444702148, 'learning_rate': 1.70954285750902e-05, 'epoch': 28.0}
- 56%|█████▌    | 3248/5800 [9:09:38<4:19:22,  6.10s/it][2025-01-25 17:56:12,803] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:56:23,484] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:56:33,535] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 17:56:43,804] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.7656],
-        [0.6406],
-        [0.5312],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.6367, 0.5273, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:00,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 17:57:00,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.71 | bwd_microstep: 4538.05 | bwd_inner_microstep: 4532.75 | bwd_allreduce_microstep: 5.22 | step_microstep: 45.72
-[2025-01-25 17:57:00,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.66 | bwd: 4538.07 | bwd_inner: 4532.75 | bwd_allreduce: 5.26 | step: 45.73
- 56%|█████▌    | 3249/5800 [9:10:30<14:06:05, 19.90s/it]                                                        {'loss': 0.0176, 'grad_norm': 7.157200336456299, 'learning_rate': 1.7084379309704767e-05, 'epoch': 28.01}
- 56%|█████▌    | 3249/5800 [9:10:30<14:06:05, 19.90s/it]score1 tensor([[0.3848],
-        [0.4590],
-        [0.5039],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.4648, 0.5156, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:07,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 17:57:07,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2128.12 | bwd_microstep: 4580.94 | bwd_inner_microstep: 4576.16 | bwd_allreduce_microstep: 4.69 | step_microstep: 45.53
-[2025-01-25 17:57:07,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2128.08 | bwd: 4580.96 | bwd_inner: 4576.16 | bwd_allreduce: 4.74 | step: 45.54
- 56%|█████▌    | 3250/5800 [9:10:37<11:19:07, 15.98s/it]                                                        {'loss': 0.0083, 'grad_norm': 8.04876708984375, 'learning_rate': 1.7073330953460462e-05, 'epoch': 28.02}
- 56%|█████▌    | 3250/5800 [9:10:37<11:19:07, 15.98s/it]score1 tensor([[0.6211],
-        [0.4492],
-        [0.5820],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4609, 0.5547, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:14,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 17:57:14,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.91 | bwd_microstep: 4587.35 | bwd_inner_microstep: 4582.38 | bwd_allreduce_microstep: 4.87 | step_microstep: 42.90
-[2025-01-25 17:57:14,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.87 | bwd: 4587.38 | bwd_inner: 4582.38 | bwd_allreduce: 4.92 | step: 42.91
- 56%|█████▌    | 3251/5800 [9:10:44<9:22:18, 13.24s/it]                                                        {'loss': 0.0122, 'grad_norm': 4.680803298950195, 'learning_rate': 1.706228350980234e-05, 'epoch': 28.03}
- 56%|█████▌    | 3251/5800 [9:10:44<9:22:18, 13.24s/it]score1 tensor([[0.4941],
-        [0.4941],
-        [0.4570],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4824, 0.4609, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:21,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 17:57:21,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.84 | bwd_microstep: 4594.81 | bwd_inner_microstep: 4590.05 | bwd_allreduce_microstep: 4.67 | step_microstep: 48.42
-[2025-01-25 17:57:21,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.81 | bwd: 4594.84 | bwd_inner: 4590.05 | bwd_allreduce: 4.71 | step: 48.43
- 56%|█████▌    | 3252/5800 [9:10:51<8:00:54, 11.32s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.815790891647339, 'learning_rate': 1.70512369821752e-05, 'epoch': 28.03}
- 56%|█████▌    | 3252/5800 [9:10:51<8:00:54, 11.32s/it]score1 tensor([[0.4102],
-        [0.4629],
-        [0.6055],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4785, 0.5938, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:28,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.21 | optimizer_step: 4.40
-[2025-01-25 17:57:28,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.20 | bwd_microstep: 4603.95 | bwd_inner_microstep: 4597.10 | bwd_allreduce_microstep: 6.67 | step_microstep: 61.23
-[2025-01-25 17:57:28,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.16 | bwd: 4603.98 | bwd_inner: 4597.10 | bwd_allreduce: 6.78 | step: 61.24
- 56%|█████▌    | 3253/5800 [9:10:58<7:04:21, 10.00s/it]                                                       {'loss': 0.0083, 'grad_norm': 0.5415021181106567, 'learning_rate': 1.704019137402353e-05, 'epoch': 28.04}
- 56%|█████▌    | 3253/5800 [9:10:58<7:04:21, 10.00s/it]score1 tensor([[0.5977],
-        [0.5859],
-        [0.4824],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6055, 0.4941, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:35,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 17:57:35,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.03 | bwd_microstep: 4591.60 | bwd_inner_microstep: 4586.24 | bwd_allreduce_microstep: 5.25 | step_microstep: 48.61
-[2025-01-25 17:57:35,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.99 | bwd: 4591.62 | bwd_inner: 4586.24 | bwd_allreduce: 5.31 | step: 48.62
- 56%|█████▌    | 3254/5800 [9:11:05<6:24:23,  9.06s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.497871398925781, 'learning_rate': 1.702914668879155e-05, 'epoch': 28.05}
- 56%|█████▌    | 3254/5800 [9:11:05<6:24:23,  9.06s/it]score1 tensor([[0.5039],
-        [0.5195],
-        [0.4766],
-        [0.3047]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5508, 0.4688, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:41,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 17:57:41,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.39 | bwd_microstep: 4594.24 | bwd_inner_microstep: 4589.31 | bwd_allreduce_microstep: 4.85 | step_microstep: 50.15
-[2025-01-25 17:57:41,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.35 | bwd: 4594.26 | bwd_inner: 4589.31 | bwd_allreduce: 4.89 | step: 50.17
- 56%|█████▌    | 3255/5800 [9:11:11<5:56:18,  8.40s/it]                                                       {'loss': 0.02, 'grad_norm': 3.7532331943511963, 'learning_rate': 1.7018102929923186e-05, 'epoch': 28.06}
- 56%|█████▌    | 3255/5800 [9:11:11<5:56:18,  8.40s/it]score1 tensor([[0.4707],
-        [0.5352],
-        [0.5469],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5586, 0.5469, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:48,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 17:57:48,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.10 | bwd_microstep: 4563.37 | bwd_inner_microstep: 4558.20 | bwd_allreduce_microstep: 5.10 | step_microstep: 49.49
-[2025-01-25 17:57:48,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.06 | bwd: 4563.40 | bwd_inner: 4558.20 | bwd_allreduce: 5.13 | step: 49.49
- 56%|█████▌    | 3256/5800 [9:11:18<5:36:13,  7.93s/it]                                                       {'loss': 0.0098, 'grad_norm': 2.168210029602051, 'learning_rate': 1.700706010086206e-05, 'epoch': 28.07}
- 56%|█████▌    | 3256/5800 [9:11:18<5:36:13,  7.93s/it]score1 tensor([[0.4648],
-        [0.4961],
-        [0.5977],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5312, 0.6094, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:57:55,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 17:57:55,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.99 | bwd_microstep: 4609.89 | bwd_inner_microstep: 4604.92 | bwd_allreduce_microstep: 4.86 | step_microstep: 46.83
-[2025-01-25 17:57:55,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.95 | bwd: 4609.91 | bwd_inner: 4604.92 | bwd_allreduce: 4.91 | step: 46.84
- 56%|█████▌    | 3257/5800 [9:11:25<5:22:44,  7.61s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.090327739715576, 'learning_rate': 1.6996018205051532e-05, 'epoch': 28.08}
- 56%|█████▌    | 3257/5800 [9:11:25<5:22:44,  7.61s/it]score1 tensor([[0.4863],
-        [0.6523],
-        [0.5430],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6719, 0.5664, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:02,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 17:58:02,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.28 | bwd_microstep: 4615.16 | bwd_inner_microstep: 4605.84 | bwd_allreduce_microstep: 9.21 | step_microstep: 47.84
-[2025-01-25 17:58:02,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.25 | bwd: 4615.18 | bwd_inner: 4605.84 | bwd_allreduce: 9.26 | step: 47.85
- 56%|█████▌    | 3258/5800 [9:11:32<5:13:20,  7.40s/it]                                                       {'loss': 0.0146, 'grad_norm': 8.280078887939453, 'learning_rate': 1.6984977245934645e-05, 'epoch': 28.09}
- 56%|█████▌    | 3258/5800 [9:11:32<5:13:20,  7.40s/it]score1 tensor([[0.5820],
-        [0.5195],
-        [0.5078],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5078, 0.4746, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:09,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 17:58:09,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.34 | bwd_microstep: 4564.94 | bwd_inner_microstep: 4559.80 | bwd_allreduce_microstep: 5.06 | step_microstep: 46.56
-[2025-01-25 17:58:09,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.30 | bwd: 4564.96 | bwd_inner: 4559.80 | bwd_allreduce: 5.10 | step: 46.57
- 56%|█████▌    | 3259/5800 [9:11:39<5:06:07,  7.23s/it]                                                       {'loss': 0.021, 'grad_norm': 1.874875783920288, 'learning_rate': 1.6973937226954172e-05, 'epoch': 28.09}
- 56%|█████▌    | 3259/5800 [9:11:39<5:06:07,  7.23s/it]score1 tensor([[0.5859],
-        [0.5508],
-        [0.4531],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5391, 0.4336, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:16,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 17:58:16,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.10 | bwd_microstep: 4626.34 | bwd_inner_microstep: 4621.24 | bwd_allreduce_microstep: 5.02 | step_microstep: 44.88
-[2025-01-25 17:58:16,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.07 | bwd: 4626.36 | bwd_inner: 4621.24 | bwd_allreduce: 5.06 | step: 44.88
- 56%|█████▌    | 3260/5800 [9:11:46<5:01:58,  7.13s/it]                                                       {'loss': 0.0122, 'grad_norm': 8.134757995605469, 'learning_rate': 1.696289815155258e-05, 'epoch': 28.1}
- 56%|█████▌    | 3260/5800 [9:11:46<5:01:58,  7.13s/it]score1 tensor([[0.5312],
-        [0.5195],
-        [0.4961],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5508, 0.4570, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0264, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:23,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 17:58:23,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.67 | bwd_microstep: 4579.32 | bwd_inner_microstep: 4574.22 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.26
-[2025-01-25 17:58:23,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.64 | bwd: 4579.34 | bwd_inner: 4574.22 | bwd_allreduce: 5.05 | step: 45.27
- 56%|█████▌    | 3261/5800 [9:11:53<4:58:25,  7.05s/it]                                                       {'loss': 0.0264, 'grad_norm': 2.2047007083892822, 'learning_rate': 1.6951860023172036e-05, 'epoch': 28.11}
- 56%|█████▌    | 3261/5800 [9:11:53<4:58:25,  7.05s/it]score1 tensor([[0.5391],
-        [0.4141],
-        [0.5234],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3789, 0.5039, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:30,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 17:58:30,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4640.52 | bwd_inner_microstep: 4635.12 | bwd_allreduce_microstep: 5.31 | step_microstep: 47.69
-[2025-01-25 17:58:30,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4640.57 | bwd_inner: 4635.12 | bwd_allreduce: 5.35 | step: 47.70
- 56%|█████▌    | 3262/5800 [9:12:00<4:56:40,  7.01s/it]                                                       {'loss': 0.0234, 'grad_norm': 8.205048561096191, 'learning_rate': 1.6940822845254438e-05, 'epoch': 28.12}
- 56%|█████▌    | 3262/5800 [9:12:00<4:56:40,  7.01s/it]score1 tensor([[0.5938],
-        [0.5391],
-        [0.4258],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5234, 0.4062, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:36,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 17:58:36,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.82 | bwd_microstep: 4630.42 | bwd_inner_microstep: 4625.28 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.42
-[2025-01-25 17:58:36,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.78 | bwd: 4630.44 | bwd_inner: 4625.28 | bwd_allreduce: 5.09 | step: 45.43
- 56%|█████▋    | 3263/5800 [9:12:06<4:55:23,  6.99s/it]                                                       {'loss': 0.0132, 'grad_norm': 8.045234680175781, 'learning_rate': 1.6929786621241347e-05, 'epoch': 28.13}
- 56%|█████▋    | 3263/5800 [9:12:06<4:55:23,  6.99s/it]score1 tensor([[0.5273],
-        [0.4668],
-        [0.5430],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4473, 0.5508, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:43,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 17:58:43,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.62 | bwd_microstep: 4634.59 | bwd_inner_microstep: 4629.56 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.33
-[2025-01-25 17:58:43,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.57 | bwd: 4634.62 | bwd_inner: 4629.56 | bwd_allreduce: 4.99 | step: 44.34
- 56%|█████▋    | 3264/5800 [9:12:13<4:54:22,  6.96s/it]                                                       {'loss': 0.0181, 'grad_norm': 0.49369776248931885, 'learning_rate': 1.6918751354574067e-05, 'epoch': 28.14}
- 56%|█████▋    | 3264/5800 [9:12:13<4:54:22,  6.96s/it]score1 tensor([[0.4766],
-        [0.3887],
-        [0.5039],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3652, 0.5039, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:50,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 17:58:50,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.24 | bwd_microstep: 4583.75 | bwd_inner_microstep: 4578.90 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.85
-[2025-01-25 17:58:50,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.20 | bwd: 4583.78 | bwd_inner: 4578.91 | bwd_allreduce: 4.79 | step: 43.86
- 56%|█████▋    | 3265/5800 [9:12:20<4:53:01,  6.94s/it]                                                       {'loss': 0.0117, 'grad_norm': 1.7638283967971802, 'learning_rate': 1.6907717048693573e-05, 'epoch': 28.15}
- 56%|█████▋    | 3265/5800 [9:12:20<4:53:01,  6.94s/it]score1 tensor([[0.5430],
-        [0.5586],
-        [0.4883],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6055, 0.5156, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:58:57,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 17:58:57,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.93 | bwd_microstep: 4588.99 | bwd_inner_microstep: 4583.88 | bwd_allreduce_microstep: 5.02 | step_microstep: 49.05
-[2025-01-25 17:58:57,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.90 | bwd: 4589.02 | bwd_inner: 4583.88 | bwd_allreduce: 5.06 | step: 49.05
- 56%|█████▋    | 3266/5800 [9:12:27<4:52:06,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 6.49683141708374, 'learning_rate': 1.6896683707040567e-05, 'epoch': 28.16}
- 56%|█████▋    | 3266/5800 [9:12:27<4:52:06,  6.92s/it]score1 tensor([[0.5156],
-        [0.4492],
-        [0.5000],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4414, 0.5117, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:04,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 17:59:04,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.99 | bwd_microstep: 4642.85 | bwd_inner_microstep: 4637.75 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.15
-[2025-01-25 17:59:04,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.96 | bwd: 4642.87 | bwd_inner: 4637.75 | bwd_allreduce: 5.06 | step: 46.16
- 56%|█████▋    | 3267/5800 [9:12:34<4:52:05,  6.92s/it]                                                       {'loss': 0.0083, 'grad_norm': 3.865356922149658, 'learning_rate': 1.6885651333055428e-05, 'epoch': 28.16}
- 56%|█████▋    | 3267/5800 [9:12:34<4:52:05,  6.92s/it]score1 tensor([[0.5742],
-        [0.4883],
-        [0.5039],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5156, 0.5117, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:11,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 17:59:11,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.45 | bwd_microstep: 4641.28 | bwd_inner_microstep: 4636.19 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.93
-[2025-01-25 17:59:11,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.42 | bwd: 4641.30 | bwd_inner: 4636.19 | bwd_allreduce: 5.04 | step: 44.94
- 56%|█████▋    | 3268/5800 [9:12:41<4:52:00,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.134748458862305, 'learning_rate': 1.6874619930178235e-05, 'epoch': 28.17}
- 56%|█████▋    | 3268/5800 [9:12:41<4:52:00,  6.92s/it]score1 tensor([[0.5195],
-        [0.6367],
-        [0.5859],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6641, 0.6094, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:18,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:59:18,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.67 | bwd_microstep: 4648.93 | bwd_inner_microstep: 4643.85 | bwd_allreduce_microstep: 4.98 | step_microstep: 47.16
-[2025-01-25 17:59:18,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.63 | bwd: 4648.95 | bwd_inner: 4643.85 | bwd_allreduce: 5.03 | step: 47.17
- 56%|█████▋    | 3269/5800 [9:12:48<4:52:02,  6.92s/it]                                                       {'loss': 0.0361, 'grad_norm': 4.586621284484863, 'learning_rate': 1.6863589501848785e-05, 'epoch': 28.18}
- 56%|█████▋    | 3269/5800 [9:12:48<4:52:02,  6.92s/it]score1 tensor([[0.3887],
-        [0.5508],
-        [0.5352],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5820, 0.5469, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:25,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 17:59:25,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.30 | bwd_microstep: 4643.68 | bwd_inner_microstep: 4634.39 | bwd_allreduce_microstep: 9.19 | step_microstep: 45.99
-[2025-01-25 17:59:25,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.26 | bwd: 4643.70 | bwd_inner: 4634.39 | bwd_allreduce: 9.24 | step: 46.00
- 56%|█████▋    | 3270/5800 [9:12:55<4:52:09,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 7.811926364898682, 'learning_rate': 1.685256005150654e-05, 'epoch': 28.19}
- 56%|█████▋    | 3270/5800 [9:12:55<4:52:09,  6.93s/it]score1 tensor([[0.4727],
-        [0.3438],
-        [0.5352],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.3516, 0.5312, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:32,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 17:59:32,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.61 | bwd_microstep: 4636.51 | bwd_inner_microstep: 4631.52 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.21
-[2025-01-25 17:59:32,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.57 | bwd: 4636.53 | bwd_inner: 4631.52 | bwd_allreduce: 4.94 | step: 44.22
- 56%|█████▋    | 3271/5800 [9:13:02<4:51:50,  6.92s/it]                                                       {'loss': 0.0088, 'grad_norm': 3.6033265590667725, 'learning_rate': 1.6841531582590697e-05, 'epoch': 28.2}
- 56%|█████▋    | 3271/5800 [9:13:02<4:51:50,  6.92s/it]score1 tensor([[0.3730],
-        [0.4844],
-        [0.4199],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4844, 0.4141, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:39,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 17:59:39,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.95 | bwd_microstep: 4593.44 | bwd_inner_microstep: 4588.65 | bwd_allreduce_microstep: 4.69 | step_microstep: 43.49
-[2025-01-25 17:59:39,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.91 | bwd: 4593.46 | bwd_inner: 4588.66 | bwd_allreduce: 4.74 | step: 43.50
- 56%|█████▋    | 3272/5800 [9:13:09<4:51:04,  6.91s/it]                                                       {'loss': 0.0044, 'grad_norm': 1.7356414794921875, 'learning_rate': 1.6830504098540098e-05, 'epoch': 28.21}
- 56%|█████▋    | 3272/5800 [9:13:09<4:51:04,  6.91s/it]score1 tensor([[0.5586],
-        [0.6328],
-        [0.5117],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.6406, 0.5078, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:46,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 17:59:46,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.43 | bwd_microstep: 4644.34 | bwd_inner_microstep: 4638.82 | bwd_allreduce_microstep: 5.40 | step_microstep: 46.55
-[2025-01-25 17:59:46,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.40 | bwd: 4644.40 | bwd_inner: 4638.82 | bwd_allreduce: 5.47 | step: 46.55
- 56%|█████▋    | 3273/5800 [9:13:16<4:51:10,  6.91s/it]                                                       {'loss': 0.0117, 'grad_norm': 4.198330879211426, 'learning_rate': 1.6819477602793325e-05, 'epoch': 28.22}
- 56%|█████▋    | 3273/5800 [9:13:16<4:51:10,  6.91s/it]score1 tensor([[0.4629],
-        [0.4570],
-        [0.5273],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4297, 0.4941, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 17:59:53,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 17:59:53,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.99 | bwd_microstep: 4636.66 | bwd_inner_microstep: 4631.78 | bwd_allreduce_microstep: 4.80 | step_microstep: 522.20
-[2025-01-25 17:59:53,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.95 | bwd: 4636.69 | bwd_inner: 4631.78 | bwd_allreduce: 4.84 | step: 522.20
- 56%|█████▋    | 3274/5800 [9:13:23<4:57:07,  7.06s/it]                                                       {'loss': 0.0215, 'grad_norm': 4.137742042541504, 'learning_rate': 1.6808452098788625e-05, 'epoch': 28.22}
- 56%|█████▋    | 3274/5800 [9:13:23<4:57:07,  7.06s/it]score1 tensor([[0.4688],
-        [0.4863],
-        [0.4590],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4648, 0.4707, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:00,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 18:00:00,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.36 | bwd_microstep: 4633.69 | bwd_inner_microstep: 4628.63 | bwd_allreduce_microstep: 4.97 | step_microstep: 44.11
-[2025-01-25 18:00:00,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.31 | bwd: 4633.72 | bwd_inner: 4628.63 | bwd_allreduce: 5.02 | step: 44.11
- 56%|█████▋    | 3275/5800 [9:13:30<4:55:07,  7.01s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.983673334121704, 'learning_rate': 1.6797427589963926e-05, 'epoch': 28.23}
- 56%|█████▋    | 3275/5800 [9:13:30<4:55:07,  7.01s/it]score1 tensor([[0.4883],
-        [0.6250],
-        [0.4258],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.6172, 0.4219, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:07,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 18:00:07,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2171.30 | bwd_microstep: 4640.27 | bwd_inner_microstep: 4635.24 | bwd_allreduce_microstep: 4.94 | step_microstep: 41.05
-[2025-01-25 18:00:07,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2171.26 | bwd: 4640.30 | bwd_inner: 4635.24 | bwd_allreduce: 4.98 | step: 41.06
- 56%|█████▋    | 3276/5800 [9:13:37<4:53:48,  6.98s/it]                                                       {'loss': 0.0137, 'grad_norm': 8.093613624572754, 'learning_rate': 1.6786404079756883e-05, 'epoch': 28.24}
- 56%|█████▋    | 3276/5800 [9:13:37<4:53:48,  6.98s/it]score1 tensor([[0.4883],
-        [0.6953],
-        [0.5859],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6641, 0.5664, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:14,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 18:00:14,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4631.76 | bwd_inner_microstep: 4626.88 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.20
-[2025-01-25 18:00:14,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.46 | bwd: 4631.79 | bwd_inner: 4626.88 | bwd_allreduce: 4.84 | step: 43.21
- 56%|█████▋    | 3277/5800 [9:13:44<4:52:41,  6.96s/it]                                                       {'loss': 0.0259, 'grad_norm': 4.602729797363281, 'learning_rate': 1.6775381571604806e-05, 'epoch': 28.25}
- 56%|█████▋    | 3277/5800 [9:13:44<4:52:41,  6.96s/it]score1 tensor([[0.4785],
-        [0.5391],
-        [0.6641],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5195, 0.6484, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:21,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:00:21,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.91 | bwd_microstep: 4647.42 | bwd_inner_microstep: 4642.44 | bwd_allreduce_microstep: 4.87 | step_microstep: 49.44
-[2025-01-25 18:00:21,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.86 | bwd: 4647.44 | bwd_inner: 4642.44 | bwd_allreduce: 4.92 | step: 49.45
- 57%|█████▋    | 3278/5800 [9:13:51<4:52:11,  6.95s/it]                                                       {'loss': 0.019, 'grad_norm': 8.813344955444336, 'learning_rate': 1.6764360068944706e-05, 'epoch': 28.26}
- 57%|█████▋    | 3278/5800 [9:13:51<4:52:11,  6.95s/it]score1 tensor([[0.5117],
-        [0.5430],
-        [0.3789],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5391, 0.3730, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:27,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 18:00:27,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.21 | bwd_microstep: 4540.88 | bwd_inner_microstep: 4536.03 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.86
-[2025-01-25 18:00:27,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.17 | bwd: 4540.90 | bwd_inner: 4536.03 | bwd_allreduce: 4.80 | step: 43.87
- 57%|█████▋    | 3279/5800 [9:13:57<4:50:27,  6.91s/it]                                                       {'loss': 0.0024, 'grad_norm': 3.8903143405914307, 'learning_rate': 1.675333957521329e-05, 'epoch': 28.27}
- 57%|█████▋    | 3279/5800 [9:13:57<4:50:27,  6.91s/it]score1 tensor([[0.6523],
-        [0.4902],
-        [0.4980],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5117, 0.4863, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:34,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 18:00:34,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.49 | bwd_microstep: 4633.59 | bwd_inner_microstep: 4628.34 | bwd_allreduce_microstep: 5.13 | step_microstep: 46.50
-[2025-01-25 18:00:34,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.46 | bwd: 4633.61 | bwd_inner: 4628.34 | bwd_allreduce: 5.19 | step: 46.51
- 57%|█████▋    | 3280/5800 [9:14:04<4:50:22,  6.91s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.482483386993408, 'learning_rate': 1.6742320093846912e-05, 'epoch': 28.28}
- 57%|█████▋    | 3280/5800 [9:14:04<4:50:22,  6.91s/it]score1 tensor([[0.6133],
-        [0.3730],
-        [0.5859],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.3672, 0.6055, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:41,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 18:00:41,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.68 | bwd_microstep: 4632.89 | bwd_inner_microstep: 4627.82 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.83
-[2025-01-25 18:00:41,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.64 | bwd: 4632.91 | bwd_inner: 4627.82 | bwd_allreduce: 5.00 | step: 43.82
- 57%|█████▋    | 3281/5800 [9:14:11<4:50:20,  6.92s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.6815003156661987, 'learning_rate': 1.6731301628281667e-05, 'epoch': 28.28}
- 57%|█████▋    | 3281/5800 [9:14:11<4:50:20,  6.92s/it]score1 tensor([[0.3281],
-        [0.5469],
-        [0.4766],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.5508, 0.4785, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:48,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 18:00:48,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.30 | bwd_microstep: 4632.46 | bwd_inner_microstep: 4627.38 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.02
-[2025-01-25 18:00:48,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.25 | bwd: 4632.50 | bwd_inner: 4627.38 | bwd_allreduce: 5.02 | step: 44.03
- 57%|█████▋    | 3282/5800 [9:14:18<4:50:23,  6.92s/it]                                                       {'loss': 0.0098, 'grad_norm': 4.8664164543151855, 'learning_rate': 1.6720284181953285e-05, 'epoch': 28.29}
- 57%|█████▋    | 3282/5800 [9:14:18<4:50:23,  6.92s/it]score1 tensor([[0.4805],
-        [0.3809],
-        [0.4082],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3984, 0.4160, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:00:55,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:00:55,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.19 | bwd_microstep: 4629.64 | bwd_inner_microstep: 4623.94 | bwd_allreduce_microstep: 5.61 | step_microstep: 50.66
-[2025-01-25 18:00:55,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.15 | bwd: 4629.67 | bwd_inner: 4623.94 | bwd_allreduce: 5.65 | step: 50.66
- 57%|█████▋    | 3283/5800 [9:14:25<4:50:13,  6.92s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.4424922466278076, 'learning_rate': 1.670926775829721e-05, 'epoch': 28.3}
- 57%|█████▋    | 3283/5800 [9:14:25<4:50:13,  6.92s/it]score1 tensor([[0.4102],
-        [0.4863],
-        [0.3633],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5195, 0.3613, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:02,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:01:02,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.63 | bwd_microstep: 4636.51 | bwd_inner_microstep: 4631.25 | bwd_allreduce_microstep: 5.15 | step_microstep: 43.66
-[2025-01-25 18:01:02,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.59 | bwd: 4636.54 | bwd_inner: 4631.25 | bwd_allreduce: 5.20 | step: 43.66
- 57%|█████▋    | 3284/5800 [9:14:32<4:50:10,  6.92s/it]                                                       {'loss': 0.0195, 'grad_norm': 4.33893346786499, 'learning_rate': 1.6698252360748535e-05, 'epoch': 28.31}
- 57%|█████▋    | 3284/5800 [9:14:32<4:50:10,  6.92s/it]score1 tensor([[0.5820],
-        [0.4473],
-        [0.5391],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4629, 0.5820, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:09,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 18:01:09,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.43 | bwd_microstep: 4630.50 | bwd_inner_microstep: 4625.31 | bwd_allreduce_microstep: 5.07 | step_microstep: 45.05
-[2025-01-25 18:01:09,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.41 | bwd: 4630.53 | bwd_inner: 4625.31 | bwd_allreduce: 5.12 | step: 45.05
- 57%|█████▋    | 3285/5800 [9:14:39<4:50:08,  6.92s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.61443042755127, 'learning_rate': 1.6687237992742075e-05, 'epoch': 28.32}
- 57%|█████▋    | 3285/5800 [9:14:39<4:50:08,  6.92s/it]score1 tensor([[0.5586],
-        [0.6055],
-        [0.5352],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6055, 0.5430, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:16,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:01:16,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.88 | bwd_microstep: 4580.70 | bwd_inner_microstep: 4574.76 | bwd_allreduce_microstep: 5.83 | step_microstep: 49.40
-[2025-01-25 18:01:16,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.83 | bwd: 4580.72 | bwd_inner: 4574.76 | bwd_allreduce: 5.89 | step: 49.41
- 57%|█████▋    | 3286/5800 [9:14:46<4:49:24,  6.91s/it]                                                       {'loss': 0.0107, 'grad_norm': 6.569902420043945, 'learning_rate': 1.6676224657712288e-05, 'epoch': 28.33}
- 57%|█████▋    | 3286/5800 [9:14:46<4:49:24,  6.91s/it]score1 tensor([[0.4316],
-        [0.4902],
-        [0.4395],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4980, 0.4531, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:23,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 18:01:23,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.06 | bwd_microstep: 4631.03 | bwd_inner_microstep: 4626.54 | bwd_allreduce_microstep: 4.42 | step_microstep: 40.68
-[2025-01-25 18:01:23,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.02 | bwd: 4631.05 | bwd_inner: 4626.54 | bwd_allreduce: 4.45 | step: 40.70
- 57%|█████▋    | 3287/5800 [9:14:53<4:49:17,  6.91s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.8223555088043213, 'learning_rate': 1.666521235909332e-05, 'epoch': 28.34}
- 57%|█████▋    | 3287/5800 [9:14:53<4:49:17,  6.91s/it]score1 tensor([[0.5508],
-        [0.5586],
-        [0.6289],
-        [0.3594]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5625, 0.6133, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:30,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 18:01:30,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.81 | bwd_microstep: 4637.78 | bwd_inner_microstep: 4632.60 | bwd_allreduce_microstep: 5.09 | step_microstep: 49.79
-[2025-01-25 18:01:30,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.78 | bwd: 4637.80 | bwd_inner: 4632.60 | bwd_allreduce: 5.13 | step: 49.80
- 57%|█████▋    | 3288/5800 [9:15:00<4:49:18,  6.91s/it]                                                       {'loss': 0.0073, 'grad_norm': 0.9486657977104187, 'learning_rate': 1.665420110031901e-05, 'epoch': 28.34}
- 57%|█████▋    | 3288/5800 [9:15:00<4:49:18,  6.91s/it]score1 tensor([[0.3711],
-        [0.3105],
-        [0.4785],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.3086, 0.4844, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:37,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 18:01:37,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.52 | bwd_microstep: 4641.27 | bwd_inner_microstep: 4635.82 | bwd_allreduce_microstep: 5.33 | step_microstep: 48.24
-[2025-01-25 18:01:37,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.48 | bwd: 4641.30 | bwd_inner: 4635.82 | bwd_allreduce: 5.39 | step: 48.24
- 57%|█████▋    | 3289/5800 [9:15:07<4:49:22,  6.91s/it]                                                       {'loss': 0.0073, 'grad_norm': 3.593817949295044, 'learning_rate': 1.6643190884822844e-05, 'epoch': 28.35}
- 57%|█████▋    | 3289/5800 [9:15:07<4:49:22,  6.91s/it]score1 tensor([[0.4355],
-        [0.5742],
-        [0.5273],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.6172, 0.5039, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:44,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 18:01:44,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.16 | bwd_microstep: 4637.92 | bwd_inner_microstep: 4632.73 | bwd_allreduce_microstep: 5.08 | step_microstep: 45.37
-[2025-01-25 18:01:44,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.14 | bwd: 4637.94 | bwd_inner: 4632.73 | bwd_allreduce: 5.13 | step: 45.38
- 57%|█████▋    | 3290/5800 [9:15:13<4:49:15,  6.91s/it]                                                       {'loss': 0.02, 'grad_norm': 0.4132630228996277, 'learning_rate': 1.6632181716038012e-05, 'epoch': 28.36}
- 57%|█████▋    | 3290/5800 [9:15:13<4:49:15,  6.91s/it]score1 tensor([[0.6406],
-        [0.4727],
-        [0.7188],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4805, 0.6875, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:50,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 18:01:50,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.85 | bwd_microstep: 4636.48 | bwd_inner_microstep: 4631.27 | bwd_allreduce_microstep: 5.13 | step_microstep: 46.94
-[2025-01-25 18:01:50,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.81 | bwd: 4636.50 | bwd_inner: 4631.27 | bwd_allreduce: 5.17 | step: 46.94
- 57%|█████▋    | 3291/5800 [9:15:20<4:49:11,  6.92s/it]                                                       {'loss': 0.0225, 'grad_norm': 5.190245151519775, 'learning_rate': 1.6621173597397354e-05, 'epoch': 28.37}
- 57%|█████▋    | 3291/5800 [9:15:20<4:49:11,  6.92s/it]score1 tensor([[0.5820],
-        [0.4648],
-        [0.5977],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4473, 0.5664, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:01:57,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:01:57,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.34 | bwd_microstep: 4637.09 | bwd_inner_microstep: 4629.50 | bwd_allreduce_microstep: 7.48 | step_microstep: 48.71
-[2025-01-25 18:01:57,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.30 | bwd: 4637.12 | bwd_inner: 4629.50 | bwd_allreduce: 7.54 | step: 48.71
- 57%|█████▋    | 3292/5800 [9:15:27<4:49:09,  6.92s/it]                                                       {'loss': 0.0176, 'grad_norm': 4.463839054107666, 'learning_rate': 1.66101665323334e-05, 'epoch': 28.38}
- 57%|█████▋    | 3292/5800 [9:15:27<4:49:09,  6.92s/it]score1 tensor([[0.5547],
-        [0.5430],
-        [0.4688],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5312, 0.4590, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:04,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 18:02:04,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.08 | bwd_microstep: 4639.26 | bwd_inner_microstep: 4634.25 | bwd_allreduce_microstep: 4.90 | step_microstep: 47.37
-[2025-01-25 18:02:04,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.04 | bwd: 4639.29 | bwd_inner: 4634.25 | bwd_allreduce: 4.96 | step: 47.37
- 57%|█████▋    | 3293/5800 [9:15:34<4:49:16,  6.92s/it]                                                       {'loss': 0.0142, 'grad_norm': 8.649818420410156, 'learning_rate': 1.6599160524278336e-05, 'epoch': 28.39}
- 57%|█████▋    | 3293/5800 [9:15:34<4:49:16,  6.92s/it]score1 tensor([[0.5625],
-        [0.6797],
-        [0.5586],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.6953, 0.5391, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:11,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.02 | optimizer_step: 4.36
-[2025-01-25 18:02:11,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.51 | bwd_microstep: 4593.01 | bwd_inner_microstep: 4587.24 | bwd_allreduce_microstep: 5.67 | step_microstep: 49.96
-[2025-01-25 18:02:11,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.46 | bwd: 4593.04 | bwd_inner: 4587.24 | bwd_allreduce: 5.72 | step: 49.96
- 57%|█████▋    | 3294/5800 [9:15:41<4:48:40,  6.91s/it]                                                       {'loss': 0.0107, 'grad_norm': 2.5165979862213135, 'learning_rate': 1.658815557666403e-05, 'epoch': 28.4}
- 57%|█████▋    | 3294/5800 [9:15:41<4:48:40,  6.91s/it]score1 tensor([[0.6367],
-        [0.4961],
-        [0.5195],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4980, 0.4941, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:18,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:02:18,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.87 | bwd_microstep: 4585.04 | bwd_inner_microstep: 4579.82 | bwd_allreduce_microstep: 5.13 | step_microstep: 48.84
-[2025-01-25 18:02:18,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.82 | bwd: 4585.06 | bwd_inner: 4579.82 | bwd_allreduce: 5.18 | step: 48.87
- 57%|█████▋    | 3295/5800 [9:15:48<4:48:05,  6.90s/it]                                                       {'loss': 0.0078, 'grad_norm': 2.037755012512207, 'learning_rate': 1.6577151692922018e-05, 'epoch': 28.41}
- 57%|█████▋    | 3295/5800 [9:15:48<4:48:05,  6.90s/it]score1 tensor([[0.4062],
-        [0.4238],
-        [0.4844],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4121, 0.4785, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:25,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 18:02:25,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.62 | bwd_microstep: 4627.95 | bwd_inner_microstep: 4622.97 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.50
-[2025-01-25 18:02:25,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.59 | bwd: 4627.98 | bwd_inner: 4622.97 | bwd_allreduce: 4.95 | step: 43.51
- 57%|█████▋    | 3296/5800 [9:15:55<4:48:14,  6.91s/it]                                                       {'loss': 0.0083, 'grad_norm': 3.6819024085998535, 'learning_rate': 1.65661488764835e-05, 'epoch': 28.41}
- 57%|█████▋    | 3296/5800 [9:15:55<4:48:14,  6.91s/it]score1 tensor([[0.5742],
-        [0.4512],
-        [0.4961],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4727, 0.4844, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:32,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 18:02:32,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.58 | bwd_microstep: 4636.70 | bwd_inner_microstep: 4631.68 | bwd_allreduce_microstep: 4.90 | step_microstep: 46.27
-[2025-01-25 18:02:32,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.54 | bwd: 4636.73 | bwd_inner: 4631.68 | bwd_allreduce: 4.96 | step: 46.27
- 57%|█████▋    | 3297/5800 [9:16:02<4:48:21,  6.91s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.7634965777397156, 'learning_rate': 1.655514713077934e-05, 'epoch': 28.42}
- 57%|█████▋    | 3297/5800 [9:16:02<4:48:21,  6.91s/it]score1 tensor([[0.6602],
-        [0.4980],
-        [0.5703],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.4961, 0.5781, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:39,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 18:02:39,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.27 | bwd_microstep: 4580.61 | bwd_inner_microstep: 4575.13 | bwd_allreduce_microstep: 5.36 | step_microstep: 46.33
-[2025-01-25 18:02:39,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.24 | bwd: 4580.64 | bwd_inner: 4575.13 | bwd_allreduce: 5.43 | step: 46.34
- 57%|█████▋    | 3298/5800 [9:16:09<4:47:38,  6.90s/it]                                                       {'loss': 0.0063, 'grad_norm': 2.1487607955932617, 'learning_rate': 1.6544146459240087e-05, 'epoch': 28.43}
- 57%|█████▋    | 3298/5800 [9:16:09<4:47:38,  6.90s/it]score1 tensor([[0.4707],
-        [0.4492],
-        [0.4512],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4453, 0.4492, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:46,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 18:02:46,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.20 | bwd_microstep: 4628.40 | bwd_inner_microstep: 4623.19 | bwd_allreduce_microstep: 5.10 | step_microstep: 46.57
-[2025-01-25 18:02:46,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.17 | bwd: 4628.42 | bwd_inner: 4623.19 | bwd_allreduce: 5.15 | step: 46.58
- 57%|█████▋    | 3299/5800 [9:16:16<4:47:44,  6.90s/it]                                                       {'loss': 0.0078, 'grad_norm': 3.655759572982788, 'learning_rate': 1.6533146865295925e-05, 'epoch': 28.44}
- 57%|█████▋    | 3299/5800 [9:16:16<4:47:44,  6.90s/it]score1 tensor([[0.3418],
-        [0.4766],
-        [0.6797],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.4727, 0.6797, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:53,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 18:02:53,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.42 | bwd_microstep: 4592.82 | bwd_inner_microstep: 4587.61 | bwd_allreduce_microstep: 5.05 | step_microstep: 45.54
-[2025-01-25 18:02:53,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.39 | bwd: 4592.84 | bwd_inner: 4587.61 | bwd_allreduce: 5.14 | step: 45.56
- 57%|█████▋    | 3300/5800 [9:16:23<4:47:22,  6.90s/it]                                                       {'loss': 0.0078, 'grad_norm': 1.6111376285552979, 'learning_rate': 1.652214835237673e-05, 'epoch': 28.45}
- 57%|█████▋    | 3300/5800 [9:16:23<4:47:22,  6.90s/it]score1 tensor([[0.5195],
-        [0.5039],
-        [0.5117],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4883, 0.4902, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:02:59,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 18:02:59,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4630.09 | bwd_inner_microstep: 4624.57 | bwd_allreduce_microstep: 5.41 | step_microstep: 50.38
-[2025-01-25 18:02:59,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.00 | bwd: 4630.13 | bwd_inner: 4624.57 | bwd_allreduce: 5.47 | step: 50.40
- 57%|█████▋    | 3301/5800 [9:16:29<4:47:31,  6.90s/it]                                                       {'loss': 0.0137, 'grad_norm': 8.00599479675293, 'learning_rate': 1.6511150923912018e-05, 'epoch': 28.46}
- 57%|█████▋    | 3301/5800 [9:16:29<4:47:31,  6.90s/it]score1 tensor([[0.3711],
-        [0.5312],
-        [0.5625],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.5430, 0.5977, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:06,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:03:06,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.56 | bwd_microstep: 4637.21 | bwd_inner_microstep: 4632.16 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.89
-[2025-01-25 18:03:06,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.53 | bwd: 4637.23 | bwd_inner: 4632.15 | bwd_allreduce: 5.01 | step: 43.90
- 57%|█████▋    | 3302/5800 [9:16:36<4:47:36,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.6067895889282227, 'learning_rate': 1.650015458333099e-05, 'epoch': 28.47}
- 57%|█████▋    | 3302/5800 [9:16:36<4:47:36,  6.91s/it]score1 tensor([[0.6289],
-        [0.5547],
-        [0.5781],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5664, 0.5781, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:13,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:03:13,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.07 | bwd_microstep: 4585.31 | bwd_inner_microstep: 4580.02 | bwd_allreduce_microstep: 5.19 | step_microstep: 46.08
-[2025-01-25 18:03:13,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.03 | bwd: 4585.33 | bwd_inner: 4580.02 | bwd_allreduce: 5.24 | step: 46.08
- 57%|█████▋    | 3303/5800 [9:16:43<4:46:59,  6.90s/it]                                                       {'loss': 0.0088, 'grad_norm': 6.299550533294678, 'learning_rate': 1.6489159334062485e-05, 'epoch': 28.47}
- 57%|█████▋    | 3303/5800 [9:16:43<4:46:59,  6.90s/it]score1 tensor([[0.4883],
-        [0.6914],
-        [0.4570],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6875, 0.4395, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:20,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 18:03:20,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.79 | bwd_microstep: 4643.07 | bwd_inner_microstep: 4633.97 | bwd_allreduce_microstep: 8.99 | step_microstep: 47.14
-[2025-01-25 18:03:20,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.76 | bwd: 4643.10 | bwd_inner: 4633.97 | bwd_allreduce: 9.05 | step: 47.15
- 57%|█████▋    | 3304/5800 [9:16:50<4:47:12,  6.90s/it]                                                       {'loss': 0.0132, 'grad_norm': 4.498138904571533, 'learning_rate': 1.647816517953503e-05, 'epoch': 28.48}
- 57%|█████▋    | 3304/5800 [9:16:50<4:47:12,  6.90s/it]score1 tensor([[0.5352],
-        [0.4648],
-        [0.4961],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4688, 0.4941, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:27,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 18:03:27,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.40 | bwd_microstep: 4637.67 | bwd_inner_microstep: 4632.79 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.30
-[2025-01-25 18:03:27,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.35 | bwd: 4637.70 | bwd_inner: 4632.79 | bwd_allreduce: 4.83 | step: 44.31
- 57%|█████▋    | 3305/5800 [9:16:57<4:47:31,  6.91s/it]                                                       {'loss': 0.0063, 'grad_norm': 4.110589981079102, 'learning_rate': 1.6467172123176775e-05, 'epoch': 28.49}
- 57%|█████▋    | 3305/5800 [9:16:57<4:47:31,  6.91s/it]score1 tensor([[0.6602],
-        [0.5625],
-        [0.4141],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5586, 0.3809, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:34,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 18:03:34,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.94 | bwd_microstep: 4636.97 | bwd_inner_microstep: 4632.10 | bwd_allreduce_microstep: 4.78 | step_microstep: 46.35
-[2025-01-25 18:03:34,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.91 | bwd: 4636.99 | bwd_inner: 4632.10 | bwd_allreduce: 4.81 | step: 46.36
- 57%|█████▋    | 3306/5800 [9:17:04<4:47:28,  6.92s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.283534049987793, 'learning_rate': 1.6456180168415546e-05, 'epoch': 28.5}
- 57%|█████▋    | 3306/5800 [9:17:04<4:47:28,  6.92s/it]score1 tensor([[0.4277],
-        [0.5430],
-        [0.6016],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5391, 0.5625, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:41,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:03:41,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.50 | bwd_microstep: 4575.15 | bwd_inner_microstep: 4570.00 | bwd_allreduce_microstep: 5.06 | step_microstep: 46.31
-[2025-01-25 18:03:41,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.46 | bwd: 4575.17 | bwd_inner: 4570.00 | bwd_allreduce: 5.10 | step: 46.32
- 57%|█████▋    | 3307/5800 [9:17:11<4:46:35,  6.90s/it]                                                       {'loss': 0.0112, 'grad_norm': 2.523387908935547, 'learning_rate': 1.644518931867884e-05, 'epoch': 28.51}
- 57%|█████▋    | 3307/5800 [9:17:11<4:46:35,  6.90s/it]score1 tensor([[0.5547],
-        [0.5859],
-        [0.4492],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5781, 0.4551, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:48,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 18:03:48,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.27 | bwd_microstep: 4636.86 | bwd_inner_microstep: 4631.40 | bwd_allreduce_microstep: 5.36 | step_microstep: 48.07
-[2025-01-25 18:03:48,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.23 | bwd: 4636.90 | bwd_inner: 4631.40 | bwd_allreduce: 5.40 | step: 48.07
- 57%|█████▋    | 3308/5800 [9:17:18<4:46:52,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 3.6881518363952637, 'learning_rate': 1.6434199577393778e-05, 'epoch': 28.52}
- 57%|█████▋    | 3308/5800 [9:17:18<4:46:52,  6.91s/it]score1 tensor([[0.4766],
-        [0.4805],
-        [0.4863],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4844, 0.4941, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:03:55,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 18:03:55,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.37 | bwd_microstep: 4638.69 | bwd_inner_microstep: 4633.56 | bwd_allreduce_microstep: 5.03 | step_microstep: 48.72
-[2025-01-25 18:03:55,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.33 | bwd: 4638.71 | bwd_inner: 4633.56 | bwd_allreduce: 5.08 | step: 48.73
- 57%|█████▋    | 3309/5800 [9:17:25<4:47:03,  6.91s/it]                                                       {'loss': 0.0054, 'grad_norm': 3.8601181507110596, 'learning_rate': 1.642321094798717e-05, 'epoch': 28.53}
- 57%|█████▋    | 3309/5800 [9:17:25<4:47:03,  6.91s/it]score1 tensor([[0.5156],
-        [0.4434],
-        [0.4316],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4336, 0.4531, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:02,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 18:04:02,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4634.94 | bwd_inner_microstep: 4629.79 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.83
-[2025-01-25 18:04:02,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.60 | bwd: 4634.97 | bwd_inner: 4629.79 | bwd_allreduce: 5.10 | step: 45.84
- 57%|█████▋    | 3310/5800 [9:17:32<4:46:58,  6.91s/it]                                                       {'loss': 0.0146, 'grad_norm': 4.2020134925842285, 'learning_rate': 1.6412223433885442e-05, 'epoch': 28.53}
- 57%|█████▋    | 3310/5800 [9:17:32<4:46:58,  6.91s/it]score1 tensor([[0.4609],
-        [0.5078],
-        [0.6641],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4941, 0.6836, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:09,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.21 | optimizer_step: 4.36
-[2025-01-25 18:04:09,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.55 | bwd_microstep: 4633.73 | bwd_inner_microstep: 4628.58 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.05
-[2025-01-25 18:04:09,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.52 | bwd: 4633.75 | bwd_inner: 4628.58 | bwd_allreduce: 5.07 | step: 43.05
- 57%|█████▋    | 3311/5800 [9:17:39<4:46:49,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 0.5257489681243896, 'learning_rate': 1.6401237038514703e-05, 'epoch': 28.54}
- 57%|█████▋    | 3311/5800 [9:17:39<4:46:49,  6.91s/it]score1 tensor([[0.4453],
-        [0.4844],
-        [0.3789],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4629, 0.2812, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:15,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 18:04:15,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.42 | bwd_microstep: 4632.71 | bwd_inner_microstep: 4627.57 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.88
-[2025-01-25 18:04:15,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.39 | bwd: 4632.74 | bwd_inner: 4627.57 | bwd_allreduce: 5.09 | step: 46.89
- 57%|█████▋    | 3312/5800 [9:17:45<4:46:44,  6.92s/it]                                                       {'loss': 0.0322, 'grad_norm': 7.593911170959473, 'learning_rate': 1.63902517653007e-05, 'epoch': 28.55}
- 57%|█████▋    | 3312/5800 [9:17:45<4:46:44,  6.92s/it]score1 tensor([[0.5234],
-        [0.4707],
-        [0.6797],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4668, 0.6875, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:22,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 18:04:22,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4628.96 | bwd_inner_microstep: 4623.34 | bwd_allreduce_microstep: 5.49 | step_microstep: 45.02
-[2025-01-25 18:04:22,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.29 | bwd: 4629.00 | bwd_inner: 4623.34 | bwd_allreduce: 5.55 | step: 45.02
- 57%|█████▋    | 3313/5800 [9:17:52<4:46:33,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 1.059699296951294, 'learning_rate': 1.637926761766883e-05, 'epoch': 28.56}
- 57%|█████▋    | 3313/5800 [9:17:52<4:46:33,  6.91s/it]score1 tensor([[0.4805],
-        [0.5078],
-        [0.4902],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5156, 0.4785, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:29,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 18:04:29,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.94 | bwd_microstep: 4629.80 | bwd_inner_microstep: 4624.79 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.93
-[2025-01-25 18:04:29,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.90 | bwd: 4629.82 | bwd_inner: 4624.79 | bwd_allreduce: 4.95 | step: 44.94
- 57%|█████▋    | 3314/5800 [9:17:59<4:46:25,  6.91s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.36317023634910583, 'learning_rate': 1.636828459904414e-05, 'epoch': 28.57}
- 57%|█████▋    | 3314/5800 [9:17:59<4:46:25,  6.91s/it]score1 tensor([[0.4922],
-        [0.5117],
-        [0.6094],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.5352, 0.6406, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:36,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 18:04:36,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.88 | bwd_microstep: 4541.89 | bwd_inner_microstep: 4536.72 | bwd_allreduce_microstep: 5.09 | step_microstep: 47.09
-[2025-01-25 18:04:36,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.85 | bwd: 4541.92 | bwd_inner: 4536.72 | bwd_allreduce: 5.13 | step: 47.09
- 57%|█████▋    | 3315/5800 [9:18:06<4:45:11,  6.89s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.322668075561523, 'learning_rate': 1.635730271285133e-05, 'epoch': 28.58}
- 57%|█████▋    | 3315/5800 [9:18:06<4:45:11,  6.89s/it]score1 tensor([[0.4473],
-        [0.4453],
-        [0.6094],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4473, 0.6523, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:43,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 18:04:43,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.10 | bwd_microstep: 4631.43 | bwd_inner_microstep: 4626.56 | bwd_allreduce_microstep: 4.78 | step_microstep: 43.97
-[2025-01-25 18:04:43,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.06 | bwd: 4631.45 | bwd_inner: 4626.56 | bwd_allreduce: 4.82 | step: 43.98
- 57%|█████▋    | 3316/5800 [9:18:13<4:45:27,  6.90s/it]                                                       {'loss': 0.0161, 'grad_norm': 8.188945770263672, 'learning_rate': 1.634632196251474e-05, 'epoch': 28.59}
- 57%|█████▋    | 3316/5800 [9:18:13<4:45:27,  6.90s/it]score1 tensor([[0.6133],
-        [0.5430],
-        [0.3613],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5391, 0.3438, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:50,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:04:50,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.61 | bwd_microstep: 4634.96 | bwd_inner_microstep: 4629.92 | bwd_allreduce_microstep: 4.97 | step_microstep: 41.65
-[2025-01-25 18:04:50,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.58 | bwd: 4634.99 | bwd_inner: 4629.92 | bwd_allreduce: 5.00 | step: 41.66
- 57%|█████▋    | 3317/5800 [9:18:20<4:45:44,  6.90s/it]                                                       {'loss': 0.0117, 'grad_norm': 8.069665908813477, 'learning_rate': 1.6335342351458362e-05, 'epoch': 28.59}
- 57%|█████▋    | 3317/5800 [9:18:20<4:45:44,  6.90s/it]score1 tensor([[0.5703],
-        [0.6406],
-        [0.5742],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.6172, 0.5625, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:04:57,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:04:57,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.63 | bwd_microstep: 4631.52 | bwd_inner_microstep: 4626.17 | bwd_allreduce_microstep: 5.24 | step_microstep: 42.73
-[2025-01-25 18:04:57,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.59 | bwd: 4631.54 | bwd_inner: 4626.17 | bwd_allreduce: 5.29 | step: 42.75
- 57%|█████▋    | 3318/5800 [9:18:27<4:45:45,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 4.299646377563477, 'learning_rate': 1.6324363883105822e-05, 'epoch': 28.6}
- 57%|█████▋    | 3318/5800 [9:18:27<4:45:45,  6.91s/it]score1 tensor([[0.4551],
-        [0.5234],
-        [0.5078],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4805, 0.4980, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:04,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 18:05:04,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.37 | bwd_microstep: 4643.09 | bwd_inner_microstep: 4638.21 | bwd_allreduce_microstep: 4.73 | step_microstep: 45.58
-[2025-01-25 18:05:04,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.33 | bwd: 4643.11 | bwd_inner: 4638.21 | bwd_allreduce: 4.77 | step: 45.59
- 57%|█████▋    | 3319/5800 [9:18:34<4:45:52,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 8.19841480255127, 'learning_rate': 1.6313386560880407e-05, 'epoch': 28.61}
- 57%|█████▋    | 3319/5800 [9:18:34<4:45:52,  6.91s/it]score1 tensor([[0.6562],
-        [0.5430],
-        [0.5234],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5391, 0.5156, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:11,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 18:05:11,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.22 | bwd_microstep: 4642.07 | bwd_inner_microstep: 4637.29 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.89
-[2025-01-25 18:05:11,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.18 | bwd: 4642.10 | bwd_inner: 4637.29 | bwd_allreduce: 4.75 | step: 42.90
- 57%|█████▋    | 3320/5800 [9:18:41<4:45:55,  6.92s/it]                                                       {'loss': 0.0127, 'grad_norm': 9.085944175720215, 'learning_rate': 1.630241038820502e-05, 'epoch': 28.62}
- 57%|█████▋    | 3320/5800 [9:18:41<4:45:55,  6.92s/it]score1 tensor([[0.5859],
-        [0.4688],
-        [0.4434],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4629, 0.4316, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:18,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 18:05:18,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.04 | bwd_microstep: 4648.09 | bwd_inner_microstep: 4638.70 | bwd_allreduce_microstep: 9.27 | step_microstep: 43.01
-[2025-01-25 18:05:18,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.01 | bwd: 4648.12 | bwd_inner: 4638.70 | bwd_allreduce: 9.33 | step: 43.02
- 57%|█████▋    | 3321/5800 [9:18:48<4:45:57,  6.92s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.316805839538574, 'learning_rate': 1.6291435368502243e-05, 'epoch': 28.63}
- 57%|█████▋    | 3321/5800 [9:18:48<4:45:57,  6.92s/it]score1 tensor([[0.3770],
-        [0.4336],
-        [0.4395],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4121, 0.4375, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:25,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 18:05:25,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.28 | bwd_microstep: 4647.99 | bwd_inner_microstep: 4642.89 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.46
-[2025-01-25 18:05:25,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.24 | bwd: 4648.02 | bwd_inner: 4642.89 | bwd_allreduce: 5.05 | step: 42.47
- 57%|█████▋    | 3322/5800 [9:18:55<4:45:58,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 3.729156970977783, 'learning_rate': 1.628046150519426e-05, 'epoch': 28.64}
- 57%|█████▋    | 3322/5800 [9:18:55<4:45:58,  6.92s/it]score1 tensor([[0.5117],
-        [0.4727],
-        [0.4688],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4863, 0.4902, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:32,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 18:05:32,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.10 | bwd_microstep: 4635.26 | bwd_inner_microstep: 4630.26 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.92
-[2025-01-25 18:05:32,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.04 | bwd: 4635.29 | bwd_inner: 4630.26 | bwd_allreduce: 4.95 | step: 42.94
- 57%|█████▋    | 3323/5800 [9:19:02<4:45:50,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 4.005037307739258, 'learning_rate': 1.6269488801702927e-05, 'epoch': 28.65}
- 57%|█████▋    | 3323/5800 [9:19:02<4:45:50,  6.92s/it]score1 tensor([[0.4805],
-        [0.6094],
-        [0.4336],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6484, 0.4570, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:38,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 18:05:38,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.89 | bwd_microstep: 4635.56 | bwd_inner_microstep: 4630.36 | bwd_allreduce_microstep: 5.11 | step_microstep: 44.77
-[2025-01-25 18:05:38,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.84 | bwd: 4635.58 | bwd_inner: 4630.36 | bwd_allreduce: 5.15 | step: 44.79
- 57%|█████▋    | 3324/5800 [9:19:08<4:45:47,  6.93s/it]                                                       {'loss': 0.0215, 'grad_norm': 7.92699670791626, 'learning_rate': 1.6258517261449716e-05, 'epoch': 28.66}
- 57%|█████▋    | 3324/5800 [9:19:08<4:45:47,  6.93s/it]score1 tensor([[0.4766],
-        [0.5352],
-        [0.3711],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5469, 0.3711, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:45,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 18:05:45,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.80 | bwd_microstep: 4589.88 | bwd_inner_microstep: 4584.70 | bwd_allreduce_microstep: 5.06 | step_microstep: 42.14
-[2025-01-25 18:05:45,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.75 | bwd: 4589.91 | bwd_inner: 4584.70 | bwd_allreduce: 5.12 | step: 42.15
- 57%|█████▋    | 3325/5800 [9:19:15<4:45:07,  6.91s/it]                                                       {'loss': 0.0093, 'grad_norm': 6.182032585144043, 'learning_rate': 1.6247546887855733e-05, 'epoch': 28.66}
- 57%|█████▋    | 3325/5800 [9:19:15<4:45:07,  6.91s/it]score1 tensor([[0.4766],
-        [0.4160],
-        [0.4258],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4180, 0.4492, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:52,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 18:05:52,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.70 | bwd_microstep: 4638.77 | bwd_inner_microstep: 4633.64 | bwd_allreduce_microstep: 5.02 | step_microstep: 42.37
-[2025-01-25 18:05:52,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.66 | bwd: 4638.79 | bwd_inner: 4633.64 | bwd_allreduce: 5.07 | step: 42.38
- 57%|█████▋    | 3326/5800 [9:19:22<4:45:11,  6.92s/it]                                                       {'loss': 0.0127, 'grad_norm': 7.949122428894043, 'learning_rate': 1.623657768434175e-05, 'epoch': 28.67}
- 57%|█████▋    | 3326/5800 [9:19:22<4:45:11,  6.92s/it]score1 tensor([[0.4355],
-        [0.3320],
-        [0.5547],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.3340, 0.5352, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:05:59,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 18:05:59,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.87 | bwd_microstep: 4644.85 | bwd_inner_microstep: 4639.96 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.15
-[2025-01-25 18:05:59,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.83 | bwd: 4644.87 | bwd_inner: 4639.96 | bwd_allreduce: 4.84 | step: 41.16
- 57%|█████▋    | 3327/5800 [9:19:29<4:45:11,  6.92s/it]                                                       {'loss': 0.0093, 'grad_norm': 3.461841583251953, 'learning_rate': 1.6225609654328138e-05, 'epoch': 28.68}
- 57%|█████▋    | 3327/5800 [9:19:29<4:45:11,  6.92s/it]score1 tensor([[0.5898],
-        [0.4316],
-        [0.4961],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4609, 0.5156, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:06,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 18:06:06,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.27 | bwd_microstep: 4643.22 | bwd_inner_microstep: 4638.12 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.94
-[2025-01-25 18:06:06,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.23 | bwd: 4643.24 | bwd_inner: 4638.12 | bwd_allreduce: 5.04 | step: 44.94
- 57%|█████▋    | 3328/5800 [9:19:36<4:45:09,  6.92s/it]                                                       {'loss': 0.019, 'grad_norm': 3.8191537857055664, 'learning_rate': 1.6214642801234937e-05, 'epoch': 28.69}
- 57%|█████▋    | 3328/5800 [9:19:36<4:45:09,  6.92s/it]score1 tensor([[0.5117],
-        [0.5273],
-        [0.4766],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5430, 0.4941, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:13,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:06:13,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.51 | bwd_microstep: 4638.13 | bwd_inner_microstep: 4632.55 | bwd_allreduce_microstep: 5.47 | step_microstep: 45.12
-[2025-01-25 18:06:13,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.46 | bwd: 4638.15 | bwd_inner: 4632.55 | bwd_allreduce: 5.53 | step: 45.13
- 57%|█████▋    | 3329/5800 [9:19:43<4:45:04,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 4.044074535369873, 'learning_rate': 1.620367712848179e-05, 'epoch': 28.7}
- 57%|█████▋    | 3329/5800 [9:19:43<4:45:04,  6.92s/it]score1 tensor([[0.4922],
-        [0.3730],
-        [0.4453],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3398, 0.4473, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:20,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 18:06:20,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.49 | bwd_microstep: 4638.96 | bwd_inner_microstep: 4633.93 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.37
-[2025-01-25 18:06:20,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.45 | bwd: 4638.98 | bwd_inner: 4633.93 | bwd_allreduce: 4.97 | step: 44.38
- 57%|█████▋    | 3330/5800 [9:19:50<4:45:05,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.627945065498352, 'learning_rate': 1.619271263948798e-05, 'epoch': 28.71}
- 57%|█████▋    | 3330/5800 [9:19:50<4:45:05,  6.93s/it]score1 tensor([[0.5117],
-        [0.4453],
-        [0.6250],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4375, 0.6094, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:27,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:06:27,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.71 | bwd_microstep: 4642.31 | bwd_inner_microstep: 4637.41 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.68
-[2025-01-25 18:06:27,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.66 | bwd: 4642.35 | bwd_inner: 4637.41 | bwd_allreduce: 4.85 | step: 42.69
- 57%|█████▋    | 3331/5800 [9:19:57<4:45:06,  6.93s/it]                                                       {'loss': 0.0098, 'grad_norm': 4.325037956237793, 'learning_rate': 1.6181749337672434e-05, 'epoch': 28.72}
- 57%|█████▋    | 3331/5800 [9:19:57<4:45:06,  6.93s/it]score1 tensor([[0.4805],
-        [0.6016],
-        [0.5312],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5703, 0.4961, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:34,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 18:06:34,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.96 | bwd_microstep: 4636.89 | bwd_inner_microstep: 4631.69 | bwd_allreduce_microstep: 5.09 | step_microstep: 48.44
-[2025-01-25 18:06:34,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.93 | bwd: 4636.91 | bwd_inner: 4631.69 | bwd_allreduce: 5.15 | step: 48.46
- 57%|█████▋    | 3332/5800 [9:20:04<4:45:00,  6.93s/it]                                                       {'loss': 0.021, 'grad_norm': 4.3388991355896, 'learning_rate': 1.617078722645369e-05, 'epoch': 28.72}
- 57%|█████▋    | 3332/5800 [9:20:04<4:45:00,  6.93s/it]score1 tensor([[0.5664],
-        [0.3633],
-        [0.4648],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3477, 0.4355, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:41,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 18:06:41,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.92 | bwd_microstep: 4636.39 | bwd_inner_microstep: 4631.12 | bwd_allreduce_microstep: 5.17 | step_microstep: 43.44
-[2025-01-25 18:06:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.88 | bwd: 4636.42 | bwd_inner: 4631.12 | bwd_allreduce: 5.22 | step: 43.44
- 57%|█████▋    | 3333/5800 [9:20:11<4:44:54,  6.93s/it]                                                       {'loss': 0.0249, 'grad_norm': 7.665338516235352, 'learning_rate': 1.6159826309249945e-05, 'epoch': 28.73}
- 57%|█████▋    | 3333/5800 [9:20:11<4:44:54,  6.93s/it]score1 tensor([[0.4785],
-        [0.5117],
-        [0.5078],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.4922, 0.4785, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:48,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 18:06:48,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.83 | bwd_microstep: 4639.95 | bwd_inner_microstep: 4635.03 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.87
-[2025-01-25 18:06:48,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.79 | bwd: 4639.97 | bwd_inner: 4635.03 | bwd_allreduce: 4.87 | step: 43.89
- 57%|█████▋    | 3334/5800 [9:20:18<4:44:45,  6.93s/it]                                                       {'loss': 0.0244, 'grad_norm': 8.220844268798828, 'learning_rate': 1.614886658947898e-05, 'epoch': 28.74}
- 57%|█████▋    | 3334/5800 [9:20:18<4:44:45,  6.93s/it]score1 tensor([[0.4375],
-        [0.4238],
-        [0.5117],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4043, 0.4824, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:06:55,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 18:06:55,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.09 | bwd_microstep: 4635.99 | bwd_inner_microstep: 4631.11 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.64
-[2025-01-25 18:06:55,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.05 | bwd: 4636.01 | bwd_inner: 4631.11 | bwd_allreduce: 4.84 | step: 42.65
- 57%|█████▊    | 3335/5800 [9:20:25<4:44:44,  6.93s/it]                                                       {'loss': 0.0161, 'grad_norm': 7.9030303955078125, 'learning_rate': 1.6137908070558243e-05, 'epoch': 28.75}
- 57%|█████▊    | 3335/5800 [9:20:25<4:44:44,  6.93s/it]score1 tensor([[0.6641],
-        [0.4473],
-        [0.5938],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4316, 0.5898, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:02,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 18:07:02,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.26 | bwd_microstep: 4641.11 | bwd_inner_microstep: 4635.89 | bwd_allreduce_microstep: 5.15 | step_microstep: 46.71
-[2025-01-25 18:07:02,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.23 | bwd: 4641.14 | bwd_inner: 4635.89 | bwd_allreduce: 5.19 | step: 46.72
- 58%|█████▊    | 3336/5800 [9:20:32<4:44:34,  6.93s/it]                                                       {'loss': 0.0205, 'grad_norm': 4.423964500427246, 'learning_rate': 1.6126950755904785e-05, 'epoch': 28.76}
- 58%|█████▊    | 3336/5800 [9:20:32<4:44:34,  6.93s/it]score1 tensor([[0.4004],
-        [0.5586],
-        [0.5000],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5430, 0.4805, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:08,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 18:07:08,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.10 | bwd_microstep: 4645.60 | bwd_inner_microstep: 4640.43 | bwd_allreduce_microstep: 5.06 | step_microstep: 42.40
-[2025-01-25 18:07:08,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.07 | bwd: 4645.62 | bwd_inner: 4640.44 | bwd_allreduce: 5.11 | step: 42.40
- 58%|█████▊    | 3337/5800 [9:20:38<4:44:28,  6.93s/it]                                                       {'loss': 0.0225, 'grad_norm': 4.95258903503418, 'learning_rate': 1.611599464893527e-05, 'epoch': 28.77}
- 58%|█████▊    | 3337/5800 [9:20:38<4:44:28,  6.93s/it]score1 tensor([[0.5430],
-        [0.6094],
-        [0.5781],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6133, 0.5742, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:15,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 18:07:15,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.46 | bwd_microstep: 4652.82 | bwd_inner_microstep: 4647.75 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.00
-[2025-01-25 18:07:15,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.41 | bwd: 4652.85 | bwd_inner: 4647.75 | bwd_allreduce: 5.02 | step: 44.00
- 58%|█████▊    | 3338/5800 [9:20:45<4:44:31,  6.93s/it]                                                       {'loss': 0.0068, 'grad_norm': 0.4360138773918152, 'learning_rate': 1.6105039753066032e-05, 'epoch': 28.78}
- 58%|█████▊    | 3338/5800 [9:20:45<4:44:31,  6.93s/it]score1 tensor([[0.4375],
-        [0.4648],
-        [0.4199],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4648, 0.4277, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:22,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 18:07:22,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4583.17 | bwd_inner_microstep: 4577.36 | bwd_allreduce_microstep: 5.70 | step_microstep: 45.04
-[2025-01-25 18:07:22,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.60 | bwd: 4583.20 | bwd_inner: 4577.36 | bwd_allreduce: 5.76 | step: 45.05
- 58%|█████▊    | 3339/5800 [9:20:52<4:43:36,  6.91s/it]                                                       {'loss': 0.0103, 'grad_norm': 1.5351378917694092, 'learning_rate': 1.609408607171297e-05, 'epoch': 28.78}
- 58%|█████▊    | 3339/5800 [9:20:52<4:43:36,  6.91s/it]score1 tensor([[0.4336],
-        [0.3672],
-        [0.4590],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3672, 0.4570, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:29,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 18:07:29,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.19 | bwd_microstep: 4589.14 | bwd_inner_microstep: 4584.00 | bwd_allreduce_microstep: 5.04 | step_microstep: 41.38
-[2025-01-25 18:07:29,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.16 | bwd: 4589.17 | bwd_inner: 4584.00 | bwd_allreduce: 5.09 | step: 41.39
- 58%|█████▊    | 3340/5800 [9:20:59<4:43:04,  6.90s/it]                                                       {'loss': 0.0068, 'grad_norm': 1.8246244192123413, 'learning_rate': 1.608313360829165e-05, 'epoch': 28.79}
- 58%|█████▊    | 3340/5800 [9:20:59<4:43:04,  6.90s/it]score1 tensor([[0.4707],
-        [0.1543],
-        [0.4883],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.1787, 0.5000, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0110, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:36,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 18:07:36,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.34 | bwd_microstep: 4591.94 | bwd_inner_microstep: 4586.91 | bwd_allreduce_microstep: 4.92 | step_microstep: 47.42
-[2025-01-25 18:07:36,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.30 | bwd: 4591.97 | bwd_inner: 4586.91 | bwd_allreduce: 4.98 | step: 47.43
- 58%|█████▊    | 3341/5800 [9:21:06<4:42:43,  6.90s/it]                                                       {'loss': 0.011, 'grad_norm': 4.955446243286133, 'learning_rate': 1.6072182366217242e-05, 'epoch': 28.8}
- 58%|█████▊    | 3341/5800 [9:21:06<4:42:43,  6.90s/it]score1 tensor([[0.5039],
-        [0.5391],
-        [0.6055],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5664, 0.6094, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:43,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 18:07:43,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.56 | bwd_microstep: 4591.05 | bwd_inner_microstep: 4586.05 | bwd_allreduce_microstep: 4.92 | step_microstep: 45.34
-[2025-01-25 18:07:43,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.53 | bwd: 4591.07 | bwd_inner: 4586.05 | bwd_allreduce: 4.96 | step: 45.34
- 58%|█████▊    | 3342/5800 [9:21:13<4:42:19,  6.89s/it]                                                       {'loss': 0.0176, 'grad_norm': 2.279188871383667, 'learning_rate': 1.6061232348904515e-05, 'epoch': 28.81}
- 58%|█████▊    | 3342/5800 [9:21:13<4:42:19,  6.89s/it]score1 tensor([[0.4453],
-        [0.6562],
-        [0.6602],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.6250, 0.6484, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:50,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 18:07:50,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.30 | bwd_microstep: 4642.11 | bwd_inner_microstep: 4637.09 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.52
-[2025-01-25 18:07:50,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.25 | bwd: 4642.14 | bwd_inner: 4637.09 | bwd_allreduce: 4.97 | step: 43.53
- 58%|█████▊    | 3343/5800 [9:21:20<4:42:43,  6.90s/it]                                                       {'loss': 0.0156, 'grad_norm': 4.769585132598877, 'learning_rate': 1.605028355976789e-05, 'epoch': 28.82}
- 58%|█████▊    | 3343/5800 [9:21:20<4:42:43,  6.90s/it]score1 tensor([[0.4023],
-        [0.4160],
-        [0.4551],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.3262, 0.4551, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:07:57,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.54 | optimizer_step: 4.37
-[2025-01-25 18:07:57,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.71 | bwd_microstep: 4544.93 | bwd_inner_microstep: 4540.06 | bwd_allreduce_microstep: 4.78 | step_microstep: 46.43
-[2025-01-25 18:07:57,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.68 | bwd: 4544.95 | bwd_inner: 4540.06 | bwd_allreduce: 4.82 | step: 46.44
- 58%|█████▊    | 3344/5800 [9:21:27<4:41:55,  6.89s/it]                                                       {'loss': 0.0254, 'grad_norm': 3.5413880348205566, 'learning_rate': 1.603933600222138e-05, 'epoch': 28.83}
- 58%|█████▊    | 3344/5800 [9:21:27<4:41:55,  6.89s/it]score1 tensor([[0.5312],
-        [0.4277],
-        [0.6211],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4219, 0.6016, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:04,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.36
-[2025-01-25 18:08:04,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.75 | bwd_microstep: 4635.67 | bwd_inner_microstep: 4629.71 | bwd_allreduce_microstep: 5.81 | step_microstep: 45.25
-[2025-01-25 18:08:04,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.72 | bwd: 4635.69 | bwd_inner: 4629.71 | bwd_allreduce: 5.90 | step: 45.25
- 58%|█████▊    | 3345/5800 [9:21:34<4:42:13,  6.90s/it]                                                       {'loss': 0.0146, 'grad_norm': 4.024076461791992, 'learning_rate': 1.6028389679678634e-05, 'epoch': 28.84}
- 58%|█████▊    | 3345/5800 [9:21:34<4:42:13,  6.90s/it]score1 tensor([[0.4199],
-        [0.4883],
-        [0.4707],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.4922, 0.5391, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:11,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 18:08:11,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.53 | bwd_microstep: 4645.51 | bwd_inner_microstep: 4640.73 | bwd_allreduce_microstep: 4.70 | step_microstep: 44.77
-[2025-01-25 18:08:11,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.40 | bwd: 4645.53 | bwd_inner: 4640.73 | bwd_allreduce: 4.74 | step: 44.77
- 58%|█████▊    | 3346/5800 [9:21:41<4:42:39,  6.91s/it]                                                       {'loss': 0.0215, 'grad_norm': 0.43105778098106384, 'learning_rate': 1.601744459555289e-05, 'epoch': 28.84}
- 58%|█████▊    | 3346/5800 [9:21:41<4:42:39,  6.91s/it]score1 tensor([[0.5430],
-        [0.5078],
-        [0.4824],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5195, 0.4883, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:18,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.07 | optimizer_step: 4.37
-[2025-01-25 18:08:18,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.21 | bwd_microstep: 4638.96 | bwd_inner_microstep: 4634.04 | bwd_allreduce_microstep: 4.80 | step_microstep: 47.59
-[2025-01-25 18:08:18,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.18 | bwd: 4638.98 | bwd_inner: 4634.04 | bwd_allreduce: 4.86 | step: 47.59
- 58%|█████▊    | 3347/5800 [9:21:47<4:42:45,  6.92s/it]                                                       {'loss': 0.0112, 'grad_norm': 3.8794100284576416, 'learning_rate': 1.6006500753257035e-05, 'epoch': 28.85}
- 58%|█████▊    | 3347/5800 [9:21:48<4:42:45,  6.92s/it]score1 tensor([[0.4062],
-        [0.4492],
-        [0.5820],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3906, 0.4590, 0.6211, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:24,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 18:08:24,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.87 | bwd_microstep: 4638.56 | bwd_inner_microstep: 4632.76 | bwd_allreduce_microstep: 5.65 | step_microstep: 43.31
-[2025-01-25 18:08:24,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.84 | bwd: 4638.59 | bwd_inner: 4632.77 | bwd_allreduce: 5.73 | step: 43.32
- 58%|█████▊    | 3348/5800 [9:21:54<4:42:43,  6.92s/it]                                                       {'loss': 0.0249, 'grad_norm': 0.4599483609199524, 'learning_rate': 1.5995558156203536e-05, 'epoch': 28.86}
- 58%|█████▊    | 3348/5800 [9:21:54<4:42:43,  6.92s/it]score1 tensor([[0.4492],
-        [0.5156],
-        [0.3301],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5195, 0.3457, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:31,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 18:08:31,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.57 | bwd_microstep: 4641.62 | bwd_inner_microstep: 4636.58 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.48
-[2025-01-25 18:08:31,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.53 | bwd: 4641.64 | bwd_inner: 4636.58 | bwd_allreduce: 5.00 | step: 44.48
- 58%|█████▊    | 3349/5800 [9:22:01<4:42:46,  6.92s/it]                                                       {'loss': 0.0083, 'grad_norm': 7.69687032699585, 'learning_rate': 1.598461680780448e-05, 'epoch': 28.87}
- 58%|█████▊    | 3349/5800 [9:22:01<4:42:46,  6.92s/it]score1 tensor([[0.4043],
-        [0.3555],
-        [0.4297],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3887, 0.3750, 0.4434, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:38,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 18:08:38,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.33 | bwd_microstep: 4639.82 | bwd_inner_microstep: 4634.45 | bwd_allreduce_microstep: 5.29 | step_microstep: 46.96
-[2025-01-25 18:08:38,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.27 | bwd: 4639.84 | bwd_inner: 4634.45 | bwd_allreduce: 5.33 | step: 46.96
- 58%|█████▊    | 3350/5800 [9:22:08<4:42:49,  6.93s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.840860366821289, 'learning_rate': 1.5973676711471586e-05, 'epoch': 28.88}
- 58%|█████▊    | 3350/5800 [9:22:08<4:42:49,  6.93s/it]score1 tensor([[0.5273],
-        [0.4395],
-        [0.5391],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4473, 0.5117, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:45,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:08:45,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.10 | bwd_microstep: 4646.67 | bwd_inner_microstep: 4641.67 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.62
-[2025-01-25 18:08:45,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.07 | bwd: 4646.69 | bwd_inner: 4641.67 | bwd_allreduce: 4.95 | step: 43.63
- 58%|█████▊    | 3351/5800 [9:22:15<4:42:47,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.42429107427597046, 'learning_rate': 1.5962737870616144e-05, 'epoch': 28.89}
- 58%|█████▊    | 3351/5800 [9:22:15<4:42:47,  6.93s/it]score1 tensor([[0.6055],
-        [0.3926],
-        [0.4160],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4004, 0.4258, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:52,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 18:08:52,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.50 | bwd_microstep: 4638.72 | bwd_inner_microstep: 4633.80 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.50
-[2025-01-25 18:08:52,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.47 | bwd: 4638.74 | bwd_inner: 4633.80 | bwd_allreduce: 4.88 | step: 43.51
- 58%|█████▊    | 3352/5800 [9:22:22<4:42:40,  6.93s/it]                                                       {'loss': 0.0098, 'grad_norm': 7.754148006439209, 'learning_rate': 1.59518002886491e-05, 'epoch': 28.9}
- 58%|█████▊    | 3352/5800 [9:22:22<4:42:40,  6.93s/it]score1 tensor([[0.6055],
-        [0.6055],
-        [0.5156],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6211, 0.4941, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:08:59,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:08:59,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.85 | bwd_microstep: 4638.47 | bwd_inner_microstep: 4632.56 | bwd_allreduce_microstep: 5.82 | step_microstep: 45.63
-[2025-01-25 18:08:59,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.82 | bwd: 4638.50 | bwd_inner: 4632.56 | bwd_allreduce: 5.86 | step: 45.64
- 58%|█████▊    | 3353/5800 [9:22:29<4:42:29,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 0.8068718910217285, 'learning_rate': 1.594086396898096e-05, 'epoch': 28.91}
- 58%|█████▊    | 3353/5800 [9:22:29<4:42:29,  6.93s/it]score1 tensor([[0.5586],
-        [0.6172],
-        [0.7188],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6016, 0.7070, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:06,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 18:09:06,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.21 | bwd_microstep: 4590.25 | bwd_inner_microstep: 4585.12 | bwd_allreduce_microstep: 5.02 | step_microstep: 45.76
-[2025-01-25 18:09:06,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.16 | bwd: 4590.27 | bwd_inner: 4585.12 | bwd_allreduce: 5.07 | step: 45.78
- 58%|█████▊    | 3354/5800 [9:22:36<4:41:50,  6.91s/it]                                                       {'loss': 0.0098, 'grad_norm': 6.740665435791016, 'learning_rate': 1.592992891502187e-05, 'epoch': 28.91}
- 58%|█████▊    | 3354/5800 [9:22:36<4:41:50,  6.91s/it]score1 tensor([[0.5664],
-        [0.4473],
-        [0.6484],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4238, 0.6445, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:13,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.14 | optimizer_step: 4.37
-[2025-01-25 18:09:13,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.49 | bwd_microstep: 4643.79 | bwd_inner_microstep: 4638.73 | bwd_allreduce_microstep: 4.96 | step_microstep: 45.49
-[2025-01-25 18:09:13,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.44 | bwd: 4643.81 | bwd_inner: 4638.73 | bwd_allreduce: 5.01 | step: 45.50
- 58%|█████▊    | 3355/5800 [9:22:43<4:41:56,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.25633716583252, 'learning_rate': 1.591899513018157e-05, 'epoch': 28.92}
- 58%|█████▊    | 3355/5800 [9:22:43<4:41:56,  6.92s/it]score1 tensor([[0.5039],
-        [0.5391],
-        [0.5352],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5508, 0.5273, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:20,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:09:20,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.58 | bwd_microstep: 4642.54 | bwd_inner_microstep: 4637.15 | bwd_allreduce_microstep: 5.28 | step_microstep: 46.25
-[2025-01-25 18:09:20,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.53 | bwd: 4642.57 | bwd_inner: 4637.15 | bwd_allreduce: 5.34 | step: 46.23
- 58%|█████▊    | 3356/5800 [9:22:50<4:41:55,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 4.006093978881836, 'learning_rate': 1.590806261786939e-05, 'epoch': 28.93}
- 58%|█████▊    | 3356/5800 [9:22:50<4:41:55,  6.92s/it]score1 tensor([[0.6211],
-        [0.4785],
-        [0.4043],
-        [0.3613]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4727, 0.3984, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:27,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 18:09:27,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.15 | bwd_microstep: 4637.95 | bwd_inner_microstep: 4632.81 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.93
-[2025-01-25 18:09:27,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.10 | bwd: 4637.97 | bwd_inner: 4632.81 | bwd_allreduce: 5.09 | step: 44.94
- 58%|█████▊    | 3357/5800 [9:22:57<4:41:44,  6.92s/it]                                                       {'loss': 0.0088, 'grad_norm': 3.265709638595581, 'learning_rate': 1.5897131381494294e-05, 'epoch': 28.94}
- 58%|█████▊    | 3357/5800 [9:22:57<4:41:44,  6.92s/it]score1 tensor([[0.4531],
-        [0.6055],
-        [0.5586],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5664, 0.5352, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:34,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 18:09:34,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.95 | bwd_microstep: 4637.41 | bwd_inner_microstep: 4631.98 | bwd_allreduce_microstep: 5.34 | step_microstep: 45.71
-[2025-01-25 18:09:34,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.92 | bwd: 4637.44 | bwd_inner: 4631.98 | bwd_allreduce: 5.38 | step: 45.72
- 58%|█████▊    | 3358/5800 [9:23:04<4:41:40,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.172894477844238, 'learning_rate': 1.588620142446482e-05, 'epoch': 28.95}
- 58%|█████▊    | 3358/5800 [9:23:04<4:41:40,  6.92s/it]score1 tensor([[0.4297],
-        [0.5664],
-        [0.5703],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.5625, 0.5508, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:41,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:09:41,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.65 | bwd_microstep: 4602.12 | bwd_inner_microstep: 4596.80 | bwd_allreduce_microstep: 5.23 | step_microstep: 48.40
-[2025-01-25 18:09:41,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.58 | bwd: 4602.15 | bwd_inner: 4596.80 | bwd_allreduce: 5.28 | step: 48.41
- 58%|█████▊    | 3359/5800 [9:23:11<4:41:10,  6.91s/it]                                                       {'loss': 0.0093, 'grad_norm': 6.237989902496338, 'learning_rate': 1.5875272750189126e-05, 'epoch': 28.96}
- 58%|█████▊    | 3359/5800 [9:23:11<4:41:10,  6.91s/it]score1 tensor([[0.5938],
-        [0.5859],
-        [0.4453],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5781, 0.4629, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:47,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:09:47,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.47 | bwd_microstep: 4596.69 | bwd_inner_microstep: 4591.66 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.01
-[2025-01-25 18:09:47,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.44 | bwd: 4596.71 | bwd_inner: 4591.66 | bwd_allreduce: 4.98 | step: 44.02
- 58%|█████▊    | 3360/5800 [9:23:17<4:40:43,  6.90s/it]                                                       {'loss': 0.0083, 'grad_norm': 1.958743691444397, 'learning_rate': 1.5864345362074963e-05, 'epoch': 28.97}
- 58%|█████▊    | 3360/5800 [9:23:17<4:40:43,  6.90s/it]score1 tensor([[0.4824],
-        [0.5547],
-        [0.5312],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5469, 0.5586, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:09:54,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 18:09:54,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.38 | bwd_microstep: 4633.44 | bwd_inner_microstep: 4628.55 | bwd_allreduce_microstep: 4.81 | step_microstep: 44.73
-[2025-01-25 18:09:54,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.33 | bwd: 4633.46 | bwd_inner: 4628.55 | bwd_allreduce: 4.85 | step: 44.73
- 58%|█████▊    | 3361/5800 [9:23:24<4:40:49,  6.91s/it]                                                       {'loss': 0.0142, 'grad_norm': 3.94051194190979, 'learning_rate': 1.5853419263529668e-05, 'epoch': 28.97}
- 58%|█████▊    | 3361/5800 [9:23:24<4:40:49,  6.91s/it]score1 tensor([[0.5664],
-        [0.6172],
-        [0.6211],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5977, 0.6172, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:10:01,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 18:10:01,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.83 | bwd_microstep: 4638.20 | bwd_inner_microstep: 4632.84 | bwd_allreduce_microstep: 5.25 | step_microstep: 46.00
-[2025-01-25 18:10:01,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.79 | bwd: 4638.23 | bwd_inner: 4632.84 | bwd_allreduce: 5.30 | step: 46.00
- 58%|█████▊    | 3362/5800 [9:23:31<4:40:52,  6.91s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.889665126800537, 'learning_rate': 1.58424944579602e-05, 'epoch': 28.98}
- 58%|█████▊    | 3362/5800 [9:23:31<4:40:52,  6.91s/it]score1 tensor([[0.4746],
-        [0.6484],
-        [0.5820],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6562, 0.5781, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:10:08,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 18:10:08,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.45 | bwd_microstep: 4634.87 | bwd_inner_microstep: 4629.82 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.51
-[2025-01-25 18:10:08,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.41 | bwd: 4634.89 | bwd_inner: 4629.82 | bwd_allreduce: 4.99 | step: 45.51
- 58%|█████▊    | 3363/5800 [9:23:38<4:40:57,  6.92s/it]                                                       {'loss': 0.0112, 'grad_norm': 3.8729958534240723, 'learning_rate': 1.5831570948773097e-05, 'epoch': 28.99}
- 58%|█████▊    | 3363/5800 [9:23:38<4:40:57,  6.92s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:10:13,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.57 | optimizer_step: 4.36
-[2025-01-25 18:10:13,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 574.52 | bwd_microstep: 1222.05 | bwd_inner_microstep: 1216.78 | bwd_allreduce_microstep: 5.16 | step_microstep: 45.92
-[2025-01-25 18:10:13,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 574.48 | bwd: 1222.08 | bwd_inner: 1216.78 | bwd_allreduce: 5.21 | step: 45.93
- 58%|█████▊    | 3364/5800 [9:23:43<4:10:45,  6.18s/it]                                                       {'loss': 0.0352, 'grad_norm': 7.4521403312683105, 'learning_rate': 1.58206487393745e-05, 'epoch': 29.0}
- 58%|█████▊    | 3364/5800 [9:23:43<4:10:45,  6.18s/it][2025-01-25 18:10:17,773] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:10:28,298] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:10:38,968] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:10:49,563] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5391],
-        [0.4473],
-        [0.4375],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4648, 0.4141, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:04,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.06 | optimizer_step: 4.36
-[2025-01-25 18:11:04,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.12 | bwd_microstep: 4578.74 | bwd_inner_microstep: 4573.44 | bwd_allreduce_microstep: 5.20 | step_microstep: 51.34
-[2025-01-25 18:11:04,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.05 | bwd: 4578.85 | bwd_inner: 4573.44 | bwd_allreduce: 5.26 | step: 51.34
- 58%|█████▊    | 3365/5800 [9:24:34<13:16:42, 19.63s/it]                                                        {'loss': 0.0161, 'grad_norm': 4.135110378265381, 'learning_rate': 1.5809727833170144e-05, 'epoch': 29.01}
- 58%|█████▊    | 3365/5800 [9:24:34<13:16:42, 19.63s/it]score1 tensor([[0.7031],
-        [0.5703],
-        [0.4316],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.6016, 0.4453, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:11,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:11:11,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.05 | bwd_microstep: 4576.56 | bwd_inner_microstep: 4571.79 | bwd_allreduce_microstep: 4.67 | step_microstep: 47.80
-[2025-01-25 18:11:11,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.02 | bwd: 4576.59 | bwd_inner: 4571.79 | bwd_allreduce: 4.72 | step: 47.81
- 58%|█████▊    | 3366/5800 [9:24:41<10:40:48, 15.80s/it]                                                        {'loss': 0.0142, 'grad_norm': 8.615926742553711, 'learning_rate': 1.5798808233565358e-05, 'epoch': 29.02}
- 58%|█████▊    | 3366/5800 [9:24:41<10:40:48, 15.80s/it]score1 tensor([[0.4336],
-        [0.5586],
-        [0.4316],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5508, 0.4414, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:17,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.64
-[2025-01-25 18:11:17,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.65 | bwd_microstep: 4588.21 | bwd_inner_microstep: 4583.02 | bwd_allreduce_microstep: 5.08 | step_microstep: 46.97
-[2025-01-25 18:11:17,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.62 | bwd: 4588.24 | bwd_inner: 4583.03 | bwd_allreduce: 5.14 | step: 46.98
- 58%|█████▊    | 3367/5800 [9:24:47<8:51:42, 13.11s/it]                                                        {'loss': 0.0117, 'grad_norm': 3.575972080230713, 'learning_rate': 1.578788994396507e-05, 'epoch': 29.03}
- 58%|█████▊    | 3367/5800 [9:24:47<8:51:42, 13.11s/it]score1 tensor([[0.5391],
-        [0.4277],
-        [0.6250],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4375, 0.6172, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:24,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 18:11:24,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.04 | bwd_microstep: 4604.99 | bwd_inner_microstep: 4599.33 | bwd_allreduce_microstep: 5.57 | step_microstep: 49.07
-[2025-01-25 18:11:24,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.99 | bwd: 4605.01 | bwd_inner: 4599.33 | bwd_allreduce: 5.61 | step: 49.08
- 58%|█████▊    | 3368/5800 [9:24:54<7:35:36, 11.24s/it]                                                       {'loss': 0.0103, 'grad_norm': 4.180863857269287, 'learning_rate': 1.577697296777377e-05, 'epoch': 29.03}
- 58%|█████▊    | 3368/5800 [9:24:54<7:35:36, 11.24s/it]score1 tensor([[0.6328],
-        [0.3594],
-        [0.5312],
-        [0.7070]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.3652, 0.5195, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:31,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:11:31,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.08 | bwd_microstep: 4589.45 | bwd_inner_microstep: 4583.90 | bwd_allreduce_microstep: 5.41 | step_microstep: 45.53
-[2025-01-25 18:11:31,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.03 | bwd: 4589.47 | bwd_inner: 4583.90 | bwd_allreduce: 5.49 | step: 45.54
- 58%|█████▊    | 3369/5800 [9:25:01<6:42:14,  9.93s/it]                                                       {'loss': 0.0103, 'grad_norm': 5.378532886505127, 'learning_rate': 1.5766057308395586e-05, 'epoch': 29.04}
- 58%|█████▊    | 3369/5800 [9:25:01<6:42:14,  9.93s/it]score1 tensor([[0.6484],
-        [0.4727],
-        [0.5508],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.4727, 0.5352, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:38,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:11:38,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.22 | bwd_microstep: 4542.08 | bwd_inner_microstep: 4536.87 | bwd_allreduce_microstep: 5.11 | step_microstep: 45.48
-[2025-01-25 18:11:38,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.19 | bwd: 4542.10 | bwd_inner: 4536.88 | bwd_allreduce: 5.16 | step: 45.48
- 58%|█████▊    | 3370/5800 [9:25:08<6:04:08,  8.99s/it]                                                       {'loss': 0.0078, 'grad_norm': 1.6172640323638916, 'learning_rate': 1.5755142969234186e-05, 'epoch': 29.05}
- 58%|█████▊    | 3370/5800 [9:25:08<6:04:08,  8.99s/it]score1 tensor([[0.5703],
-        [0.6445],
-        [0.4199],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6367, 0.4277, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:45,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.37
-[2025-01-25 18:11:45,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.56 | bwd_microstep: 4586.98 | bwd_inner_microstep: 4581.87 | bwd_allreduce_microstep: 4.98 | step_microstep: 42.11
-[2025-01-25 18:11:45,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.51 | bwd: 4587.00 | bwd_inner: 4581.87 | bwd_allreduce: 5.04 | step: 42.12
- 58%|█████▊    | 3371/5800 [9:25:15<5:38:08,  8.35s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.915392875671387, 'learning_rate': 1.5744229953692874e-05, 'epoch': 29.06}
- 58%|█████▊    | 3371/5800 [9:25:15<5:38:08,  8.35s/it]score1 tensor([[0.4668],
-        [0.4590],
-        [0.4219],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4531, 0.4180, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:52,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 18:11:52,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.39 | bwd_microstep: 4541.12 | bwd_inner_microstep: 4536.04 | bwd_allreduce_microstep: 5.00 | step_microstep: 41.78
-[2025-01-25 18:11:52,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.34 | bwd: 4541.15 | bwd_inner: 4536.04 | bwd_allreduce: 5.04 | step: 41.79
- 58%|█████▊    | 3372/5800 [9:25:22<5:19:16,  7.89s/it]                                                       {'loss': 0.0063, 'grad_norm': 6.093774795532227, 'learning_rate': 1.57333182651745e-05, 'epoch': 29.07}
- 58%|█████▊    | 3372/5800 [9:25:22<5:19:16,  7.89s/it]score1 tensor([[0.4668],
-        [0.4883],
-        [0.5430],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4785, 0.5352, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:11:58,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 18:11:58,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.49 | bwd_microstep: 4595.66 | bwd_inner_microstep: 4590.91 | bwd_allreduce_microstep: 4.67 | step_microstep: 43.25
-[2025-01-25 18:11:58,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.46 | bwd: 4595.68 | bwd_inner: 4590.91 | bwd_allreduce: 4.71 | step: 43.26
- 58%|█████▊    | 3373/5800 [9:25:28<5:06:37,  7.58s/it]                                                       {'loss': 0.0176, 'grad_norm': 0.3929377496242523, 'learning_rate': 1.572240790708153e-05, 'epoch': 29.08}
- 58%|█████▊    | 3373/5800 [9:25:28<5:06:37,  7.58s/it]score1 tensor([[0.4961],
-        [0.4727],
-        [0.5195],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4941, 0.5391, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:05,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 18:12:05,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.50 | bwd_microstep: 4605.53 | bwd_inner_microstep: 4600.06 | bwd_allreduce_microstep: 5.35 | step_microstep: 44.87
-[2025-01-25 18:12:05,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.47 | bwd: 4605.56 | bwd_inner: 4600.06 | bwd_allreduce: 5.40 | step: 44.87
- 58%|█████▊    | 3374/5800 [9:25:35<4:58:00,  7.37s/it]                                                       {'loss': 0.0225, 'grad_norm': 3.87310528755188, 'learning_rate': 1.5711498882815998e-05, 'epoch': 29.09}
- 58%|█████▊    | 3374/5800 [9:25:35<4:58:00,  7.37s/it]score1 tensor([[0.4023],
-        [0.4570],
-        [0.7305],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4727, 0.7031, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:12,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 18:12:12,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.34 | bwd_microstep: 4605.47 | bwd_inner_microstep: 4600.63 | bwd_allreduce_microstep: 4.76 | step_microstep: 46.71
-[2025-01-25 18:12:12,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.30 | bwd: 4605.49 | bwd_inner: 4600.63 | bwd_allreduce: 4.80 | step: 46.72
- 58%|█████▊    | 3375/5800 [9:25:42<4:51:57,  7.22s/it]                                                       {'loss': 0.019, 'grad_norm': 3.4267539978027344, 'learning_rate': 1.570059119577952e-05, 'epoch': 29.09}
- 58%|█████▊    | 3375/5800 [9:25:42<4:51:57,  7.22s/it]score1 tensor([[0.5859],
-        [0.3516],
-        [0.4082],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3418, 0.4043, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:19,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 18:12:19,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.82 | bwd_microstep: 4605.90 | bwd_inner_microstep: 4601.08 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.67
-[2025-01-25 18:12:19,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.78 | bwd: 4605.92 | bwd_inner: 4601.08 | bwd_allreduce: 4.78 | step: 44.68
- 58%|█████▊    | 3376/5800 [9:25:49<4:47:37,  7.12s/it]                                                       {'loss': 0.0146, 'grad_norm': 3.3315255641937256, 'learning_rate': 1.5689684849373316e-05, 'epoch': 29.1}
- 58%|█████▊    | 3376/5800 [9:25:49<4:47:37,  7.12s/it]score1 tensor([[0.6523],
-        [0.6094],
-        [0.4883],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.6094, 0.4844, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:26,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.28 | optimizer_step: 4.37
-[2025-01-25 18:12:26,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.24 | bwd_microstep: 4571.28 | bwd_inner_microstep: 4565.82 | bwd_allreduce_microstep: 5.36 | step_microstep: 45.73
-[2025-01-25 18:12:26,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.21 | bwd: 4571.32 | bwd_inner: 4565.82 | bwd_allreduce: 5.42 | step: 45.74
- 58%|█████▊    | 3377/5800 [9:25:56<4:44:11,  7.04s/it]                                                       {'loss': 0.0078, 'grad_norm': 2.4845950603485107, 'learning_rate': 1.5678779846998157e-05, 'epoch': 29.11}
- 58%|█████▊    | 3377/5800 [9:25:56<4:44:11,  7.04s/it]score1 tensor([[0.5117],
-        [0.3477],
-        [0.4375],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3457, 0.4434, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:33,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 18:12:33,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.75 | bwd_microstep: 4615.09 | bwd_inner_microstep: 4610.19 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.03
-[2025-01-25 18:12:33,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.71 | bwd: 4615.11 | bwd_inner: 4610.19 | bwd_allreduce: 4.86 | step: 42.03
- 58%|█████▊    | 3378/5800 [9:26:03<4:42:28,  7.00s/it]                                                       {'loss': 0.0073, 'grad_norm': 4.056037425994873, 'learning_rate': 1.5667876192054428e-05, 'epoch': 29.12}
- 58%|█████▊    | 3378/5800 [9:26:03<4:42:28,  7.00s/it]score1 tensor([[0.5000],
-        [0.5273],
-        [0.6172],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5508, 0.6484, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:40,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 18:12:40,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.81 | bwd_microstep: 4623.74 | bwd_inner_microstep: 4618.63 | bwd_allreduce_microstep: 5.02 | step_microstep: 45.78
-[2025-01-25 18:12:40,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.77 | bwd: 4623.77 | bwd_inner: 4618.63 | bwd_allreduce: 5.07 | step: 45.79
- 58%|█████▊    | 3379/5800 [9:26:10<4:41:15,  6.97s/it]                                                       {'loss': 0.02, 'grad_norm': 0.5813992619514465, 'learning_rate': 1.5656973887942073e-05, 'epoch': 29.13}
- 58%|█████▊    | 3379/5800 [9:26:10<4:41:15,  6.97s/it]score1 tensor([[0.5742],
-        [0.4629],
-        [0.5312],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4551, 0.5391, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:47,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 18:12:47,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.76 | bwd_microstep: 4632.07 | bwd_inner_microstep: 4626.14 | bwd_allreduce_microstep: 5.80 | step_microstep: 46.69
-[2025-01-25 18:12:47,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.71 | bwd: 4632.09 | bwd_inner: 4626.14 | bwd_allreduce: 5.87 | step: 46.70
- 58%|█████▊    | 3380/5800 [9:26:17<4:40:27,  6.95s/it]                                                       {'loss': 0.0068, 'grad_norm': 0.47227075695991516, 'learning_rate': 1.564607293806061e-05, 'epoch': 29.14}
- 58%|█████▊    | 3380/5800 [9:26:17<4:40:27,  6.95s/it]score1 tensor([[0.4453],
-        [0.5078],
-        [0.5898],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5156, 0.6094, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:12:54,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 18:12:54,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.77 | bwd_microstep: 4630.21 | bwd_inner_microstep: 4625.48 | bwd_allreduce_microstep: 4.64 | step_microstep: 43.84
-[2025-01-25 18:12:54,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.72 | bwd: 4630.23 | bwd_inner: 4625.48 | bwd_allreduce: 4.69 | step: 43.85
- 58%|█████▊    | 3381/5800 [9:26:24<4:39:51,  6.94s/it]                                                       {'loss': 0.0093, 'grad_norm': 4.237843036651611, 'learning_rate': 1.5635173345809154e-05, 'epoch': 29.15}
- 58%|█████▊    | 3381/5800 [9:26:24<4:39:51,  6.94s/it]score1 tensor([[0.5273],
-        [0.4551],
-        [0.4648],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4551, 0.4648, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:00,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:13:00,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4507.11 | bwd_inner_microstep: 4502.19 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.46
-[2025-01-25 18:13:00,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.69 | bwd: 4507.14 | bwd_inner: 4502.19 | bwd_allreduce: 4.87 | step: 42.45
- 58%|█████▊    | 3382/5800 [9:26:30<4:37:54,  6.90s/it]                                                       {'loss': 0.0039, 'grad_norm': 2.1109514236450195, 'learning_rate': 1.562427511458639e-05, 'epoch': 29.16}
- 58%|█████▊    | 3382/5800 [9:26:30<4:37:54,  6.90s/it]score1 tensor([[0.4844],
-        [0.5664],
-        [0.5703],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5586, 0.5703, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:07,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.12 | optimizer_step: 4.36
-[2025-01-25 18:13:07,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4583.02 | bwd_inner_microstep: 4577.83 | bwd_allreduce_microstep: 5.09 | step_microstep: 47.74
-[2025-01-25 18:13:07,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4583.05 | bwd_inner: 4577.83 | bwd_allreduce: 5.13 | step: 47.76
- 58%|█████▊    | 3383/5800 [9:26:37<4:37:33,  6.89s/it]                                                       {'loss': 0.0088, 'grad_norm': 1.8248428106307983, 'learning_rate': 1.5613378247790566e-05, 'epoch': 29.16}
- 58%|█████▊    | 3383/5800 [9:26:37<4:37:33,  6.89s/it]score1 tensor([[0.4941],
-        [0.6094],
-        [0.5000],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.6133, 0.4844, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:14,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.06 | optimizer_step: 4.36
-[2025-01-25 18:13:14,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.74 | bwd_microstep: 4633.57 | bwd_inner_microstep: 4628.59 | bwd_allreduce_microstep: 4.89 | step_microstep: 48.03
-[2025-01-25 18:13:14,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.71 | bwd: 4633.60 | bwd_inner: 4628.59 | bwd_allreduce: 4.93 | step: 48.04
- 58%|█████▊    | 3384/5800 [9:26:44<4:37:59,  6.90s/it]                                                       {'loss': 0.0117, 'grad_norm': 3.5300230979919434, 'learning_rate': 1.560248274881952e-05, 'epoch': 29.17}
- 58%|█████▊    | 3384/5800 [9:26:44<4:37:59,  6.90s/it]score1 tensor([[0.6250],
-        [0.5664],
-        [0.6523],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.6055, 0.6562, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:21,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 18:13:21,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4584.07 | bwd_inner_microstep: 4578.34 | bwd_allreduce_microstep: 5.59 | step_microstep: 43.55
-[2025-01-25 18:13:21,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.41 | bwd: 4584.10 | bwd_inner: 4578.34 | bwd_allreduce: 5.65 | step: 43.56
- 58%|█████▊    | 3385/5800 [9:26:51<4:37:25,  6.89s/it]                                                       {'loss': 0.0146, 'grad_norm': 6.967899799346924, 'learning_rate': 1.5591588621070658e-05, 'epoch': 29.18}
- 58%|█████▊    | 3385/5800 [9:26:51<4:37:25,  6.89s/it]score1 tensor([[0.4688],
-        [0.3691],
-        [0.3105],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.3789, 0.3086, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:28,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 18:13:28,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.03 | bwd_microstep: 4626.39 | bwd_inner_microstep: 4621.19 | bwd_allreduce_microstep: 5.08 | step_microstep: 47.03
-[2025-01-25 18:13:28,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.98 | bwd: 4626.41 | bwd_inner: 4621.20 | bwd_allreduce: 5.14 | step: 47.03
- 58%|█████▊    | 3386/5800 [9:26:58<4:37:30,  6.90s/it]                                                       {'loss': 0.0122, 'grad_norm': 4.085354804992676, 'learning_rate': 1.5580695867940957e-05, 'epoch': 29.19}
- 58%|█████▊    | 3386/5800 [9:26:58<4:37:30,  6.90s/it]score1 tensor([[0.4355],
-        [0.6016],
-        [0.5703],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5820, 0.5430, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:35,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 18:13:35,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.62 | bwd_microstep: 4626.32 | bwd_inner_microstep: 4621.55 | bwd_allreduce_microstep: 4.70 | step_microstep: 55.21
-[2025-01-25 18:13:35,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4626.35 | bwd_inner: 4621.55 | bwd_allreduce: 4.73 | step: 55.22
- 58%|█████▊    | 3387/5800 [9:27:05<4:37:38,  6.90s/it]                                                       {'loss': 0.0259, 'grad_norm': 8.590620040893555, 'learning_rate': 1.5569804492826963e-05, 'epoch': 29.2}
- 58%|█████▊    | 3387/5800 [9:27:05<4:37:38,  6.90s/it]score1 tensor([[0.4609],
-        [0.4941],
-        [0.5859],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4883, 0.5664, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:42,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 18:13:42,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.75 | bwd_microstep: 4636.10 | bwd_inner_microstep: 4630.64 | bwd_allreduce_microstep: 5.37 | step_microstep: 46.16
-[2025-01-25 18:13:42,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.71 | bwd: 4636.14 | bwd_inner: 4630.63 | bwd_allreduce: 5.42 | step: 46.17
- 58%|█████▊    | 3388/5800 [9:27:12<4:37:49,  6.91s/it]                                                       {'loss': 0.0161, 'grad_norm': 8.321528434753418, 'learning_rate': 1.5558914499124802e-05, 'epoch': 29.21}
- 58%|█████▊    | 3388/5800 [9:27:12<4:37:49,  6.91s/it]score1 tensor([[0.6289],
-        [0.4277],
-        [0.5312],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4199, 0.5117, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:49,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 18:13:49,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.85 | bwd_microstep: 4626.85 | bwd_inner_microstep: 4621.72 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.27
-[2025-01-25 18:13:49,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.81 | bwd: 4626.88 | bwd_inner: 4621.72 | bwd_allreduce: 5.07 | step: 42.27
- 58%|█████▊    | 3389/5800 [9:27:19<4:37:42,  6.91s/it]                                                       {'loss': 0.0181, 'grad_norm': 8.27588176727295, 'learning_rate': 1.5548025890230153e-05, 'epoch': 29.22}
- 58%|█████▊    | 3389/5800 [9:27:19<4:37:42,  6.91s/it]score1 tensor([[0.4238],
-        [0.3809],
-        [0.4023],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.3672, 0.3262, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:13:56,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 18:13:56,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.16 | bwd_microstep: 4591.97 | bwd_inner_microstep: 4587.14 | bwd_allreduce_microstep: 4.73 | step_microstep: 43.58
-[2025-01-25 18:13:56,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.13 | bwd: 4591.99 | bwd_inner: 4587.14 | bwd_allreduce: 4.78 | step: 43.59
- 58%|█████▊    | 3390/5800 [9:27:26<4:37:14,  6.90s/it]                                                       {'loss': 0.0244, 'grad_norm': 5.398965835571289, 'learning_rate': 1.553713866953829e-05, 'epoch': 29.22}
- 58%|█████▊    | 3390/5800 [9:27:26<4:37:14,  6.90s/it]score1 tensor([[0.4414],
-        [0.4629],
-        [0.5938],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4531, 0.6211, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:03,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:14:03,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.63 | bwd_microstep: 4629.62 | bwd_inner_microstep: 4624.41 | bwd_allreduce_microstep: 5.08 | step_microstep: 43.11
-[2025-01-25 18:14:03,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.59 | bwd: 4629.65 | bwd_inner: 4624.41 | bwd_allreduce: 5.15 | step: 43.12
- 58%|█████▊    | 3391/5800 [9:27:32<4:37:09,  6.90s/it]                                                       {'loss': 0.0142, 'grad_norm': 3.833591938018799, 'learning_rate': 1.5526252840444026e-05, 'epoch': 29.23}
- 58%|█████▊    | 3391/5800 [9:27:32<4:37:09,  6.90s/it]score1 tensor([[0.5742],
-        [0.5508],
-        [0.4707],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5469, 0.4980, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:09,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:14:09,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.84 | bwd_microstep: 4632.69 | bwd_inner_microstep: 4627.37 | bwd_allreduce_microstep: 5.23 | step_microstep: 46.04
-[2025-01-25 18:14:09,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.79 | bwd: 4632.71 | bwd_inner: 4627.37 | bwd_allreduce: 5.27 | step: 46.06
- 58%|█████▊    | 3392/5800 [9:27:39<4:37:10,  6.91s/it]                                                       {'loss': 0.0098, 'grad_norm': 4.294493675231934, 'learning_rate': 1.5515368406341745e-05, 'epoch': 29.24}
- 58%|█████▊    | 3392/5800 [9:27:39<4:37:10,  6.91s/it]score1 tensor([[0.4824],
-        [0.3730],
-        [0.3711],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4180, 0.3711, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:16,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.36
-[2025-01-25 18:14:16,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.41 | bwd_microstep: 4542.99 | bwd_inner_microstep: 4537.83 | bwd_allreduce_microstep: 5.03 | step_microstep: 44.57
-[2025-01-25 18:14:16,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.37 | bwd: 4543.02 | bwd_inner: 4537.83 | bwd_allreduce: 5.10 | step: 44.57
- 58%|█████▊    | 3393/5800 [9:27:46<4:36:08,  6.88s/it]                                                       {'loss': 0.0137, 'grad_norm': 3.757823944091797, 'learning_rate': 1.5504485370625418e-05, 'epoch': 29.25}
- 58%|█████▊    | 3393/5800 [9:27:46<4:36:08,  6.88s/it]score1 tensor([[0.4434],
-        [0.3730],
-        [0.5977],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.3887, 0.6250, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:23,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 18:14:23,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.02 | bwd_microstep: 4635.19 | bwd_inner_microstep: 4630.14 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.89
-[2025-01-25 18:14:23,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.98 | bwd: 4635.21 | bwd_inner: 4630.14 | bwd_allreduce: 5.00 | step: 44.89
- 59%|█████▊    | 3394/5800 [9:27:53<4:36:33,  6.90s/it]                                                       {'loss': 0.0156, 'grad_norm': 7.911795139312744, 'learning_rate': 1.5493603736688547e-05, 'epoch': 29.26}
- 59%|█████▊    | 3394/5800 [9:27:53<4:36:33,  6.90s/it]score1 tensor([[0.4453],
-        [0.6133],
-        [0.4473],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6328, 0.4707, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:30,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:14:30,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.21 | bwd_microstep: 4628.35 | bwd_inner_microstep: 4623.01 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.74
-[2025-01-25 18:14:30,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.17 | bwd: 4628.37 | bwd_inner: 4623.01 | bwd_allreduce: 5.28 | step: 43.75
- 59%|█████▊    | 3395/5800 [9:28:00<4:36:37,  6.90s/it]                                                       {'loss': 0.0186, 'grad_norm': 7.9087605476379395, 'learning_rate': 1.5482723507924235e-05, 'epoch': 29.27}
- 59%|█████▊    | 3395/5800 [9:28:00<4:36:37,  6.90s/it]score1 tensor([[0.5195],
-        [0.3652],
-        [0.5078],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3926, 0.5391, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:37,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 18:14:37,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.74 | bwd_microstep: 4573.23 | bwd_inner_microstep: 4568.21 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.50
-[2025-01-25 18:14:37,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.72 | bwd: 4573.25 | bwd_inner: 4568.21 | bwd_allreduce: 4.97 | step: 42.52
- 59%|█████▊    | 3396/5800 [9:28:07<4:36:01,  6.89s/it]                                                       {'loss': 0.019, 'grad_norm': 5.6637396812438965, 'learning_rate': 1.5471844687725105e-05, 'epoch': 29.28}
- 59%|█████▊    | 3396/5800 [9:28:07<4:36:01,  6.89s/it]score1 tensor([[0.5977],
-        [0.5859],
-        [0.4629],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6172, 0.4785, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:44,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 18:14:44,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.41 | bwd_microstep: 4628.87 | bwd_inner_microstep: 4623.89 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.89
-[2025-01-25 18:14:44,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.37 | bwd: 4628.90 | bwd_inner: 4623.89 | bwd_allreduce: 4.94 | step: 43.90
- 59%|█████▊    | 3397/5800 [9:28:14<4:36:12,  6.90s/it]                                                       {'loss': 0.02, 'grad_norm': 8.451449394226074, 'learning_rate': 1.5460967279483376e-05, 'epoch': 29.28}
- 59%|█████▊    | 3397/5800 [9:28:14<4:36:12,  6.90s/it]score1 tensor([[0.4414],
-        [0.6055],
-        [0.4180],
-        [0.3164]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5625, 0.4219, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:51,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 18:14:51,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.89 | bwd_microstep: 4637.86 | bwd_inner_microstep: 4632.73 | bwd_allreduce_microstep: 5.02 | step_microstep: 43.69
-[2025-01-25 18:14:51,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.84 | bwd: 4637.89 | bwd_inner: 4632.73 | bwd_allreduce: 5.08 | step: 43.70
- 59%|█████▊    | 3398/5800 [9:28:21<4:36:21,  6.90s/it]                                                       {'loss': 0.0225, 'grad_norm': 3.138363838195801, 'learning_rate': 1.5450091286590808e-05, 'epoch': 29.29}
- 59%|█████▊    | 3398/5800 [9:28:21<4:36:21,  6.90s/it]score1 tensor([[0.4492],
-        [0.6406],
-        [0.4883],
-        [0.3105]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6367, 0.4863, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:14:58,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 18:14:58,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.86 | bwd_microstep: 4642.69 | bwd_inner_microstep: 4637.29 | bwd_allreduce_microstep: 5.30 | step_microstep: 42.77
-[2025-01-25 18:14:58,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.77 | bwd: 4642.71 | bwd_inner: 4637.29 | bwd_allreduce: 5.35 | step: 42.77
- 59%|█████▊    | 3399/5800 [9:28:28<4:36:29,  6.91s/it]                                                       {'loss': 0.0166, 'grad_norm': 1.2232387065887451, 'learning_rate': 1.5439216712438724e-05, 'epoch': 29.3}
- 59%|█████▊    | 3399/5800 [9:28:28<4:36:29,  6.91s/it]score1 tensor([[0.3809],
-        [0.6406],
-        [0.3535],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.6484, 0.3672, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:05,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:15:05,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.61 | bwd_microstep: 4630.96 | bwd_inner_microstep: 4626.26 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.56
-[2025-01-25 18:15:05,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.58 | bwd: 4630.98 | bwd_inner: 4626.26 | bwd_allreduce: 4.65 | step: 43.58
- 59%|█████▊    | 3400/5800 [9:28:35<4:36:24,  6.91s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.733665943145752, 'learning_rate': 1.5428343560418008e-05, 'epoch': 29.31}
- 59%|█████▊    | 3400/5800 [9:28:35<4:36:24,  6.91s/it]score1 tensor([[0.5391],
-        [0.5977],
-        [0.3867],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6211, 0.4238, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:12,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 18:15:12,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.17 | bwd_microstep: 4628.36 | bwd_inner_microstep: 4623.44 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.56
-[2025-01-25 18:15:12,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.12 | bwd: 4628.38 | bwd_inner: 4623.44 | bwd_allreduce: 4.86 | step: 42.56
- 59%|█████▊    | 3401/5800 [9:28:42<4:36:26,  6.91s/it]                                                       {'loss': 0.0229, 'grad_norm': 4.032650470733643, 'learning_rate': 1.5417471833919092e-05, 'epoch': 29.32}
- 59%|█████▊    | 3401/5800 [9:28:42<4:36:26,  6.91s/it]score1 tensor([[0.4609],
-        [0.4434],
-        [0.5078],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4199, 0.4922, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:18,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 18:15:18,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4628.73 | bwd_inner_microstep: 4623.37 | bwd_allreduce_microstep: 5.28 | step_microstep: 46.98
-[2025-01-25 18:15:18,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.02 | bwd: 4628.76 | bwd_inner: 4623.37 | bwd_allreduce: 5.32 | step: 46.99
- 59%|█████▊    | 3402/5800 [9:28:48<4:36:25,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 7.880190372467041, 'learning_rate': 1.5406601536331974e-05, 'epoch': 29.33}
- 59%|█████▊    | 3402/5800 [9:28:48<4:36:25,  6.92s/it]score1 tensor([[0.4648],
-        [0.3984],
-        [0.5703],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.3867, 0.5586, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:25,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 18:15:25,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.93 | bwd_microstep: 4637.15 | bwd_inner_microstep: 4631.59 | bwd_allreduce_microstep: 5.41 | step_microstep: 47.34
-[2025-01-25 18:15:25,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4637.18 | bwd_inner: 4631.59 | bwd_allreduce: 5.47 | step: 47.35
- 59%|█████▊    | 3403/5800 [9:28:55<4:36:29,  6.92s/it]                                                       {'loss': 0.0186, 'grad_norm': 3.8917555809020996, 'learning_rate': 1.5395732671046196e-05, 'epoch': 29.34}
- 59%|█████▊    | 3403/5800 [9:28:55<4:36:29,  6.92s/it]score1 tensor([[0.4727],
-        [0.3984],
-        [0.6484],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.3750, 0.6016, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0381, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:32,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.37
-[2025-01-25 18:15:32,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.36 | bwd_microstep: 4629.65 | bwd_inner_microstep: 4624.63 | bwd_allreduce_microstep: 4.94 | step_microstep: 41.88
-[2025-01-25 18:15:32,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.31 | bwd: 4629.67 | bwd_inner: 4624.63 | bwd_allreduce: 4.98 | step: 41.89
- 59%|█████▊    | 3404/5800 [9:29:02<4:36:17,  6.92s/it]                                                       {'loss': 0.0381, 'grad_norm': 8.121129035949707, 'learning_rate': 1.5384865241450865e-05, 'epoch': 29.34}
- 59%|█████▊    | 3404/5800 [9:29:02<4:36:17,  6.92s/it]score1 tensor([[0.3828],
-        [0.3828],
-        [0.5586],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.3516, 0.5039, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:39,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 18:15:39,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4631.22 | bwd_inner_microstep: 4626.11 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.10
-[2025-01-25 18:15:39,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4631.24 | bwd_inner: 4626.11 | bwd_allreduce: 5.06 | step: 44.10
- 59%|█████▊    | 3405/5800 [9:29:09<4:36:08,  6.92s/it]                                                       {'loss': 0.0322, 'grad_norm': 7.686522960662842, 'learning_rate': 1.5373999250934635e-05, 'epoch': 29.35}
- 59%|█████▊    | 3405/5800 [9:29:09<4:36:08,  6.92s/it]score1 tensor([[0.5391],
-        [0.5938],
-        [0.4707],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5664, 0.4512, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0322, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:46,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 18:15:46,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.21 | bwd_microstep: 4634.13 | bwd_inner_microstep: 4629.19 | bwd_allreduce_microstep: 4.86 | step_microstep: 41.40
-[2025-01-25 18:15:46,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4634.15 | bwd_inner: 4629.19 | bwd_allreduce: 4.89 | step: 41.42
- 59%|█████▊    | 3406/5800 [9:29:16<4:35:58,  6.92s/it]                                                       {'loss': 0.0322, 'grad_norm': 8.415850639343262, 'learning_rate': 1.5363134702885695e-05, 'epoch': 29.36}
- 59%|█████▊    | 3406/5800 [9:29:16<4:35:58,  6.92s/it]score1 tensor([[0.4258],
-        [0.6133],
-        [0.4453],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5898, 0.4258, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:15:53,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 18:15:53,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.08 | bwd_microstep: 4629.83 | bwd_inner_microstep: 4624.82 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.49
-[2025-01-25 18:15:53,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.05 | bwd: 4629.86 | bwd_inner: 4624.82 | bwd_allreduce: 4.96 | step: 43.50
- 59%|█████▊    | 3407/5800 [9:29:23<4:35:55,  6.92s/it]                                                       {'loss': 0.02, 'grad_norm': 8.105783462524414, 'learning_rate': 1.5352271600691825e-05, 'epoch': 29.37}
- 59%|█████▊    | 3407/5800 [9:29:23<4:35:55,  6.92s/it]score1 tensor([[0.5195],
-        [0.5273],
-        [0.7148],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5000, 0.6875, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:00,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 18:16:00,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.74 | bwd_microstep: 4639.22 | bwd_inner_microstep: 4634.20 | bwd_allreduce_microstep: 4.93 | step_microstep: 45.11
-[2025-01-25 18:16:00,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.71 | bwd: 4639.25 | bwd_inner: 4634.20 | bwd_allreduce: 4.98 | step: 45.11
- 59%|█████▉    | 3408/5800 [9:29:30<4:36:01,  6.92s/it]                                                       {'loss': 0.0254, 'grad_norm': 8.59071159362793, 'learning_rate': 1.5341409947740303e-05, 'epoch': 29.38}
- 59%|█████▉    | 3408/5800 [9:29:30<4:36:01,  6.92s/it]score1 tensor([[0.5859],
-        [0.4551],
-        [0.4629],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4473, 0.4336, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:07,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.36
-[2025-01-25 18:16:07,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.86 | bwd_microstep: 4632.71 | bwd_inner_microstep: 4627.77 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.65
-[2025-01-25 18:16:07,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.82 | bwd: 4632.74 | bwd_inner: 4627.77 | bwd_allreduce: 4.89 | step: 44.66
- 59%|█████▉    | 3409/5800 [9:29:37<4:35:57,  6.92s/it]                                                       {'loss': 0.0137, 'grad_norm': 8.041947364807129, 'learning_rate': 1.5330549747418002e-05, 'epoch': 29.39}
- 59%|█████▉    | 3409/5800 [9:29:37<4:35:57,  6.92s/it]score1 tensor([[0.6836],
-        [0.5664],
-        [0.5430],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.5625, 0.5469, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:14,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.94 | optimizer_step: 4.36
-[2025-01-25 18:16:14,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.43 | bwd_microstep: 4638.88 | bwd_inner_microstep: 4633.97 | bwd_allreduce_microstep: 4.80 | step_microstep: 45.83
-[2025-01-25 18:16:14,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.39 | bwd: 4638.91 | bwd_inner: 4633.97 | bwd_allreduce: 4.86 | step: 45.85
- 59%|█████▉    | 3410/5800 [9:29:44<4:35:55,  6.93s/it]                                                       {'loss': 0.0059, 'grad_norm': 1.002736210823059, 'learning_rate': 1.531969100311131e-05, 'epoch': 29.4}
- 59%|█████▉    | 3410/5800 [9:29:44<4:35:55,  6.93s/it]score1 tensor([[0.3613],
-        [0.5195],
-        [0.4199],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5234, 0.4277, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:21,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 18:16:21,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.53 | bwd_microstep: 4630.15 | bwd_inner_microstep: 4625.12 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.59
-[2025-01-25 18:16:21,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.49 | bwd: 4630.18 | bwd_inner: 4625.12 | bwd_allreduce: 4.96 | step: 43.61
- 59%|█████▉    | 3411/5800 [9:29:51<4:35:41,  6.92s/it]                                                       {'loss': 0.0093, 'grad_norm': 3.7148935794830322, 'learning_rate': 1.530883371820617e-05, 'epoch': 29.41}
- 59%|█████▉    | 3411/5800 [9:29:51<4:35:41,  6.92s/it]score1 tensor([[0.5078],
-        [0.4160],
-        [0.3711],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4121, 0.3730, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:28,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 18:16:28,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.81 | bwd_microstep: 4637.36 | bwd_inner_microstep: 4632.03 | bwd_allreduce_microstep: 5.24 | step_microstep: 43.75
-[2025-01-25 18:16:28,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.76 | bwd: 4637.38 | bwd_inner: 4632.03 | bwd_allreduce: 5.28 | step: 43.76
- 59%|█████▉    | 3412/5800 [9:29:58<4:35:44,  6.93s/it]                                                       {'loss': 0.0117, 'grad_norm': 3.970616579055786, 'learning_rate': 1.529797789608808e-05, 'epoch': 29.41}
- 59%|█████▉    | 3412/5800 [9:29:58<4:35:44,  6.93s/it]score1 tensor([[0.5000],
-        [0.4941],
-        [0.4961],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4844, 0.4863, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:35,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 18:16:35,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.70 | bwd_microstep: 4633.98 | bwd_inner_microstep: 4628.69 | bwd_allreduce_microstep: 5.20 | step_microstep: 46.45
-[2025-01-25 18:16:35,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.65 | bwd: 4634.01 | bwd_inner: 4628.69 | bwd_allreduce: 5.24 | step: 46.46
- 59%|█████▉    | 3413/5800 [9:30:05<4:35:42,  6.93s/it]                                                       {'loss': 0.0122, 'grad_norm': 0.3601783812046051, 'learning_rate': 1.5287123540142063e-05, 'epoch': 29.42}
- 59%|█████▉    | 3413/5800 [9:30:05<4:35:42,  6.93s/it]score1 tensor([[0.5039],
-        [0.3770],
-        [0.5469],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.3945, 0.5742, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:42,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 18:16:42,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4635.16 | bwd_inner_microstep: 4629.71 | bwd_allreduce_microstep: 5.36 | step_microstep: 47.18
-[2025-01-25 18:16:42,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.86 | bwd: 4635.19 | bwd_inner: 4629.71 | bwd_allreduce: 5.40 | step: 47.24
- 59%|█████▉    | 3414/5800 [9:30:12<4:35:36,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 4.124436855316162, 'learning_rate': 1.527627065375271e-05, 'epoch': 29.43}
- 59%|█████▉    | 3414/5800 [9:30:12<4:35:36,  6.93s/it]score1 tensor([[0.4863],
-        [0.4082],
-        [0.6328],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4277, 0.6562, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:49,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 18:16:49,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.36 | bwd_microstep: 4630.94 | bwd_inner_microstep: 4625.67 | bwd_allreduce_microstep: 5.17 | step_microstep: 44.32
-[2025-01-25 18:16:49,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4630.96 | bwd_inner: 4625.67 | bwd_allreduce: 5.22 | step: 44.33
- 59%|█████▉    | 3415/5800 [9:30:18<4:35:28,  6.93s/it]                                                       {'loss': 0.019, 'grad_norm': 4.348252296447754, 'learning_rate': 1.526541924030412e-05, 'epoch': 29.44}
- 59%|█████▉    | 3415/5800 [9:30:18<4:35:28,  6.93s/it]score1 tensor([[0.4863],
-        [0.5508],
-        [0.4980],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5664, 0.4941, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:16:55,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 18:16:55,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.98 | bwd_microstep: 4641.67 | bwd_inner_microstep: 4636.34 | bwd_allreduce_microstep: 5.23 | step_microstep: 43.27
-[2025-01-25 18:16:55,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.94 | bwd: 4641.70 | bwd_inner: 4636.34 | bwd_allreduce: 5.28 | step: 43.28
- 59%|█████▉    | 3416/5800 [9:30:25<4:35:23,  6.93s/it]                                                       {'loss': 0.0142, 'grad_norm': 0.6636162996292114, 'learning_rate': 1.5254569303179976e-05, 'epoch': 29.45}
- 59%|█████▉    | 3416/5800 [9:30:25<4:35:23,  6.93s/it]score1 tensor([[0.5977],
-        [0.5898],
-        [0.4668],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.5898, 0.4805, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:02,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 18:17:02,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.28 | bwd_microstep: 4581.90 | bwd_inner_microstep: 4576.67 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.68
-[2025-01-25 18:17:02,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.24 | bwd: 4581.92 | bwd_inner: 4576.67 | bwd_allreduce: 5.17 | step: 43.69
- 59%|█████▉    | 3417/5800 [9:30:32<4:34:34,  6.91s/it]                                                       {'loss': 0.0063, 'grad_norm': 1.9569379091262817, 'learning_rate': 1.5243720845763454e-05, 'epoch': 29.46}
- 59%|█████▉    | 3417/5800 [9:30:32<4:34:34,  6.91s/it]score1 tensor([[0.4824],
-        [0.3789],
-        [0.3574],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.3750, 0.3457, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:09,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 18:17:09,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.36 | bwd_microstep: 4639.75 | bwd_inner_microstep: 4633.90 | bwd_allreduce_microstep: 5.77 | step_microstep: 45.37
-[2025-01-25 18:17:09,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4639.78 | bwd_inner: 4633.90 | bwd_allreduce: 5.81 | step: 45.39
- 59%|█████▉    | 3418/5800 [9:30:39<4:34:39,  6.92s/it]                                                       {'loss': 0.0098, 'grad_norm': 7.486464023590088, 'learning_rate': 1.5232873871437304e-05, 'epoch': 29.47}
- 59%|█████▉    | 3418/5800 [9:30:39<4:34:39,  6.92s/it]score1 tensor([[0.5234],
-        [0.4922],
-        [0.5273],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4844, 0.5156, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:16,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 18:17:16,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.40 | bwd_microstep: 4634.02 | bwd_inner_microstep: 4628.98 | bwd_allreduce_microstep: 4.95 | step_microstep: 42.53
-[2025-01-25 18:17:16,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.34 | bwd: 4634.04 | bwd_inner: 4628.98 | bwd_allreduce: 4.99 | step: 42.54
- 59%|█████▉    | 3419/5800 [9:30:46<4:34:32,  6.92s/it]                                                       {'loss': 0.0083, 'grad_norm': 0.4549361765384674, 'learning_rate': 1.5222028383583797e-05, 'epoch': 29.47}
- 59%|█████▉    | 3419/5800 [9:30:46<4:34:32,  6.92s/it]score1 tensor([[0.4473],
-        [0.4629],
-        [0.6602],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4473, 0.6641, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:23,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 18:17:23,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4632.30 | bwd_inner_microstep: 4627.00 | bwd_allreduce_microstep: 5.19 | step_microstep: 47.01
-[2025-01-25 18:17:23,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.10 | bwd: 4632.34 | bwd_inner: 4626.99 | bwd_allreduce: 5.24 | step: 47.01
- 59%|█████▉    | 3420/5800 [9:30:53<4:34:25,  6.92s/it]                                                       {'loss': 0.0063, 'grad_norm': 4.1225810050964355, 'learning_rate': 1.5211184385584745e-05, 'epoch': 29.48}
- 59%|█████▉    | 3420/5800 [9:30:53<4:34:25,  6.92s/it]score1 tensor([[0.6016],
-        [0.4863],
-        [0.4238],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4922, 0.4141, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:30,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 18:17:30,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.08 | bwd_microstep: 4631.08 | bwd_inner_microstep: 4625.30 | bwd_allreduce_microstep: 5.62 | step_microstep: 43.61
-[2025-01-25 18:17:30,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.03 | bwd: 4631.11 | bwd_inner: 4625.30 | bwd_allreduce: 5.69 | step: 43.62
- 59%|█████▉    | 3421/5800 [9:31:00<4:34:19,  6.92s/it]                                                       {'loss': 0.0068, 'grad_norm': 0.8501352667808533, 'learning_rate': 1.5200341880821496e-05, 'epoch': 29.49}
- 59%|█████▉    | 3421/5800 [9:31:00<4:34:19,  6.92s/it]score1 tensor([[0.5820],
-        [0.4707],
-        [0.4590],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4512, 0.4375, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:37,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 18:17:37,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.39 | bwd_microstep: 4638.99 | bwd_inner_microstep: 4633.99 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.26
-[2025-01-25 18:17:37,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.35 | bwd: 4639.01 | bwd_inner: 4633.99 | bwd_allreduce: 4.96 | step: 43.27
- 59%|█████▉    | 3422/5800 [9:31:07<4:34:14,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 3.951964855194092, 'learning_rate': 1.5189500872674934e-05, 'epoch': 29.5}
- 59%|█████▉    | 3422/5800 [9:31:07<4:34:14,  6.92s/it]score1 tensor([[0.4648],
-        [0.6055],
-        [0.5586],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6133, 0.5586, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:44,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:17:44,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.96 | bwd_microstep: 4581.74 | bwd_inner_microstep: 4576.79 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.66
-[2025-01-25 18:17:44,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4581.77 | bwd_inner: 4576.79 | bwd_allreduce: 4.89 | step: 42.68
- 59%|█████▉    | 3423/5800 [9:31:14<4:33:28,  6.90s/it]                                                       {'loss': 0.0161, 'grad_norm': 1.979972004890442, 'learning_rate': 1.517866136452546e-05, 'epoch': 29.51}
- 59%|█████▉    | 3423/5800 [9:31:14<4:33:28,  6.90s/it]score1 tensor([[0.5625],
-        [0.6094],
-        [0.3945],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6289, 0.2812, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0459, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:51,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 18:17:51,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.58 | bwd_microstep: 4577.60 | bwd_inner_microstep: 4571.92 | bwd_allreduce_microstep: 5.58 | step_microstep: 43.93
-[2025-01-25 18:17:51,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.53 | bwd: 4577.62 | bwd_inner: 4571.92 | bwd_allreduce: 5.64 | step: 43.93
- 59%|█████▉    | 3424/5800 [9:31:21<4:32:54,  6.89s/it]                                                       {'loss': 0.0459, 'grad_norm': 1.793434739112854, 'learning_rate': 1.5167823359753038e-05, 'epoch': 29.52}
- 59%|█████▉    | 3424/5800 [9:31:21<4:32:54,  6.89s/it]score1 tensor([[0.6094],
-        [0.5156],
-        [0.5039],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5195, 0.4941, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:17:58,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 18:17:58,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4632.84 | bwd_inner_microstep: 4627.62 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.97
-[2025-01-25 18:17:58,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.68 | bwd: 4632.87 | bwd_inner: 4627.62 | bwd_allreduce: 5.18 | step: 43.97
- 59%|█████▉    | 3425/5800 [9:31:28<4:33:06,  6.90s/it]                                                       {'loss': 0.0068, 'grad_norm': 0.6383680105209351, 'learning_rate': 1.515698686173713e-05, 'epoch': 29.53}
- 59%|█████▉    | 3425/5800 [9:31:28<4:33:06,  6.90s/it]score1 tensor([[0.4629],
-        [0.6172],
-        [0.6797],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6172, 0.6953, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:04,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 18:18:04,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.17 | bwd_microstep: 4580.38 | bwd_inner_microstep: 4575.15 | bwd_allreduce_microstep: 5.14 | step_microstep: 42.88
-[2025-01-25 18:18:04,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.14 | bwd: 4580.40 | bwd_inner: 4575.14 | bwd_allreduce: 5.19 | step: 42.89
- 59%|█████▉    | 3426/5800 [9:31:34<4:32:36,  6.89s/it]                                                       {'loss': 0.0107, 'grad_norm': 2.908473491668701, 'learning_rate': 1.514615187385676e-05, 'epoch': 29.53}
- 59%|█████▉    | 3426/5800 [9:31:34<4:32:36,  6.89s/it]score1 tensor([[0.4629],
-        [0.6055],
-        [0.4941],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.6211, 0.4941, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:11,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:18:11,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.02 | bwd_microstep: 4587.12 | bwd_inner_microstep: 4581.56 | bwd_allreduce_microstep: 5.43 | step_microstep: 43.67
-[2025-01-25 18:18:11,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.99 | bwd: 4587.14 | bwd_inner: 4581.56 | bwd_allreduce: 5.50 | step: 43.68
- 59%|█████▉    | 3427/5800 [9:31:41<4:32:15,  6.88s/it]                                                       {'loss': 0.0063, 'grad_norm': 2.491633892059326, 'learning_rate': 1.513531839949045e-05, 'epoch': 29.54}
- 59%|█████▉    | 3427/5800 [9:31:41<4:32:15,  6.88s/it]score1 tensor([[0.4102],
-        [0.4551],
-        [0.5547],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4492, 0.5664, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:18,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 18:18:18,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.03 | bwd_microstep: 4638.78 | bwd_inner_microstep: 4633.74 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.52
-[2025-01-25 18:18:18,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.99 | bwd: 4638.80 | bwd_inner: 4633.74 | bwd_allreduce: 4.98 | step: 43.52
- 59%|█████▉    | 3428/5800 [9:31:48<4:32:34,  6.89s/it]                                                       {'loss': 0.0132, 'grad_norm': 3.8214426040649414, 'learning_rate': 1.5124486442016282e-05, 'epoch': 29.55}
- 59%|█████▉    | 3428/5800 [9:31:48<4:32:34,  6.89s/it]score1 tensor([[0.5742],
-        [0.3594],
-        [0.5078],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.3340, 0.5273, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:25,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 18:18:25,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.64 | bwd_microstep: 4631.37 | bwd_inner_microstep: 4626.06 | bwd_allreduce_microstep: 5.20 | step_microstep: 44.46
-[2025-01-25 18:18:25,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.60 | bwd: 4631.40 | bwd_inner: 4626.06 | bwd_allreduce: 5.26 | step: 44.47
- 59%|█████▉    | 3429/5800 [9:31:55<4:32:38,  6.90s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.3954784572124481, 'learning_rate': 1.5113656004811837e-05, 'epoch': 29.56}
- 59%|█████▉    | 3429/5800 [9:31:55<4:32:38,  6.90s/it]score1 tensor([[0.4980],
-        [0.5508],
-        [0.3457],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5820, 0.3438, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:32,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 18:18:32,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.54 | bwd_microstep: 4627.71 | bwd_inner_microstep: 4623.01 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.24
-[2025-01-25 18:18:32,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4627.73 | bwd_inner: 4623.01 | bwd_allreduce: 4.65 | step: 42.25
- 59%|█████▉    | 3430/5800 [9:32:02<4:32:42,  6.90s/it]                                                       {'loss': 0.0166, 'grad_norm': 0.5088630318641663, 'learning_rate': 1.5102827091254227e-05, 'epoch': 29.57}
- 59%|█████▉    | 3430/5800 [9:32:02<4:32:42,  6.90s/it]score1 tensor([[0.4863],
-        [0.4980],
-        [0.5234],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4922, 0.5625, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:39,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 18:18:39,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.96 | bwd_microstep: 4635.71 | bwd_inner_microstep: 4630.61 | bwd_allreduce_microstep: 5.02 | step_microstep: 48.25
-[2025-01-25 18:18:39,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.92 | bwd: 4635.74 | bwd_inner: 4630.61 | bwd_allreduce: 5.06 | step: 48.26
- 59%|█████▉    | 3431/5800 [9:32:09<4:32:50,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.469700336456299, 'learning_rate': 1.509199970472011e-05, 'epoch': 29.58}
- 59%|█████▉    | 3431/5800 [9:32:09<4:32:50,  6.91s/it]score1 tensor([[0.4570],
-        [0.4785],
-        [0.4473],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4863, 0.4492, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:46,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 18:18:46,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.38 | bwd_microstep: 4627.61 | bwd_inner_microstep: 4622.52 | bwd_allreduce_microstep: 5.01 | step_microstep: 47.74
-[2025-01-25 18:18:46,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.34 | bwd: 4627.63 | bwd_inner: 4622.52 | bwd_allreduce: 5.04 | step: 47.75
- 59%|█████▉    | 3432/5800 [9:32:16<4:32:47,  6.91s/it]                                                       {'loss': 0.0088, 'grad_norm': 0.5826060175895691, 'learning_rate': 1.5081173848585642e-05, 'epoch': 29.59}
- 59%|█████▉    | 3432/5800 [9:32:16<4:32:47,  6.91s/it]score1 tensor([[0.5430],
-        [0.4961],
-        [0.5430],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4863, 0.5391, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:18:53,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 18:18:53,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.36 | bwd_microstep: 4631.52 | bwd_inner_microstep: 4626.55 | bwd_allreduce_microstep: 4.88 | step_microstep: 42.50
-[2025-01-25 18:18:53,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.30 | bwd: 4631.55 | bwd_inner: 4626.55 | bwd_allreduce: 4.93 | step: 42.52
- 59%|█████▉    | 3433/5800 [9:32:23<4:32:40,  6.91s/it]                                                       {'loss': 0.0054, 'grad_norm': 4.155144214630127, 'learning_rate': 1.5070349526226523e-05, 'epoch': 29.59}
- 59%|█████▉    | 3433/5800 [9:32:23<4:32:40,  6.91s/it]score1 tensor([[0.3828],
-        [0.3691],
-        [0.4121],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.3652, 0.4121, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:00,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 18:19:00,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.08 | bwd_microstep: 4580.78 | bwd_inner_microstep: 4575.62 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.30
-[2025-01-25 18:19:00,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.05 | bwd: 4580.80 | bwd_inner: 4575.62 | bwd_allreduce: 5.11 | step: 43.31
- 59%|█████▉    | 3434/5800 [9:32:30<4:32:00,  6.90s/it]                                                       {'loss': 0.0054, 'grad_norm': 5.386696815490723, 'learning_rate': 1.505952674101795e-05, 'epoch': 29.6}
- 59%|█████▉    | 3434/5800 [9:32:30<4:32:00,  6.90s/it]score1 tensor([[0.3242],
-        [0.4922],
-        [0.6641],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.5000, 0.6641, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:07,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:19:07,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.90 | bwd_microstep: 4580.98 | bwd_inner_microstep: 4576.05 | bwd_allreduce_microstep: 4.86 | step_microstep: 45.21
-[2025-01-25 18:19:07,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.87 | bwd: 4581.00 | bwd_inner: 4576.05 | bwd_allreduce: 4.89 | step: 45.21
- 59%|█████▉    | 3435/5800 [9:32:37<4:31:31,  6.89s/it]                                                       {'loss': 0.0093, 'grad_norm': 2.536437511444092, 'learning_rate': 1.5048705496334672e-05, 'epoch': 29.61}
- 59%|█████▉    | 3435/5800 [9:32:37<4:31:31,  6.89s/it]score1 tensor([[0.5430],
-        [0.3984],
-        [0.3633],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4004, 0.3789, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:13,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:19:13,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.44 | bwd_microstep: 4638.08 | bwd_inner_microstep: 4633.19 | bwd_allreduce_microstep: 4.79 | step_microstep: 45.18
-[2025-01-25 18:19:13,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.41 | bwd: 4638.10 | bwd_inner: 4633.19 | bwd_allreduce: 4.84 | step: 45.20
- 59%|█████▉    | 3436/5800 [9:32:43<4:31:52,  6.90s/it]                                                       {'loss': 0.0103, 'grad_norm': 7.514510631561279, 'learning_rate': 1.5037885795550928e-05, 'epoch': 29.62}
- 59%|█████▉    | 3436/5800 [9:32:43<4:31:52,  6.90s/it]score1 tensor([[0.4316],
-        [0.4121],
-        [0.5820],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4316, 0.5898, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:20,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 18:19:20,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.54 | bwd_microstep: 4634.22 | bwd_inner_microstep: 4629.08 | bwd_allreduce_microstep: 5.04 | step_microstep: 44.56
-[2025-01-25 18:19:20,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.51 | bwd: 4634.25 | bwd_inner: 4629.08 | bwd_allreduce: 5.09 | step: 44.57
- 59%|█████▉    | 3437/5800 [9:32:50<4:31:56,  6.90s/it]                                                       {'loss': 0.0142, 'grad_norm': 7.979158401489258, 'learning_rate': 1.5027067642040493e-05, 'epoch': 29.63}
- 59%|█████▉    | 3437/5800 [9:32:50<4:31:56,  6.90s/it]score1 tensor([[0.4805],
-        [0.5117],
-        [0.4863],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5469, 0.4980, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:27,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:19:27,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.03 | bwd_microstep: 4632.02 | bwd_inner_microstep: 4627.39 | bwd_allreduce_microstep: 4.55 | step_microstep: 46.43
-[2025-01-25 18:19:27,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.98 | bwd: 4632.05 | bwd_inner: 4627.39 | bwd_allreduce: 4.59 | step: 46.45
- 59%|█████▉    | 3438/5800 [9:32:57<4:32:01,  6.91s/it]                                                       {'loss': 0.019, 'grad_norm': 4.050300598144531, 'learning_rate': 1.501625103917666e-05, 'epoch': 29.64}
- 59%|█████▉    | 3438/5800 [9:32:57<4:32:01,  6.91s/it]score1 tensor([[0.5039],
-        [0.5820],
-        [0.5000],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5586, 0.4863, 0.3613], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:34,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 18:19:34,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4632.57 | bwd_inner_microstep: 4626.03 | bwd_allreduce_microstep: 6.46 | step_microstep: 43.09
-[2025-01-25 18:19:34,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4632.59 | bwd_inner: 4626.03 | bwd_allreduce: 6.49 | step: 43.10
- 59%|█████▉    | 3439/5800 [9:33:04<4:32:03,  6.91s/it]                                                       {'loss': 0.0186, 'grad_norm': 8.034527778625488, 'learning_rate': 1.5005435990332217e-05, 'epoch': 29.65}
- 59%|█████▉    | 3439/5800 [9:33:04<4:32:03,  6.91s/it]score1 tensor([[0.4434],
-        [0.3926],
-        [0.4785],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4043, 0.4844, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:41,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:19:41,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.94 | bwd_microstep: 4632.32 | bwd_inner_microstep: 4627.29 | bwd_allreduce_microstep: 4.93 | step_microstep: 44.63
-[2025-01-25 18:19:41,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.90 | bwd: 4632.35 | bwd_inner: 4627.29 | bwd_allreduce: 4.99 | step: 44.63
- 59%|█████▉    | 3440/5800 [9:33:11<4:32:04,  6.92s/it]                                                       {'loss': 0.0166, 'grad_norm': 0.6081334948539734, 'learning_rate': 1.499462249887951e-05, 'epoch': 29.66}
- 59%|█████▉    | 3440/5800 [9:33:11<4:32:04,  6.92s/it]score1 tensor([[0.4355],
-        [0.5195],
-        [0.6680],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5117, 0.6289, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:48,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 18:19:48,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.26 | bwd_microstep: 4632.23 | bwd_inner_microstep: 4627.26 | bwd_allreduce_microstep: 4.90 | step_microstep: 41.47
-[2025-01-25 18:19:48,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4632.26 | bwd_inner: 4627.26 | bwd_allreduce: 4.93 | step: 41.48
- 59%|█████▉    | 3441/5800 [9:33:18<4:31:53,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 4.594008922576904, 'learning_rate': 1.4983810568190356e-05, 'epoch': 29.66}
- 59%|█████▉    | 3441/5800 [9:33:18<4:31:53,  6.92s/it]score1 tensor([[0.6562],
-        [0.4668],
-        [0.4160],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4746, 0.4043, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:19:55,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 18:19:55,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.20 | bwd_microstep: 4632.63 | bwd_inner_microstep: 4627.62 | bwd_allreduce_microstep: 4.92 | step_microstep: 45.96
-[2025-01-25 18:19:55,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.16 | bwd: 4632.66 | bwd_inner: 4627.62 | bwd_allreduce: 4.97 | step: 45.96
- 59%|█████▉    | 3442/5800 [9:33:25<4:31:49,  6.92s/it]                                                       {'loss': 0.0088, 'grad_norm': 0.4724644124507904, 'learning_rate': 1.4973000201636097e-05, 'epoch': 29.67}
- 59%|█████▉    | 3442/5800 [9:33:25<4:31:49,  6.92s/it]score1 tensor([[0.5195],
-        [0.4688],
-        [0.4922],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4551, 0.4844, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:02,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 18:20:02,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.78 | bwd_microstep: 4575.19 | bwd_inner_microstep: 4570.12 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.37
-[2025-01-25 18:20:02,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.75 | bwd: 4575.21 | bwd_inner: 4570.12 | bwd_allreduce: 5.02 | step: 42.37
- 59%|█████▉    | 3443/5800 [9:33:32<4:30:59,  6.90s/it]                                                       {'loss': 0.0088, 'grad_norm': 5.936038494110107, 'learning_rate': 1.4962191402587608e-05, 'epoch': 29.68}
- 59%|█████▉    | 3443/5800 [9:33:32<4:30:59,  6.90s/it]score1 tensor([[0.4238],
-        [0.4746],
-        [0.4844],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4512, 0.4707, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:09,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:20:09,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4629.82 | bwd_inner_microstep: 4624.34 | bwd_allreduce_microstep: 5.40 | step_microstep: 44.22
-[2025-01-25 18:20:09,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4629.84 | bwd_inner: 4624.34 | bwd_allreduce: 5.44 | step: 44.23
- 59%|█████▉    | 3444/5800 [9:33:39<4:31:02,  6.90s/it]                                                       {'loss': 0.0146, 'grad_norm': 7.708193778991699, 'learning_rate': 1.495138417441525e-05, 'epoch': 29.69}
- 59%|█████▉    | 3444/5800 [9:33:39<4:31:02,  6.90s/it]score1 tensor([[0.4688],
-        [0.5742],
-        [0.4824],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5703, 0.4688, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:16,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.07 | optimizer_step: 4.37
-[2025-01-25 18:20:16,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.22 | bwd_microstep: 4635.43 | bwd_inner_microstep: 4628.02 | bwd_allreduce_microstep: 7.27 | step_microstep: 42.20
-[2025-01-25 18:20:16,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.18 | bwd: 4635.46 | bwd_inner: 4628.02 | bwd_allreduce: 7.35 | step: 42.26
- 59%|█████▉    | 3445/5800 [9:33:46<4:31:04,  6.91s/it]                                                       {'loss': 0.0088, 'grad_norm': 8.117509841918945, 'learning_rate': 1.494057852048891e-05, 'epoch': 29.7}
- 59%|█████▉    | 3445/5800 [9:33:46<4:31:04,  6.91s/it]score1 tensor([[0.5352],
-        [0.5000],
-        [0.5352],
-        [0.7031]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4941, 0.5547, 0.6836], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:23,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 18:20:23,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.85 | bwd_microstep: 4629.99 | bwd_inner_microstep: 4625.29 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.84
-[2025-01-25 18:20:23,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.78 | bwd: 4630.02 | bwd_inner: 4625.29 | bwd_allreduce: 4.66 | step: 42.85
- 59%|█████▉    | 3446/5800 [9:33:53<4:31:00,  6.91s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.558979034423828, 'learning_rate': 1.4929774444177972e-05, 'epoch': 29.71}
- 59%|█████▉    | 3446/5800 [9:33:53<4:31:00,  6.91s/it]score1 tensor([[0.4785],
-        [0.4805],
-        [0.6172],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4766, 0.6406, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:29,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:20:29,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.11 | bwd_microstep: 4582.38 | bwd_inner_microstep: 4578.03 | bwd_allreduce_microstep: 4.27 | step_microstep: 42.51
-[2025-01-25 18:20:29,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.07 | bwd: 4582.41 | bwd_inner: 4578.03 | bwd_allreduce: 4.31 | step: 42.52
- 59%|█████▉    | 3447/5800 [9:33:59<4:30:23,  6.89s/it]                                                       {'loss': 0.0093, 'grad_norm': 2.182312250137329, 'learning_rate': 1.4918971948851342e-05, 'epoch': 29.72}
- 59%|█████▉    | 3447/5800 [9:33:59<4:30:23,  6.89s/it]score1 tensor([[0.5000],
-        [0.6055],
-        [0.5977],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.6094, 0.6172, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:36,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:20:36,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.22 | bwd_microstep: 4632.04 | bwd_inner_microstep: 4626.89 | bwd_allreduce_microstep: 5.07 | step_microstep: 47.19
-[2025-01-25 18:20:36,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.18 | bwd: 4632.07 | bwd_inner: 4626.89 | bwd_allreduce: 5.11 | step: 47.21
- 59%|█████▉    | 3448/5800 [9:34:06<4:30:48,  6.91s/it]                                                       {'loss': 0.0098, 'grad_norm': 8.816826820373535, 'learning_rate': 1.4908171037877426e-05, 'epoch': 29.72}
- 59%|█████▉    | 3448/5800 [9:34:06<4:30:48,  6.91s/it]score1 tensor([[0.6289],
-        [0.5625],
-        [0.4609],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5742, 0.4629, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:43,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 18:20:43,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.35 | bwd_microstep: 4632.79 | bwd_inner_microstep: 4627.88 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.01
-[2025-01-25 18:20:43,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.32 | bwd: 4632.82 | bwd_inner: 4627.88 | bwd_allreduce: 4.84 | step: 42.02
- 59%|█████▉    | 3449/5800 [9:34:13<4:30:43,  6.91s/it]                                                       {'loss': 0.0068, 'grad_norm': 3.5138790607452393, 'learning_rate': 1.4897371714624123e-05, 'epoch': 29.73}
- 59%|█████▉    | 3449/5800 [9:34:13<4:30:43,  6.91s/it]score1 tensor([[0.3965],
-        [0.4336],
-        [0.5156],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4355, 0.5547, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:20:50,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:20:50,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.88 | bwd_microstep: 4630.56 | bwd_inner_microstep: 4625.62 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.24
-[2025-01-25 18:20:50,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.83 | bwd: 4630.60 | bwd_inner: 4625.62 | bwd_allreduce: 4.88 | step: 42.24
- 59%|█████▉    | 3450/5800 [9:34:20<4:30:36,  6.91s/it]                                                       {'loss': 0.0205, 'grad_norm': 7.8003740310668945, 'learning_rate': 1.4886573982458862e-05, 'epoch': 29.74}
- 59%|█████▉    | 3450/5800 [9:34:20<4:30:36,  6.91s/it]evaluate!
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1836, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3965]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1738, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1836, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6289]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.3984]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1562, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4121]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6762967667184788
-PLCC_score: 0.6705051858943587
-KRCC_score: 0.4911094393064507
-SRCC_level: 0.6762967667184788
-PLCC_level: 0.6705051858943587
-KRCC_level: 0.4911094393064507
-score1 tensor([[0.5352],
-        [0.4902],
-        [0.6094],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4922, 0.6445, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0327, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:31:19,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 18:31:19,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.34 | bwd_microstep: 4597.94 | bwd_inner_microstep: 4593.33 | bwd_allreduce_microstep: 4.53 | step_microstep: 45.05
-[2025-01-25 18:31:19,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.30 | bwd: 4597.96 | bwd_inner: 4593.33 | bwd_allreduce: 4.57 | step: 45.06
- 60%|█████▉    | 3451/5800 [9:44:49<126:11:16, 193.39s/it]                                                          {'loss': 0.0327, 'grad_norm': 8.577571868896484, 'learning_rate': 1.4875777844748553e-05, 'epoch': 29.75}
- 60%|█████▉    | 3451/5800 [9:44:49<126:11:16, 193.39s/it]score1 tensor([[0.4453],
-        [0.4922],
-        [0.4453],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.5156, 0.4512, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:31:26,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 18:31:26,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.58 | bwd_microstep: 4570.16 | bwd_inner_microstep: 4565.47 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.24
-[2025-01-25 18:31:26,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.54 | bwd: 4570.19 | bwd_inner: 4565.47 | bwd_allreduce: 4.65 | step: 43.25
- 60%|█████▉    | 3452/5800 [9:44:56<89:37:44, 137.42s/it]                                                          {'loss': 0.0151, 'grad_norm': 3.8942501544952393, 'learning_rate': 1.4864983304859632e-05, 'epoch': 29.76}
- 60%|█████▉    | 3452/5800 [9:44:56<89:37:44, 137.42s/it]score1 tensor([[0.5312],
-        [0.4570],
-        [0.5312],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4453, 0.5469, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:31:32,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 18:31:32,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.23 | bwd_microstep: 4583.92 | bwd_inner_microstep: 4579.08 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.37
-[2025-01-25 18:31:32,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.18 | bwd: 4583.95 | bwd_inner: 4579.08 | bwd_allreduce: 4.80 | step: 43.37
- 60%|█████▉    | 3453/5800 [9:45:02<64:03:10, 98.25s/it]                                                         {'loss': 0.0176, 'grad_norm': 4.2342376708984375, 'learning_rate': 1.4854190366158017e-05, 'epoch': 29.77}
- 60%|█████▉    | 3453/5800 [9:45:02<64:03:10, 98.25s/it]score1 tensor([[0.5234],
-        [0.5430],
-        [0.4121],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5430, 0.4023, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:31:39,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:31:39,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.62 | bwd_microstep: 4540.42 | bwd_inner_microstep: 4535.42 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.19
-[2025-01-25 18:31:39,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.58 | bwd: 4540.44 | bwd_inner: 4535.42 | bwd_allreduce: 4.95 | step: 44.19
- 60%|█████▉    | 3454/5800 [9:45:09<46:08:49, 70.81s/it]                                                        {'loss': 0.0112, 'grad_norm': 2.157557964324951, 'learning_rate': 1.4843399032009128e-05, 'epoch': 29.78}
- 60%|█████▉    | 3454/5800 [9:45:09<46:08:49, 70.81s/it]score1 tensor([[0.4551],
-        [0.6094],
-        [0.3438],
-        [0.1680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6133, 0.3730, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0125, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:31:46,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:31:46,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.58 | bwd_microstep: 4600.18 | bwd_inner_microstep: 4595.18 | bwd_allreduce_microstep: 4.92 | step_microstep: 50.33
-[2025-01-25 18:31:46,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.55 | bwd: 4600.21 | bwd_inner: 4595.18 | bwd_allreduce: 4.96 | step: 50.33
- 60%|█████▉    | 3455/5800 [9:45:16<33:37:47, 51.63s/it]                                                        {'loss': 0.0125, 'grad_norm': 3.094984531402588, 'learning_rate': 1.48326093057779e-05, 'epoch': 29.78}
- 60%|█████▉    | 3455/5800 [9:45:16<33:37:47, 51.63s/it]score1 tensor([[0.6211],
-        [0.6367],
-        [0.3555],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6094, 0.3477, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:31:53,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:31:53,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.58 | bwd_microstep: 4608.16 | bwd_inner_microstep: 4603.22 | bwd_allreduce_microstep: 4.85 | step_microstep: 45.25
-[2025-01-25 18:31:53,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.54 | bwd: 4608.18 | bwd_inner: 4603.22 | bwd_allreduce: 4.89 | step: 45.26
- 60%|█████▉    | 3456/5800 [9:45:23<24:52:25, 38.20s/it]                                                        {'loss': 0.0142, 'grad_norm': 4.64267635345459, 'learning_rate': 1.4821821190828747e-05, 'epoch': 29.79}
- 60%|█████▉    | 3456/5800 [9:45:23<24:52:25, 38.20s/it]score1 tensor([[0.5547],
-        [0.5586],
-        [0.5117],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5625, 0.5312, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:00,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 18:32:00,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.20 | bwd_microstep: 4609.08 | bwd_inner_microstep: 4604.59 | bwd_allreduce_microstep: 4.42 | step_microstep: 42.95
-[2025-01-25 18:32:00,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.16 | bwd: 4609.12 | bwd_inner: 4604.59 | bwd_allreduce: 4.45 | step: 42.96
- 60%|█████▉    | 3457/5800 [9:45:30<18:44:45, 28.80s/it]                                                        {'loss': 0.0156, 'grad_norm': 4.411694526672363, 'learning_rate': 1.4811034690525603e-05, 'epoch': 29.8}
- 60%|█████▉    | 3457/5800 [9:45:30<18:44:45, 28.80s/it]score1 tensor([[0.5195],
-        [0.5430],
-        [0.4844],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5625, 0.4590, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:07,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 18:32:07,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.08 | bwd_microstep: 4617.49 | bwd_inner_microstep: 4612.64 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.59
-[2025-01-25 18:32:07,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.05 | bwd: 4617.51 | bwd_inner: 4612.64 | bwd_allreduce: 4.80 | step: 42.59
- 60%|█████▉    | 3458/5800 [9:45:37<14:27:37, 22.23s/it]                                                        {'loss': 0.0146, 'grad_norm': 3.7631235122680664, 'learning_rate': 1.480024980823187e-05, 'epoch': 29.81}
- 60%|█████▉    | 3458/5800 [9:45:37<14:27:37, 22.23s/it]score1 tensor([[0.5078],
-        [0.6250],
-        [0.5000],
-        [0.3242]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.6055, 0.4902, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:14,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 18:32:14,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.28 | bwd_microstep: 4550.42 | bwd_inner_microstep: 4544.88 | bwd_allreduce_microstep: 5.45 | step_microstep: 47.80
-[2025-01-25 18:32:14,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.24 | bwd: 4550.44 | bwd_inner: 4544.88 | bwd_allreduce: 5.50 | step: 47.81
- 60%|█████▉    | 3459/5800 [9:45:43<11:26:58, 17.61s/it]                                                        {'loss': 0.0078, 'grad_norm': 5.961917400360107, 'learning_rate': 1.4789466547310483e-05, 'epoch': 29.82}
- 60%|█████▉    | 3459/5800 [9:45:43<11:26:58, 17.61s/it]score1 tensor([[0.4043],
-        [0.4609],
-        [0.5156],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4590, 0.4980, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:20,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.32 | optimizer_step: 4.36
-[2025-01-25 18:32:20,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.30 | bwd_microstep: 4567.25 | bwd_inner_microstep: 4562.88 | bwd_allreduce_microstep: 4.29 | step_microstep: 39.55
-[2025-01-25 18:32:20,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.26 | bwd: 4567.27 | bwd_inner: 4562.88 | bwd_allreduce: 4.33 | step: 39.56
- 60%|█████▉    | 3460/5800 [9:45:50<9:20:42, 14.38s/it]                                                        {'loss': 0.0103, 'grad_norm': 2.2312543392181396, 'learning_rate': 1.4778684911123833e-05, 'epoch': 29.83}
- 60%|█████▉    | 3460/5800 [9:45:50<9:20:42, 14.38s/it]score1 tensor([[0.6250],
-        [0.4785],
-        [0.4707],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4844, 0.4648, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:27,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:32:27,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.14 | bwd_microstep: 4613.68 | bwd_inner_microstep: 4608.84 | bwd_allreduce_microstep: 4.73 | step_microstep: 41.49
-[2025-01-25 18:32:27,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.12 | bwd: 4613.70 | bwd_inner: 4608.84 | bwd_allreduce: 4.78 | step: 41.50
- 60%|█████▉    | 3461/5800 [9:45:57<7:52:49, 12.13s/it]                                                       {'loss': 0.0107, 'grad_norm': 4.3903326988220215, 'learning_rate': 1.4767904903033818e-05, 'epoch': 29.84}
- 60%|█████▉    | 3461/5800 [9:45:57<7:52:49, 12.13s/it]score1 tensor([[0.5664],
-        [0.5703],
-        [0.5508],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5625, 0.5508, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:34,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 18:32:34,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.98 | bwd_microstep: 4577.44 | bwd_inner_microstep: 4572.34 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.22
-[2025-01-25 18:32:34,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.94 | bwd: 4577.47 | bwd_inner: 4572.34 | bwd_allreduce: 5.05 | step: 44.23
- 60%|█████▉    | 3462/5800 [9:46:04<6:50:54, 10.55s/it]                                                       {'loss': 0.0122, 'grad_norm': 2.4314520359039307, 'learning_rate': 1.4757126526401848e-05, 'epoch': 29.84}
- 60%|█████▉    | 3462/5800 [9:46:04<6:50:54, 10.55s/it]score1 tensor([[0.5078],
-        [0.4453],
-        [0.4941],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4316, 0.4941, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:41,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 18:32:41,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.25 | bwd_microstep: 4565.96 | bwd_inner_microstep: 4561.10 | bwd_allreduce_microstep: 4.77 | step_microstep: 44.46
-[2025-01-25 18:32:41,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.22 | bwd: 4565.99 | bwd_inner: 4561.10 | bwd_allreduce: 4.81 | step: 44.46
- 60%|█████▉    | 3463/5800 [9:46:11<6:07:35,  9.44s/it]                                                       {'loss': 0.0122, 'grad_norm': 2.172893762588501, 'learning_rate': 1.474634978458879e-05, 'epoch': 29.85}
- 60%|█████▉    | 3463/5800 [9:46:11<6:07:35,  9.44s/it]score1 tensor([[0.3926],
-        [0.4609],
-        [0.5078],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4688, 0.5078, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:48,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 18:32:48,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.11 | bwd_microstep: 4573.94 | bwd_inner_microstep: 4568.89 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.85
-[2025-01-25 18:32:48,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.07 | bwd: 4573.96 | bwd_inner: 4568.89 | bwd_allreduce: 5.00 | step: 46.86
- 60%|█████▉    | 3464/5800 [9:46:18<5:37:11,  8.66s/it]                                                       {'loss': 0.0117, 'grad_norm': 5.821758270263672, 'learning_rate': 1.4735574680955034e-05, 'epoch': 29.86}
- 60%|█████▉    | 3464/5800 [9:46:18<5:37:11,  8.66s/it]score1 tensor([[0.5352],
-        [0.5781],
-        [0.4785],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5820, 0.4980, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:32:55,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:32:55,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.78 | bwd_microstep: 4617.86 | bwd_inner_microstep: 4612.89 | bwd_allreduce_microstep: 4.87 | step_microstep: 47.51
-[2025-01-25 18:32:55,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.76 | bwd: 4617.88 | bwd_inner: 4612.89 | bwd_allreduce: 4.92 | step: 47.52
- 60%|█████▉    | 3465/5800 [9:46:25<5:16:24,  8.13s/it]                                                       {'loss': 0.0098, 'grad_norm': 4.223923683166504, 'learning_rate': 1.4724801218860435e-05, 'epoch': 29.87}
- 60%|█████▉    | 3465/5800 [9:46:25<5:16:24,  8.13s/it]score1 tensor([[0.5039],
-        [0.4453],
-        [0.4961],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4707, 0.5117, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:02,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 18:33:02,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.94 | bwd_microstep: 4611.57 | bwd_inner_microstep: 4606.91 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.49
-[2025-01-25 18:33:02,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.91 | bwd: 4611.60 | bwd_inner: 4606.91 | bwd_allreduce: 4.62 | step: 43.50
- 60%|█████▉    | 3466/5800 [9:46:32<5:01:50,  7.76s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.8336338996887207, 'learning_rate': 1.4714029401664355e-05, 'epoch': 29.88}
- 60%|█████▉    | 3466/5800 [9:46:32<5:01:50,  7.76s/it]score1 tensor([[0.4961],
-        [0.5156],
-        [0.6133],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5156, 0.6445, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:08,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:33:08,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.38 | bwd_microstep: 4566.01 | bwd_inner_microstep: 4560.51 | bwd_allreduce_microstep: 5.38 | step_microstep: 46.70
-[2025-01-25 18:33:08,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.33 | bwd: 4566.03 | bwd_inner: 4560.51 | bwd_allreduce: 5.44 | step: 46.71
- 60%|█████▉    | 3467/5800 [9:46:38<4:51:05,  7.49s/it]                                                       {'loss': 0.0225, 'grad_norm': 6.329904079437256, 'learning_rate': 1.4703259232725624e-05, 'epoch': 29.89}
- 60%|█████▉    | 3467/5800 [9:46:38<4:51:05,  7.49s/it]score1 tensor([[0.4512],
-        [0.4785],
-        [0.4922],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4688, 0.4980, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:15,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 18:33:15,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.37 | bwd_microstep: 4624.11 | bwd_inner_microstep: 4619.11 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.40
-[2025-01-25 18:33:15,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.32 | bwd: 4624.14 | bwd_inner: 4619.11 | bwd_allreduce: 4.96 | step: 43.40
- 60%|█████▉    | 3468/5800 [9:46:45<4:44:06,  7.31s/it]                                                       {'loss': 0.0054, 'grad_norm': 0.37413454055786133, 'learning_rate': 1.4692490715402565e-05, 'epoch': 29.9}
- 60%|█████▉    | 3468/5800 [9:46:45<4:44:06,  7.31s/it]score1 tensor([[0.4590],
-        [0.4766],
-        [0.4512],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4727, 0.4473, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:22,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 18:33:22,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.57 | bwd_microstep: 4580.08 | bwd_inner_microstep: 4575.14 | bwd_allreduce_microstep: 4.87 | step_microstep: 45.00
-[2025-01-25 18:33:22,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.54 | bwd: 4580.11 | bwd_inner: 4575.14 | bwd_allreduce: 4.90 | step: 45.01
- 60%|█████▉    | 3469/5800 [9:46:52<4:38:46,  7.18s/it]                                                       {'loss': 0.0039, 'grad_norm': 1.949422001838684, 'learning_rate': 1.4681723853053002e-05, 'epoch': 29.91}
- 60%|█████▉    | 3469/5800 [9:46:52<4:38:46,  7.18s/it]score1 tensor([[0.4863],
-        [0.5352],
-        [0.5508],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5391, 0.5430, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:29,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 18:33:29,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4616.82 | bwd_inner_microstep: 4611.79 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.84
-[2025-01-25 18:33:29,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4616.84 | bwd_inner: 4611.79 | bwd_allreduce: 4.98 | step: 43.84
- 60%|█████▉    | 3470/5800 [9:46:59<4:35:22,  7.09s/it]                                                       {'loss': 0.0044, 'grad_norm': 0.3773377239704132, 'learning_rate': 1.4670958649034213e-05, 'epoch': 29.91}
- 60%|█████▉    | 3470/5800 [9:46:59<4:35:22,  7.09s/it]score1 tensor([[0.4590],
-        [0.5820],
-        [0.5000],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5664, 0.5039, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:36,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:33:36,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4617.49 | bwd_inner_microstep: 4611.89 | bwd_allreduce_microstep: 5.52 | step_microstep: 43.98
-[2025-01-25 18:33:36,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4617.52 | bwd_inner: 4611.89 | bwd_allreduce: 5.55 | step: 44.00
- 60%|█████▉    | 3471/5800 [9:47:06<4:33:02,  7.03s/it]                                                       {'loss': 0.0059, 'grad_norm': 4.0229620933532715, 'learning_rate': 1.4660195106702989e-05, 'epoch': 29.92}
- 60%|█████▉    | 3471/5800 [9:47:06<4:33:02,  7.03s/it]score1 tensor([[0.5078],
-        [0.4258],
-        [0.3809],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4160, 0.4004, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:43,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 18:33:43,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.69 | bwd_microstep: 4615.60 | bwd_inner_microstep: 4609.92 | bwd_allreduce_microstep: 5.55 | step_microstep: 47.29
-[2025-01-25 18:33:43,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.65 | bwd: 4615.63 | bwd_inner: 4609.92 | bwd_allreduce: 5.62 | step: 47.30
- 60%|█████▉    | 3472/5800 [9:47:13<4:31:21,  6.99s/it]                                                       {'loss': 0.0107, 'grad_norm': 0.42540159821510315, 'learning_rate': 1.4649433229415588e-05, 'epoch': 29.93}
- 60%|█████▉    | 3472/5800 [9:47:13<4:31:21,  6.99s/it]score1 tensor([[0.6328],
-        [0.5625],
-        [0.5195],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5547, 0.5508, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:50,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 18:33:50,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.30 | bwd_microstep: 4628.07 | bwd_inner_microstep: 4623.23 | bwd_allreduce_microstep: 4.73 | step_microstep: 48.80
-[2025-01-25 18:33:50,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4628.09 | bwd_inner: 4623.24 | bwd_allreduce: 4.78 | step: 48.80
- 60%|█████▉    | 3473/5800 [9:47:20<4:30:17,  6.97s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.44162464141845703, 'learning_rate': 1.4638673020527739e-05, 'epoch': 29.94}
- 60%|█████▉    | 3473/5800 [9:47:20<4:30:17,  6.97s/it]score1 tensor([[0.5703],
-        [0.5742],
-        [0.6367],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5664, 0.6211, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:33:57,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 18:33:57,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.62 | bwd_microstep: 4624.44 | bwd_inner_microstep: 4619.59 | bwd_allreduce_microstep: 4.75 | step_microstep: 49.05
-[2025-01-25 18:33:57,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.58 | bwd: 4624.47 | bwd_inner: 4619.59 | bwd_allreduce: 4.80 | step: 49.06
- 60%|█████▉    | 3474/5800 [9:47:27<4:29:33,  6.95s/it]                                                       {'loss': 0.0171, 'grad_norm': 0.6849909424781799, 'learning_rate': 1.4627914483394677e-05, 'epoch': 29.95}
- 60%|█████▉    | 3474/5800 [9:47:27<4:29:33,  6.95s/it]score1 tensor([[0.6328],
-        [0.6445],
-        [0.5000],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6445, 0.5156, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:34:04,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:34:04,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.77 | bwd_microstep: 4573.29 | bwd_inner_microstep: 4568.34 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.62
-[2025-01-25 18:34:04,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.74 | bwd: 4573.32 | bwd_inner: 4568.34 | bwd_allreduce: 4.90 | step: 44.62
- 60%|█████▉    | 3475/5800 [9:47:34<4:28:13,  6.92s/it]                                                       {'loss': 0.0093, 'grad_norm': 6.210969924926758, 'learning_rate': 1.4617157621371085e-05, 'epoch': 29.96}
- 60%|█████▉    | 3475/5800 [9:47:34<4:28:13,  6.92s/it]score1 tensor([[0.3730],
-        [0.4766],
-        [0.5117],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4844, 0.5117, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:34:10,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 18:34:10,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.24 | bwd_microstep: 4574.52 | bwd_inner_microstep: 4569.57 | bwd_allreduce_microstep: 4.84 | step_microstep: 41.50
-[2025-01-25 18:34:10,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.21 | bwd: 4574.54 | bwd_inner: 4569.57 | bwd_allreduce: 4.89 | step: 41.51
- 60%|█████▉    | 3476/5800 [9:47:40<4:27:17,  6.90s/it]                                                       {'loss': 0.0083, 'grad_norm': 5.891745567321777, 'learning_rate': 1.4606402437811156e-05, 'epoch': 29.97}
- 60%|█████▉    | 3476/5800 [9:47:40<4:27:17,  6.90s/it]score1 tensor([[0.5469],
-        [0.4746],
-        [0.5781],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4824, 0.5977, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:34:17,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 18:34:17,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.21 | bwd_microstep: 4622.39 | bwd_inner_microstep: 4616.90 | bwd_allreduce_microstep: 5.37 | step_microstep: 50.15
-[2025-01-25 18:34:17,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.17 | bwd: 4622.41 | bwd_inner: 4616.90 | bwd_allreduce: 5.43 | step: 50.12
- 60%|█████▉    | 3477/5800 [9:47:47<4:27:14,  6.90s/it]                                                       {'loss': 0.0122, 'grad_norm': 3.9719841480255127, 'learning_rate': 1.4595648936068525e-05, 'epoch': 29.97}
- 60%|█████▉    | 3477/5800 [9:47:47<4:27:14,  6.90s/it]score1 tensor([[0.4824],
-        [0.4902],
-        [0.5234],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4941, 0.5312, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:34:24,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:34:24,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.88 | bwd_microstep: 4624.85 | bwd_inner_microstep: 4619.84 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.84
-[2025-01-25 18:34:24,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.84 | bwd: 4624.89 | bwd_inner: 4619.84 | bwd_allreduce: 4.96 | step: 43.85
- 60%|█████▉    | 3478/5800 [9:47:54<4:27:10,  6.90s/it]                                                       {'loss': 0.0063, 'grad_norm': 0.6266171336174011, 'learning_rate': 1.4584897119496337e-05, 'epoch': 29.98}
- 60%|█████▉    | 3478/5800 [9:47:54<4:27:10,  6.90s/it]score1 tensor([[0.5703],
-        [0.5234],
-        [0.5430],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5352, 0.5391, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:34:31,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 18:34:31,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.89 | bwd_microstep: 4621.96 | bwd_inner_microstep: 4617.06 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.34
-[2025-01-25 18:34:31,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.85 | bwd: 4621.99 | bwd_inner: 4617.06 | bwd_allreduce: 4.80 | step: 43.35
- 60%|█████▉    | 3479/5800 [9:48:01<4:27:00,  6.90s/it]                                                       {'loss': 0.0078, 'grad_norm': 0.4455549716949463, 'learning_rate': 1.4574146991447187e-05, 'epoch': 29.99}
- 60%|█████▉    | 3479/5800 [9:48:01<4:27:00,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:34:35,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 18:34:35,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 571.12 | bwd_microstep: 1218.97 | bwd_inner_microstep: 1215.16 | bwd_allreduce_microstep: 3.71 | step_microstep: 40.96
-[2025-01-25 18:34:35,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.06 | bwd: 1218.99 | bwd_inner: 1215.16 | bwd_allreduce: 3.75 | step: 40.97
- 60%|██████    | 3480/5800 [9:48:05<3:54:30,  6.06s/it]                                                       {'loss': 0.0117, 'grad_norm': 8.23853874206543, 'learning_rate': 1.4563398555273143e-05, 'epoch': 30.0}
- 60%|██████    | 3480/5800 [9:48:05<3:54:30,  6.06s/it][2025-01-25 18:34:40,520] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:34:50,630] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:35:00,706] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:35:11,089] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5078],
-        [0.6523],
-        [0.4395],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.6445, 0.4199, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:35:30,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 18:35:30,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.80 | bwd_microstep: 4579.72 | bwd_inner_microstep: 4574.74 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.00
-[2025-01-25 18:35:30,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.76 | bwd: 4579.75 | bwd_inner: 4574.74 | bwd_allreduce: 4.92 | step: 45.01
- 60%|██████    | 3481/5800 [9:49:00<13:17:49, 20.64s/it]                                                        {'loss': 0.022, 'grad_norm': 8.359655380249023, 'learning_rate': 1.4552651814325773e-05, 'epoch': 30.01}
- 60%|██████    | 3481/5800 [9:49:00<13:17:49, 20.64s/it]score1 tensor([[0.6016],
-        [0.4688],
-        [0.4980],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4355, 0.4609, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0283, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:35:37,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:35:37,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.98 | bwd_microstep: 4577.55 | bwd_inner_microstep: 4572.51 | bwd_allreduce_microstep: 4.92 | step_microstep: 40.81
-[2025-01-25 18:35:37,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.94 | bwd: 4577.58 | bwd_inner: 4572.51 | bwd_allreduce: 4.98 | step: 40.81
- 60%|██████    | 3482/5800 [9:49:07<10:37:24, 16.50s/it]                                                        {'loss': 0.0283, 'grad_norm': 8.176764488220215, 'learning_rate': 1.4541906771956079e-05, 'epoch': 30.02}
- 60%|██████    | 3482/5800 [9:49:07<10:37:24, 16.50s/it]score1 tensor([[0.5352],
-        [0.3809],
-        [0.3809],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.3789, 0.3730, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:35:44,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 18:35:44,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.53 | bwd_microstep: 4580.08 | bwd_inner_microstep: 4574.91 | bwd_allreduce_microstep: 5.07 | step_microstep: 45.78
-[2025-01-25 18:35:44,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.50 | bwd: 4580.11 | bwd_inner: 4574.91 | bwd_allreduce: 5.12 | step: 45.79
- 60%|██████    | 3483/5800 [9:49:14<8:45:12, 13.60s/it]                                                        {'loss': 0.0103, 'grad_norm': 3.5246012210845947, 'learning_rate': 1.4531163431514568e-05, 'epoch': 30.03}
- 60%|██████    | 3483/5800 [9:49:14<8:45:12, 13.60s/it]score1 tensor([[0.5938],
-        [0.5664],
-        [0.5156],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.5586, 0.5156, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:35:50,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 18:35:50,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.36 | bwd_microstep: 4547.41 | bwd_inner_microstep: 4542.40 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.58
-[2025-01-25 18:35:50,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.31 | bwd: 4547.43 | bwd_inner: 4542.40 | bwd_allreduce: 4.95 | step: 43.59
- 60%|██████    | 3484/5800 [9:49:20<7:26:24, 11.56s/it]                                                       {'loss': 0.0103, 'grad_norm': 6.487453460693359, 'learning_rate': 1.452042179635119e-05, 'epoch': 30.03}
- 60%|██████    | 3484/5800 [9:49:20<7:26:24, 11.56s/it]score1 tensor([[0.3281],
-        [0.6172],
-        [0.5430],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3438, 0.6055, 0.5156, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:35:57,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 18:35:57,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.39 | bwd_microstep: 4594.23 | bwd_inner_microstep: 4589.73 | bwd_allreduce_microstep: 4.42 | step_microstep: 41.73
-[2025-01-25 18:35:57,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.36 | bwd: 4594.27 | bwd_inner: 4589.74 | bwd_allreduce: 4.46 | step: 41.73
- 60%|██████    | 3485/5800 [9:49:27<6:31:46, 10.15s/it]                                                       {'loss': 0.0278, 'grad_norm': 4.897113800048828, 'learning_rate': 1.4509681869815366e-05, 'epoch': 30.04}
- 60%|██████    | 3485/5800 [9:49:27<6:31:46, 10.15s/it]score1 tensor([[0.3359],
-        [0.4629],
-        [0.5273],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3340, 0.4551, 0.5312, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:04,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 18:36:04,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.18 | bwd_microstep: 4604.56 | bwd_inner_microstep: 4599.45 | bwd_allreduce_microstep: 5.02 | step_microstep: 41.63
-[2025-01-25 18:36:04,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.14 | bwd: 4604.58 | bwd_inner: 4599.45 | bwd_allreduce: 5.06 | step: 41.64
- 60%|████��█    | 3486/5800 [9:49:34<5:53:34,  9.17s/it]                                                       {'loss': 0.0093, 'grad_norm': 1.2438201904296875, 'learning_rate': 1.4498943655256006e-05, 'epoch': 30.05}
- 60%|██████    | 3486/5800 [9:49:34<5:53:34,  9.17s/it]score1 tensor([[0.3965],
-        [0.4531],
-        [0.4102],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4492, 0.4180, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:11,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 18:36:11,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.96 | bwd_microstep: 4600.54 | bwd_inner_microstep: 4595.58 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.21
-[2025-01-25 18:36:11,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.94 | bwd: 4600.56 | bwd_inner: 4595.58 | bwd_allreduce: 4.92 | step: 43.24
- 60%|██████    | 3487/5800 [9:49:41<5:26:49,  8.48s/it]                                                       {'loss': 0.0068, 'grad_norm': 4.132805347442627, 'learning_rate': 1.4488207156021465e-05, 'epoch': 30.06}
- 60%|██████    | 3487/5800 [9:49:41<5:26:49,  8.48s/it]score1 tensor([[0.6641],
-        [0.4316],
-        [0.5195],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6641, 0.4277, 0.4961, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:18,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 18:36:18,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.37 | bwd_microstep: 4554.35 | bwd_inner_microstep: 4549.75 | bwd_allreduce_microstep: 4.49 | step_microstep: 41.10
-[2025-01-25 18:36:18,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.34 | bwd: 4554.38 | bwd_inner: 4549.75 | bwd_allreduce: 4.54 | step: 41.11
- 60%|██████    | 3488/5800 [9:49:48<5:07:28,  7.98s/it]                                                       {'loss': 0.0068, 'grad_norm': 3.952421188354492, 'learning_rate': 1.4477472375459573e-05, 'epoch': 30.07}
- 60%|██████    | 3488/5800 [9:49:48<5:07:28,  7.98s/it]score1 tensor([[0.5938],
-        [0.4609],
-        [0.5742],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4727, 0.5703, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:25,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 18:36:25,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.56 | bwd_microstep: 4607.63 | bwd_inner_microstep: 4601.28 | bwd_allreduce_microstep: 6.26 | step_microstep: 42.23
-[2025-01-25 18:36:25,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.53 | bwd: 4607.66 | bwd_inner: 4601.28 | bwd_allreduce: 6.30 | step: 42.24
- 60%|██████    | 3489/5800 [9:49:55<4:54:31,  7.65s/it]                                                       {'loss': 0.0059, 'grad_norm': 0.6989167332649231, 'learning_rate': 1.4466739316917618e-05, 'epoch': 30.08}
- 60%|██████    | 3489/5800 [9:49:55<4:54:31,  7.65s/it]score1 tensor([[0.5352],
-        [0.4590],
-        [0.5508],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4805, 0.5625, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:32,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 18:36:32,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.74 | bwd_microstep: 4603.08 | bwd_inner_microstep: 4598.56 | bwd_allreduce_microstep: 4.41 | step_microstep: 42.01
-[2025-01-25 18:36:32,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.69 | bwd: 4603.10 | bwd_inner: 4598.56 | bwd_allreduce: 4.48 | step: 42.01
- 60%|██████    | 3490/5800 [9:50:01<4:45:25,  7.41s/it]                                                       {'loss': 0.0117, 'grad_norm': 8.12009334564209, 'learning_rate': 1.445600798374235e-05, 'epoch': 30.09}
- 60%|██████    | 3490/5800 [9:50:01<4:45:25,  7.41s/it]score1 tensor([[0.5938],
-        [0.3809],
-        [0.5820],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4238, 0.5820, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:38,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 18:36:38,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.38 | bwd_microstep: 4533.55 | bwd_inner_microstep: 4528.77 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.77
-[2025-01-25 18:36:38,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.35 | bwd: 4533.57 | bwd_inner: 4528.77 | bwd_allreduce: 4.74 | step: 42.77
- 60%|██████    | 3491/5800 [9:50:08<4:38:14,  7.23s/it]                                                       {'loss': 0.0166, 'grad_norm': 3.9715120792388916, 'learning_rate': 1.4445278379279999e-05, 'epoch': 30.09}
- 60%|██████    | 3491/5800 [9:50:08<4:38:14,  7.23s/it]score1 tensor([[0.5508],
-        [0.6211],
-        [0.4727],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6133, 0.4785, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:45,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 18:36:45,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.11 | bwd_microstep: 4615.83 | bwd_inner_microstep: 4611.15 | bwd_allreduce_microstep: 4.59 | step_microstep: 41.56
-[2025-01-25 18:36:45,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.05 | bwd: 4615.85 | bwd_inner: 4611.15 | bwd_allreduce: 4.63 | step: 41.57
- 60%|██████    | 3492/5800 [9:50:15<4:34:10,  7.13s/it]                                                       {'loss': 0.0083, 'grad_norm': 3.761986017227173, 'learning_rate': 1.4434550506876219e-05, 'epoch': 30.1}
- 60%|██████    | 3492/5800 [9:50:15<4:34:10,  7.13s/it]score1 tensor([[0.6445],
-        [0.6680],
-        [0.5000],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6875, 0.4980, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:52,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 18:36:52,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.03 | bwd_microstep: 4611.11 | bwd_inner_microstep: 4606.25 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.60
-[2025-01-25 18:36:52,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.00 | bwd: 4611.14 | bwd_inner: 4606.25 | bwd_allreduce: 4.81 | step: 42.61
- 60%|██████    | 3493/5800 [9:50:22<4:31:19,  7.06s/it]                                                       {'loss': 0.0098, 'grad_norm': 1.105849266052246, 'learning_rate': 1.4423824369876172e-05, 'epoch': 30.11}
- 60%|██████    | 3493/5800 [9:50:22<4:31:19,  7.06s/it]score1 tensor([[0.5469],
-        [0.4199],
-        [0.2754],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4434, 0.3086, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:36:59,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 18:36:59,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.55 | bwd_microstep: 4611.35 | bwd_inner_microstep: 4606.81 | bwd_allreduce_microstep: 4.48 | step_microstep: 40.94
-[2025-01-25 18:36:59,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.52 | bwd: 4611.38 | bwd_inner: 4606.81 | bwd_allreduce: 4.51 | step: 40.95
- 60%|██████    | 3494/5800 [9:50:29<4:29:11,  7.00s/it]                                                       {'loss': 0.0176, 'grad_norm': 3.1240971088409424, 'learning_rate': 1.4413099971624431e-05, 'epoch': 30.12}
- 60%|██████    | 3494/5800 [9:50:29<4:29:11,  7.00s/it]score1 tensor([[0.3906],
-        [0.5352],
-        [0.6523],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4844, 0.6406, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:06,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:37:06,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.41 | bwd_microstep: 4613.44 | bwd_inner_microstep: 4608.58 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.16
-[2025-01-25 18:37:06,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.38 | bwd: 4613.47 | bwd_inner: 4608.57 | bwd_allreduce: 4.81 | step: 42.17
- 60%|██████    | 3495/5800 [9:50:36<4:27:43,  6.97s/it]                                                       {'loss': 0.022, 'grad_norm': 4.785723686218262, 'learning_rate': 1.440237731546507e-05, 'epoch': 30.13}
- 60%|██████    | 3495/5800 [9:50:36<4:27:43,  6.97s/it]score1 tensor([[0.3848],
-        [0.3887],
-        [0.6484],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.3945, 0.6289, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:13,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 18:37:13,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.15 | bwd_microstep: 4640.22 | bwd_inner_microstep: 4635.31 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.37
-[2025-01-25 18:37:13,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.10 | bwd: 4640.24 | bwd_inner: 4635.31 | bwd_allreduce: 4.86 | step: 42.39
- 60%|██████    | 3496/5800 [9:50:43<4:27:00,  6.95s/it]                                                       {'loss': 0.0127, 'grad_norm': 0.7764579057693481, 'learning_rate': 1.4391656404741578e-05, 'epoch': 30.14}
- 60%|██████    | 3496/5800 [9:50:43<4:27:00,  6.95s/it]score1 tensor([[0.5625],
-        [0.4707],
-        [0.4570],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4512, 0.4453, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:20,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:37:20,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.22 | bwd_microstep: 4640.24 | bwd_inner_microstep: 4635.20 | bwd_allreduce_microstep: 4.90 | step_microstep: 40.78
-[2025-01-25 18:37:20,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.18 | bwd: 4640.27 | bwd_inner: 4635.20 | bwd_allreduce: 4.97 | step: 40.78
- 60%|██████    | 3497/5800 [9:50:50<4:26:30,  6.94s/it]                                                       {'loss': 0.0195, 'grad_norm': 8.211478233337402, 'learning_rate': 1.4380937242796944e-05, 'epoch': 30.15}
- 60%|██████    | 3497/5800 [9:50:50<4:26:30,  6.94s/it]score1 tensor([[0.4727],
-        [0.5195],
-        [0.4238],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.5000, 0.4258, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:27,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:37:27,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4635.79 | bwd_inner_microstep: 4631.22 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.97
-[2025-01-25 18:37:27,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4635.83 | bwd_inner: 4631.22 | bwd_allreduce: 4.51 | step: 41.98
- 60%|██████    | 3498/5800 [9:50:57<4:26:07,  6.94s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.51209020614624, 'learning_rate': 1.4370219832973575e-05, 'epoch': 30.16}
- 60%|██████    | 3498/5800 [9:50:57<4:26:07,  6.94s/it]score1 tensor([[0.6445],
-        [0.4316],
-        [0.5547],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4121, 0.5430, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:34,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 18:37:34,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.28 | bwd_microstep: 4641.80 | bwd_inner_microstep: 4636.19 | bwd_allreduce_microstep: 5.50 | step_microstep: 46.73
-[2025-01-25 18:37:34,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.17 | bwd: 4641.82 | bwd_inner: 4636.19 | bwd_allreduce: 5.55 | step: 46.73
- 60%|██████    | 3499/5800 [9:51:04<4:26:04,  6.94s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.6061258912086487, 'learning_rate': 1.435950417861334e-05, 'epoch': 30.16}
- 60%|███��██    | 3499/5800 [9:51:04<4:26:04,  6.94s/it]score1 tensor([[0.4180],
-        [0.5391],
-        [0.5273],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5547, 0.4922, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:40,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 18:37:40,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.67 | bwd_microstep: 4641.93 | bwd_inner_microstep: 4637.24 | bwd_allreduce_microstep: 4.58 | step_microstep: 46.75
-[2025-01-25 18:37:40,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.64 | bwd: 4641.95 | bwd_inner: 4637.24 | bwd_allreduce: 4.64 | step: 46.77
- 60%|██████    | 3500/5800 [9:51:10<4:25:49,  6.93s/it]                                                       {'loss': 0.021, 'grad_norm': 0.4607834815979004, 'learning_rate': 1.4348790283057582e-05, 'epoch': 30.17}
- 60%|██████    | 3500/5800 [9:51:10<4:25:49,  6.93s/it]score1 tensor([[0.5938],
-        [0.4785],
-        [0.6133],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4590, 0.6133, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:47,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:37:47,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.46 | bwd_microstep: 4592.94 | bwd_inner_microstep: 4588.49 | bwd_allreduce_microstep: 4.35 | step_microstep: 42.52
-[2025-01-25 18:37:47,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.42 | bwd: 4592.96 | bwd_inner: 4588.50 | bwd_allreduce: 4.39 | step: 42.51
- 60%|██████    | 3501/5800 [9:51:17<4:25:02,  6.92s/it]                                                       {'loss': 0.0166, 'grad_norm': 6.328148365020752, 'learning_rate': 1.4338078149647061e-05, 'epoch': 30.18}
- 60%|██████    | 3501/5800 [9:51:17<4:25:02,  6.92s/it]score1 tensor([[0.5586],
-        [0.4453],
-        [0.5742],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4492, 0.5664, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:37:54,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 18:37:54,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.27 | bwd_microstep: 4644.23 | bwd_inner_microstep: 4639.57 | bwd_allreduce_microstep: 4.56 | step_microstep: 43.12
-[2025-01-25 18:37:54,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.23 | bwd: 4644.25 | bwd_inner: 4639.57 | bwd_allreduce: 4.61 | step: 43.12
- 60%|██████    | 3502/5800 [9:51:24<4:24:55,  6.92s/it]                                                       {'loss': 0.0127, 'grad_norm': 4.608822345733643, 'learning_rate': 1.4327367781722025e-05, 'epoch': 30.19}
- 60%|██████    | 3502/5800 [9:51:24<4:24:55,  6.92s/it]score1 tensor([[0.5273],
-        [0.5938],
-        [0.6172],
-        [0.2988]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5781, 0.6094, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:01,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:38:01,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.48 | bwd_microstep: 4641.09 | bwd_inner_microstep: 4636.04 | bwd_allreduce_microstep: 4.95 | step_microstep: 40.70
-[2025-01-25 18:38:01,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.43 | bwd: 4641.12 | bwd_inner: 4636.04 | bwd_allreduce: 5.00 | step: 40.71
- 60%|██████    | 3503/5800 [9:51:31<4:24:50,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 5.138838768005371, 'learning_rate': 1.4316659182622139e-05, 'epoch': 30.2}
- 60%|██████    | 3503/5800 [9:51:31<4:24:50,  6.92s/it]score1 tensor([[0.5312],
-        [0.5273],
-        [0.3633],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5195, 0.3652, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:08,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.31 | optimizer_step: 4.37
-[2025-01-25 18:38:08,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.69 | bwd_microstep: 4634.26 | bwd_inner_microstep: 4629.46 | bwd_allreduce_microstep: 4.71 | step_microstep: 33.46
-[2025-01-25 18:38:08,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.66 | bwd: 4634.28 | bwd_inner: 4629.46 | bwd_allreduce: 4.75 | step: 33.47
- 60%|██████    | 3504/5800 [9:51:38<4:24:28,  6.91s/it]                                                       {'loss': 0.0132, 'grad_norm': 0.6440286040306091, 'learning_rate': 1.4305952355686535e-05, 'epoch': 30.21}
- 60%|██████    | 3504/5800 [9:51:38<4:24:28,  6.91s/it]score1 tensor([[0.4688],
-        [0.4727],
-        [0.4219],
-        [0.3301]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4746, 0.4297, 0.3398], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:15,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 18:38:15,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.92 | bwd_microstep: 4642.04 | bwd_inner_microstep: 4637.11 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.61
-[2025-01-25 18:38:15,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.90 | bwd: 4642.07 | bwd_inner: 4637.11 | bwd_allreduce: 4.88 | step: 45.61
- 60%|██████    | 3505/5800 [9:51:45<4:24:30,  6.92s/it]                                                       {'loss': 0.0078, 'grad_norm': 3.532884359359741, 'learning_rate': 1.4295247304253786e-05, 'epoch': 30.22}
- 60%|██████    | 3505/5800 [9:51:45<4:24:30,  6.92s/it]score1 tensor([[0.4199],
-        [0.4844],
-        [0.4453],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.5000, 0.4648, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:22,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:38:22,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.22 | bwd_microstep: 4640.35 | bwd_inner_microstep: 4636.13 | bwd_allreduce_microstep: 4.15 | step_microstep: 42.07
-[2025-01-25 18:38:22,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.17 | bwd: 4640.38 | bwd_inner: 4636.13 | bwd_allreduce: 4.18 | step: 42.08
- 60%|██████    | 3506/5800 [9:51:52<4:24:23,  6.92s/it]                                                       {'loss': 0.0205, 'grad_norm': 7.669388294219971, 'learning_rate': 1.4284544031661913e-05, 'epoch': 30.22}
- 60%|██████    | 3506/5800 [9:51:52<4:24:23,  6.92s/it]score1 tensor([[0.5547],
-        [0.5000],
-        [0.4922],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.5039, 0.5078, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:29,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 18:38:29,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4633.37 | bwd_inner_microstep: 4628.47 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.41
-[2025-01-25 18:38:29,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.45 | bwd: 4633.39 | bwd_inner: 4628.47 | bwd_allreduce: 4.84 | step: 42.42
- 60%|██████    | 3507/5800 [9:51:59<4:24:14,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 8.106977462768555, 'learning_rate': 1.4273842541248384e-05, 'epoch': 30.23}
- 60%|██████    | 3507/5800 [9:51:59<4:24:14,  6.91s/it]score1 tensor([[0.6211],
-        [0.5664],
-        [0.4824],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5508, 0.5117, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:36,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 18:38:36,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.41 | bwd_microstep: 4635.13 | bwd_inner_microstep: 4630.49 | bwd_allreduce_microstep: 4.55 | step_microstep: 40.85
-[2025-01-25 18:38:36,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.38 | bwd: 4635.16 | bwd_inner: 4630.49 | bwd_allreduce: 4.59 | step: 40.86
- 60%|██���███    | 3508/5800 [9:52:06<4:24:03,  6.91s/it]                                                       {'loss': 0.02, 'grad_norm': 0.5606990456581116, 'learning_rate': 1.4263142836350107e-05, 'epoch': 30.24}
- 60%|██████    | 3508/5800 [9:52:06<4:24:03,  6.91s/it]score1 tensor([[0.6016],
-        [0.6289],
-        [0.4707],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6562, 0.4941, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:43,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:38:43,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.93 | bwd_microstep: 4642.46 | bwd_inner_microstep: 4634.34 | bwd_allreduce_microstep: 8.04 | step_microstep: 41.93
-[2025-01-25 18:38:43,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4642.48 | bwd_inner: 4634.34 | bwd_allreduce: 8.07 | step: 41.94
- 60%|██████    | 3509/5800 [9:52:13<4:24:03,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 8.525040626525879, 'learning_rate': 1.4252444920303438e-05, 'epoch': 30.25}
- 60%|██████    | 3509/5800 [9:52:13<4:24:03,  6.92s/it]score1 tensor([[0.5391],
-        [0.4316],
-        [0.3730],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4473, 0.3672, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:50,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 18:38:50,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.61 | bwd_microstep: 4635.28 | bwd_inner_microstep: 4630.27 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.63
-[2025-01-25 18:38:50,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.58 | bwd: 4635.31 | bwd_inner: 4630.28 | bwd_allreduce: 4.97 | step: 43.64
- 61%|██████    | 3510/5800 [9:52:20<4:23:56,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 4.613121032714844, 'learning_rate': 1.4241748796444175e-05, 'epoch': 30.26}
- 61%|██████    | 3510/5800 [9:52:20<4:23:56,  6.92s/it]score1 tensor([[0.4141],
-        [0.4199],
-        [0.5039],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4180, 0.5078, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:38:57,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 18:38:57,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.57 | bwd_microstep: 4641.59 | bwd_inner_microstep: 4636.93 | bwd_allreduce_microstep: 4.55 | step_microstep: 45.83
-[2025-01-25 18:38:57,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4641.61 | bwd_inner: 4636.93 | bwd_allreduce: 4.61 | step: 45.84
- 61%|██████    | 3511/5800 [9:52:27<4:23:57,  6.92s/it]                                                       {'loss': 0.0054, 'grad_norm': 4.127998352050781, 'learning_rate': 1.4231054468107543e-05, 'epoch': 30.27}
- 61%|██████    | 3511/5800 [9:52:27<4:23:57,  6.92s/it]score1 tensor([[0.3730],
-        [0.5312],
-        [0.4727],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4980, 0.4629, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:03,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 18:39:03,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.82 | bwd_microstep: 4643.63 | bwd_inner_microstep: 4638.70 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.15
-[2025-01-25 18:39:03,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.79 | bwd: 4643.69 | bwd_inner: 4638.70 | bwd_allreduce: 4.86 | step: 42.17
- 61%|██████    | 3512/5800 [9:52:33<4:23:54,  6.92s/it]                                                       {'loss': 0.0234, 'grad_norm': 0.4049108028411865, 'learning_rate': 1.4220361938628236e-05, 'epoch': 30.28}
- 61%|██████    | 3512/5800 [9:52:33<4:23:54,  6.92s/it]score1 tensor([[0.4453],
-        [0.5430],
-        [0.5586],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.5312, 0.5508, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:10,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 18:39:10,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.60 | bwd_microstep: 4645.89 | bwd_inner_microstep: 4638.85 | bwd_allreduce_microstep: 6.94 | step_microstep: 41.29
-[2025-01-25 18:39:10,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.57 | bwd: 4645.92 | bwd_inner: 4638.85 | bwd_allreduce: 6.99 | step: 41.29
- 61%|██████    | 3513/5800 [9:52:40<4:23:46,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 4.385627269744873, 'learning_rate': 1.4209671211340354e-05, 'epoch': 30.28}
- 61%|██████    | 3513/5800 [9:52:40<4:23:46,  6.92s/it]score1 tensor([[0.5000],
-        [0.5039],
-        [0.4941],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5000, 0.4961, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:17,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:39:17,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4637.30 | bwd_inner_microstep: 4632.85 | bwd_allreduce_microstep: 4.38 | step_microstep: 46.16
-[2025-01-25 18:39:17,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.33 | bwd: 4637.33 | bwd_inner: 4632.85 | bwd_allreduce: 4.41 | step: 46.17
- 61%|██████    | 3514/5800 [9:52:47<4:23:37,  6.92s/it]                                                       {'loss': 0.0068, 'grad_norm': 4.4821953773498535, 'learning_rate': 1.419898228957747e-05, 'epoch': 30.29}
- 61%|██████    | 3514/5800 [9:52:47<4:23:37,  6.92s/it]score1 tensor([[0.6836],
-        [0.4297],
-        [0.4219],
-        [0.3516]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.4160, 0.3984, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:24,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:39:24,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.40 | bwd_microstep: 4638.67 | bwd_inner_microstep: 4634.06 | bwd_allreduce_microstep: 4.52 | step_microstep: 41.48
-[2025-01-25 18:39:24,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.37 | bwd: 4638.69 | bwd_inner: 4634.06 | bwd_allreduce: 4.56 | step: 41.49
- 61%|██████    | 3515/5800 [9:52:54<4:23:27,  6.92s/it]                                                       {'loss': 0.0195, 'grad_norm': 4.449418067932129, 'learning_rate': 1.4188295176672559e-05, 'epoch': 30.3}
- 61%|██████    | 3515/5800 [9:52:54<4:23:27,  6.92s/it]score1 tensor([[0.4531],
-        [0.3594],
-        [0.4355],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3418, 0.4453, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:31,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 18:39:31,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.33 | bwd_microstep: 4635.23 | bwd_inner_microstep: 4630.34 | bwd_allreduce_microstep: 4.81 | step_microstep: 46.79
-[2025-01-25 18:39:31,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.29 | bwd: 4635.26 | bwd_inner: 4630.34 | bwd_allreduce: 4.84 | step: 46.80
- 61%|██████    | 3516/5800 [9:53:01<4:23:27,  6.92s/it]                                                       {'loss': 0.0103, 'grad_norm': 0.5416978597640991, 'learning_rate': 1.4177609875958051e-05, 'epoch': 30.31}
- 61%|██████    | 3516/5800 [9:53:01<4:23:27,  6.92s/it]score1 tensor([[0.6367],
-        [0.5117],
-        [0.6055],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.4941, 0.6211, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:38,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 18:39:38,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.45 | bwd_microstep: 4637.26 | bwd_inner_microstep: 4632.35 | bwd_allreduce_microstep: 4.82 | step_microstep: 48.04
-[2025-01-25 18:39:38,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.40 | bwd: 4637.29 | bwd_inner: 4632.35 | bwd_allreduce: 4.87 | step: 48.05
- 61%|██████    | 3517/5800 [9:53:08<4:23:27,  6.92s/it]                                                       {'loss': 0.0171, 'grad_norm': 1.0151076316833496, 'learning_rate': 1.4166926390765819e-05, 'epoch': 30.32}
- 61%|██████    | 3517/5800 [9:53:08<4:23:27,  6.92s/it]score1 tensor([[0.4785],
-        [0.4082],
-        [0.4551],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4004, 0.4766, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:45,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 18:39:45,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4648.59 | bwd_inner_microstep: 4643.90 | bwd_allreduce_microstep: 4.59 | step_microstep: 43.47
-[2025-01-25 18:39:45,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.79 | bwd: 4648.61 | bwd_inner: 4643.90 | bwd_allreduce: 4.64 | step: 43.48
- 61%|██████    | 3518/5800 [9:53:15<4:23:27,  6.93s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.000509738922119, 'learning_rate': 1.4156244724427138e-05, 'epoch': 30.33}
- 61%|██████    | 3518/5800 [9:53:15<4:23:27,  6.93s/it]score1 tensor([[0.4648],
-        [0.5391],
-        [0.4375],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5508, 0.4316, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:52,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 18:39:52,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.07 | bwd_microstep: 4635.34 | bwd_inner_microstep: 4630.58 | bwd_allreduce_microstep: 4.65 | step_microstep: 44.71
-[2025-01-25 18:39:52,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.04 | bwd: 4635.36 | bwd_inner: 4630.58 | bwd_allreduce: 4.71 | step: 44.73
- 61%|██████    | 3519/5800 [9:53:22<4:23:16,  6.93s/it]                                                       {'loss': 0.0083, 'grad_norm': 0.6395413875579834, 'learning_rate': 1.4145564880272762e-05, 'epoch': 30.34}
- 61%|██████    | 3519/5800 [9:53:22<4:23:16,  6.93s/it]score1 tensor([[0.4746],
-        [0.5156],
-        [0.5742],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5039, 0.5664, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:39:59,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:39:59,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.14 | bwd_microstep: 4638.09 | bwd_inner_microstep: 4633.31 | bwd_allreduce_microstep: 4.69 | step_microstep: 41.07
-[2025-01-25 18:39:59,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.10 | bwd: 4638.12 | bwd_inner: 4633.31 | bwd_allreduce: 4.74 | step: 41.07
- 61%|██████    | 3520/5800 [9:53:29<4:23:05,  6.92s/it]                                                       {'loss': 0.0112, 'grad_norm': 4.059523105621338, 'learning_rate': 1.4134886861632828e-05, 'epoch': 30.34}
- 61%|██████    | 3520/5800 [9:53:29<4:23:05,  6.92s/it]score1 tensor([[0.5078],
-        [0.6328],
-        [0.4648],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.6172, 0.4844, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:06,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 18:40:06,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.71 | bwd_microstep: 4643.93 | bwd_inner_microstep: 4639.21 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.72
-[2025-01-25 18:40:06,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.67 | bwd: 4643.95 | bwd_inner: 4639.20 | bwd_allreduce: 4.67 | step: 43.73
- 61%|██████    | 3521/5800 [9:53:36<4:23:06,  6.93s/it]                                                       {'loss': 0.0156, 'grad_norm': 3.680318593978882, 'learning_rate': 1.412421067183695e-05, 'epoch': 30.35}
- 61%|██████    | 3521/5800 [9:53:36<4:23:06,  6.93s/it]score1 tensor([[0.4316],
-        [0.5195],
-        [0.5625],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5391, 0.5781, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:13,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 18:40:13,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.18 | bwd_microstep: 4646.29 | bwd_inner_microstep: 4641.73 | bwd_allreduce_microstep: 4.48 | step_microstep: 48.16
-[2025-01-25 18:40:13,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.14 | bwd: 4646.31 | bwd_inner: 4641.73 | bwd_allreduce: 4.52 | step: 48.17
- 61%|██████    | 3522/5800 [9:53:43<4:23:05,  6.93s/it]                                                       {'loss': 0.022, 'grad_norm': 7.938770771026611, 'learning_rate': 1.4113536314214136e-05, 'epoch': 30.36}
- 61%|██████    | 3522/5800 [9:53:43<4:23:05,  6.93s/it]score1 tensor([[0.3457],
-        [0.4961],
-        [0.5039],
-        [0.2129]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.5000, 0.5039, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0110, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:20,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 18:40:20,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.34 | bwd_microstep: 4590.57 | bwd_inner_microstep: 4585.85 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.01
-[2025-01-25 18:40:20,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.30 | bwd: 4590.60 | bwd_inner: 4585.85 | bwd_allreduce: 4.67 | step: 42.01
- 61%|██████    | 3523/5800 [9:53:50<4:22:20,  6.91s/it]                                                       {'loss': 0.011, 'grad_norm': 2.7029337882995605, 'learning_rate': 1.4102863792092834e-05, 'epoch': 30.37}
- 61%|██████    | 3523/5800 [9:53:50<4:22:20,  6.91s/it]score1 tensor([[0.4355],
-        [0.5273],
-        [0.4160],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.5273, 0.4160, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:26,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.28 | optimizer_step: 4.37
-[2025-01-25 18:40:26,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.40 | bwd_microstep: 4541.09 | bwd_inner_microstep: 4537.36 | bwd_allreduce_microstep: 3.67 | step_microstep: 39.02
-[2025-01-25 18:40:26,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.34 | bwd: 4541.11 | bwd_inner: 4537.36 | bwd_allreduce: 3.69 | step: 39.02
- 61%|██████    | 3524/5800 [9:53:56<4:21:03,  6.88s/it]                                                       {'loss': 0.0029, 'grad_norm': 3.928431749343872, 'learning_rate': 1.4092193108800926e-05, 'epoch': 30.38}
- 61%|██████    | 3524/5800 [9:53:56<4:21:03,  6.88s/it]score1 tensor([[0.4355],
-        [0.5391],
-        [0.3906],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5469, 0.4004, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:33,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 18:40:33,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.86 | bwd_microstep: 4640.04 | bwd_inner_microstep: 4634.88 | bwd_allreduce_microstep: 5.06 | step_microstep: 47.41
-[2025-01-25 18:40:33,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.84 | bwd: 4640.06 | bwd_inner: 4634.88 | bwd_allreduce: 5.10 | step: 47.42
- 61%|██████    | 3525/5800 [9:54:03<4:21:21,  6.89s/it]                                                       {'loss': 0.0083, 'grad_norm': 3.730240821838379, 'learning_rate': 1.4081524267665721e-05, 'epoch': 30.39}
- 61%|██████    | 3525/5800 [9:54:03<4:21:21,  6.89s/it]score1 tensor([[0.3750],
-        [0.5000],
-        [0.3535],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4941, 0.3477, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:40,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 18:40:40,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.58 | bwd_microstep: 4639.61 | bwd_inner_microstep: 4634.96 | bwd_allreduce_microstep: 4.57 | step_microstep: 41.54
-[2025-01-25 18:40:40,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4639.63 | bwd_inner: 4634.96 | bwd_allreduce: 4.61 | step: 41.54
- 61%|██████    | 3526/5800 [9:54:10<4:21:32,  6.90s/it]                                                       {'loss': 0.0078, 'grad_norm': 0.4255991280078888, 'learning_rate': 1.407085727201394e-05, 'epoch': 30.4}
- 61%|██████    | 3526/5800 [9:54:10<4:21:32,  6.90s/it]score1 tensor([[0.5859],
-        [0.4902],
-        [0.3906],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.4785, 0.3926, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:47,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 18:40:47,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4637.92 | bwd_inner_microstep: 4633.33 | bwd_allreduce_microstep: 4.50 | step_microstep: 42.73
-[2025-01-25 18:40:47,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4637.94 | bwd_inner: 4633.33 | bwd_allreduce: 4.54 | step: 42.74
- 61%|██████    | 3527/5800 [9:54:17<4:21:39,  6.91s/it]                                                       {'loss': 0.0073, 'grad_norm': 3.809727191925049, 'learning_rate': 1.4060192125171738e-05, 'epoch': 30.41}
- 61%|██████    | 3527/5800 [9:54:17<4:21:39,  6.91s/it]score1 tensor([[0.5117],
-        [0.4043],
-        [0.5781],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4180, 0.5781, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:40:54,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 18:40:54,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.58 | bwd_microstep: 4580.14 | bwd_inner_microstep: 4575.85 | bwd_allreduce_microstep: 4.21 | step_microstep: 41.30
-[2025-01-25 18:40:54,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.54 | bwd: 4580.17 | bwd_inner: 4575.85 | bwd_allreduce: 4.25 | step: 41.31
- 61%|██████    | 3528/5800 [9:54:24<4:21:01,  6.89s/it]                                                       {'loss': 0.0054, 'grad_norm': 5.975624084472656, 'learning_rate': 1.4049528830464705e-05, 'epoch': 30.41}
- 61%|██████    | 3528/5800 [9:54:24<4:21:01,  6.89s/it]score1 tensor([[0.5938],
-        [0.6211],
-        [0.4277],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6094, 0.4277, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:01,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 18:41:01,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.39 | bwd_microstep: 4586.73 | bwd_inner_microstep: 4581.69 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.61
-[2025-01-25 18:41:01,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.35 | bwd: 4586.75 | bwd_inner: 4581.69 | bwd_allreduce: 4.97 | step: 43.62
- 61%|██████    | 3529/5800 [9:54:31<4:20:39,  6.89s/it]                                                       {'loss': 0.0078, 'grad_norm': 2.2812867164611816, 'learning_rate': 1.403886739121783e-05, 'epoch': 30.42}
- 61%|██████    | 3529/5800 [9:54:31<4:20:39,  6.89s/it]score1 tensor([[0.4941],
-        [0.6367],
-        [0.5586],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6016, 0.5625, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:08,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 18:41:08,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.87 | bwd_microstep: 4639.08 | bwd_inner_microstep: 4634.25 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.26
-[2025-01-25 18:41:08,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.83 | bwd: 4639.10 | bwd_inner: 4634.25 | bwd_allreduce: 4.78 | step: 43.27
- 61%|���█████    | 3530/5800 [9:54:38<4:21:00,  6.90s/it]                                                       {'loss': 0.0137, 'grad_norm': 0.6523404717445374, 'learning_rate': 1.402820781075553e-05, 'epoch': 30.43}
- 61%|██████    | 3530/5800 [9:54:38<4:21:00,  6.90s/it]score1 tensor([[0.5117],
-        [0.4629],
-        [0.4160],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4785, 0.4043, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:15,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 18:41:15,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.90 | bwd_microstep: 4641.38 | bwd_inner_microstep: 4636.64 | bwd_allreduce_microstep: 4.63 | step_microstep: 41.11
-[2025-01-25 18:41:15,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.87 | bwd: 4641.41 | bwd_inner: 4636.64 | bwd_allreduce: 4.70 | step: 41.13
- 61%|██████    | 3531/5800 [9:54:45<4:21:08,  6.91s/it]                                                       {'loss': 0.0171, 'grad_norm': 3.6778266429901123, 'learning_rate': 1.4017550092401661e-05, 'epoch': 30.44}
- 61%|██████    | 3531/5800 [9:54:45<4:21:08,  6.91s/it]score1 tensor([[0.4980],
-        [0.4355],
-        [0.4180],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.3262, 0.4062, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0425, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:22,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.99 | optimizer_step: 4.36
-[2025-01-25 18:41:22,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.54 | bwd_microstep: 4578.43 | bwd_inner_microstep: 4573.91 | bwd_allreduce_microstep: 4.45 | step_microstep: 44.55
-[2025-01-25 18:41:22,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.50 | bwd: 4578.46 | bwd_inner: 4573.91 | bwd_allreduce: 4.48 | step: 44.56
- 61%|██████    | 3532/5800 [9:54:52<4:20:30,  6.89s/it]                                                       {'loss': 0.0425, 'grad_norm': 5.669289588928223, 'learning_rate': 1.4006894239479474e-05, 'epoch': 30.45}
- 61%|██████    | 3532/5800 [9:54:52<4:20:30,  6.89s/it]score1 tensor([[0.6328],
-        [0.4785],
-        [0.4766],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4824, 0.4668, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:29,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 18:41:29,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.22 | bwd_microstep: 4632.27 | bwd_inner_microstep: 4626.11 | bwd_allreduce_microstep: 6.07 | step_microstep: 44.80
-[2025-01-25 18:41:29,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.19 | bwd: 4632.30 | bwd_inner: 4626.11 | bwd_allreduce: 6.12 | step: 44.81
- 61%|██████    | 3533/5800 [9:54:58<4:20:38,  6.90s/it]                                                       {'loss': 0.0142, 'grad_norm': 4.595768928527832, 'learning_rate': 1.3996240255311657e-05, 'epoch': 30.46}
- 61%|██████    | 3533/5800 [9:54:58<4:20:38,  6.90s/it]score1 tensor([[0.4629],
-        [0.5508],
-        [0.5195],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5430, 0.5352, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:35,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 18:41:35,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.90 | bwd_microstep: 4632.76 | bwd_inner_microstep: 4627.98 | bwd_allreduce_microstep: 4.70 | step_microstep: 47.27
-[2025-01-25 18:41:35,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.86 | bwd: 4632.78 | bwd_inner: 4627.98 | bwd_allreduce: 4.73 | step: 47.27
- 61%|██████    | 3534/5800 [9:55:05<4:20:42,  6.90s/it]                                                       {'loss': 0.0117, 'grad_norm': 3.7620785236358643, 'learning_rate': 1.3985588143220303e-05, 'epoch': 30.47}
- 61%|██████    | 3534/5800 [9:55:05<4:20:42,  6.90s/it]score1 tensor([[0.4629],
-        [0.4766],
-        [0.5430],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4980, 0.5469, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:42,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 18:41:42,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.21 | bwd_microstep: 4631.26 | bwd_inner_microstep: 4625.89 | bwd_allreduce_microstep: 5.26 | step_microstep: 42.95
-[2025-01-25 18:41:42,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4631.28 | bwd_inner: 4625.89 | bwd_allreduce: 5.30 | step: 42.97
- 61%|██████    | 3535/5800 [9:55:12<4:20:50,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.7216944694519043, 'learning_rate': 1.3974937906526917e-05, 'epoch': 30.47}
- 61%|██████    | 3535/5800 [9:55:12<4:20:50,  6.91s/it]score1 tensor([[0.5273],
-        [0.6289],
-        [0.5352],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6172, 0.5508, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:49,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:41:49,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.27 | bwd_microstep: 4634.61 | bwd_inner_microstep: 4630.02 | bwd_allreduce_microstep: 4.50 | step_microstep: 51.52
-[2025-01-25 18:41:49,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.23 | bwd: 4634.63 | bwd_inner: 4630.02 | bwd_allreduce: 4.55 | step: 51.53
- 61%|██████    | 3536/5800 [9:55:19<4:20:57,  6.92s/it]                                                       {'loss': 0.0229, 'grad_norm': 0.6930527687072754, 'learning_rate': 1.3964289548552446e-05, 'epoch': 30.48}
- 61%|██████    | 3536/5800 [9:55:19<4:20:57,  6.92s/it]score1 tensor([[0.5391],
-        [0.4434],
-        [0.6680],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4844, 0.6797, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0288, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:41:56,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 18:41:56,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.04 | bwd_microstep: 4633.49 | bwd_inner_microstep: 4628.32 | bwd_allreduce_microstep: 5.04 | step_microstep: 41.51
-[2025-01-25 18:41:56,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.00 | bwd: 4633.51 | bwd_inner: 4628.33 | bwd_allreduce: 5.10 | step: 41.51
- 61%|██████    | 3537/5800 [9:55:26<4:20:49,  6.92s/it]                                                       {'loss': 0.0288, 'grad_norm': 8.603842735290527, 'learning_rate': 1.3953643072617211e-05, 'epoch': 30.49}
- 61%|██████    | 3537/5800 [9:55:26<4:20:49,  6.92s/it]score1 tensor([[0.3418],
-        [0.5391],
-        [0.5547],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.5664, 0.5664, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:03,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 18:42:03,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.96 | bwd_microstep: 4631.95 | bwd_inner_microstep: 4626.78 | bwd_allreduce_microstep: 5.05 | step_microstep: 47.86
-[2025-01-25 18:42:03,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.92 | bwd: 4631.98 | bwd_inner: 4626.78 | bwd_allreduce: 5.11 | step: 47.87
- 61%|██████    | 3538/5800 [9:55:33<4:20:41,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 7.76596212387085, 'learning_rate': 1.394299848204099e-05, 'epoch': 30.5}
- 61%|██████    | 3538/5800 [9:55:33<4:20:41,  6.92s/it]score1 tensor([[0.4766],
-        [0.5430],
-        [0.3867],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5352, 0.3906, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:10,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 18:42:10,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.98 | bwd_microstep: 4635.76 | bwd_inner_microstep: 4630.89 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.98
-[2025-01-25 18:42:10,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4635.79 | bwd_inner: 4630.89 | bwd_allreduce: 4.83 | step: 42.99
- 61%|██████    | 3539/5800 [9:55:40<4:20:36,  6.92s/it]                                                       {'loss': 0.0068, 'grad_norm': 0.8855499029159546, 'learning_rate': 1.3932355780142932e-05, 'epoch': 30.51}
- 61%|██████    | 3539/5800 [9:55:40<4:20:36,  6.92s/it]score1 tensor([[0.6250],
-        [0.4297],
-        [0.5625],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6328, 0.4473, 0.5547, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:17,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.26 | optimizer_step: 4.37
-[2025-01-25 18:42:17,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.16 | bwd_microstep: 4626.83 | bwd_inner_microstep: 4621.83 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.55
-[2025-01-25 18:42:17,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.13 | bwd: 4626.86 | bwd_inner: 4621.83 | bwd_allreduce: 4.95 | step: 43.56
- 61%|██████    | 3540/5800 [9:55:47<4:20:26,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 0.3795850872993469, 'learning_rate': 1.392171497024163e-05, 'epoch': 30.52}
- 61%|██████    | 3540/5800 [9:55:47<4:20:26,  6.91s/it]score1 tensor([[0.5508],
-        [0.4336],
-        [0.7227],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4297, 0.6875, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:24,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 18:42:24,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4639.96 | bwd_inner_microstep: 4634.95 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.15
-[2025-01-25 18:42:24,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4639.99 | bwd_inner: 4634.95 | bwd_allreduce: 4.96 | step: 45.16
- 61%|██████    | 3541/5800 [9:55:54<4:20:23,  6.92s/it]                                                       {'loss': 0.0156, 'grad_norm': 4.200186729431152, 'learning_rate': 1.3911076055655066e-05, 'epoch': 30.53}
- 61%|██████    | 3541/5800 [9:55:54<4:20:23,  6.92s/it]score1 tensor([[0.5742],
-        [0.4941],
-        [0.4805],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5039, 0.5039, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:31,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 18:42:31,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.15 | bwd_microstep: 4636.17 | bwd_inner_microstep: 4631.42 | bwd_allreduce_microstep: 4.65 | step_microstep: 41.90
-[2025-01-25 18:42:31,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4636.20 | bwd_inner: 4631.42 | bwd_allreduce: 4.70 | step: 41.91
- 61%|██████    | 3542/5800 [9:56:01<4:20:17,  6.92s/it]                                                       {'loss': 0.0103, 'grad_norm': 4.190493106842041, 'learning_rate': 1.3900439039700641e-05, 'epoch': 30.53}
- 61%|██████    | 3542/5800 [9:56:01<4:20:17,  6.92s/it]score1 tensor([[0.5234],
-        [0.4453],
-        [0.4805],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4512, 0.4980, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:38,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.17 | optimizer_step: 4.37
-[2025-01-25 18:42:38,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.66 | bwd_microstep: 4634.95 | bwd_inner_microstep: 4629.78 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.31
-[2025-01-25 18:42:38,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.62 | bwd: 4634.98 | bwd_inner: 4629.78 | bwd_allreduce: 5.12 | step: 43.32
- 61%|██████    | 3543/5800 [9:56:08<4:20:13,  6.92s/it]                                                       {'loss': 0.0166, 'grad_norm': 7.901347637176514, 'learning_rate': 1.3889803925695158e-05, 'epoch': 30.54}
- 61%|██████    | 3543/5800 [9:56:08<4:20:13,  6.92s/it]score1 tensor([[0.6055],
-        [0.4727],
-        [0.5547],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4902, 0.5625, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:45,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 18:42:45,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.23 | bwd_microstep: 4630.13 | bwd_inner_microstep: 4625.19 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.58
-[2025-01-25 18:42:45,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.19 | bwd: 4630.15 | bwd_inner: 4625.19 | bwd_allreduce: 4.90 | step: 42.59
- 61%|██████    | 3544/5800 [9:56:15<4:20:09,  6.92s/it]                                                       {'loss': 0.0132, 'grad_norm': 4.586091041564941, 'learning_rate': 1.3879170716954828e-05, 'epoch': 30.55}
- 61%|██████    | 3544/5800 [9:56:15<4:20:09,  6.92s/it]score1 tensor([[0.5039],
-        [0.5430],
-        [0.5820],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5352, 0.6094, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:52,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:42:52,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.80 | bwd_microstep: 4631.67 | bwd_inner_microstep: 4627.11 | bwd_allreduce_microstep: 4.49 | step_microstep: 46.08
-[2025-01-25 18:42:52,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.77 | bwd: 4631.70 | bwd_inner: 4627.11 | bwd_allreduce: 4.52 | step: 46.08
- 61%|██████    | 3545/5800 [9:56:22<4:20:01,  6.92s/it]                                                       {'loss': 0.0117, 'grad_norm': 4.253632068634033, 'learning_rate': 1.3868539416795284e-05, 'epoch': 30.56}
- 61%|██████    | 3545/5800 [9:56:22<4:20:01,  6.92s/it]score1 tensor([[0.5000],
-        [0.5039],
-        [0.5234],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5078, 0.5195, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:42:58,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:42:58,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4633.11 | bwd_inner_microstep: 4628.20 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.99
-[2025-01-25 18:42:58,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.27 | bwd: 4633.13 | bwd_inner: 4628.21 | bwd_allreduce: 4.85 | step: 42.99
- 61%|██████    | 3546/5800 [9:56:28<4:19:51,  6.92s/it]                                                       {'loss': 0.0078, 'grad_norm': 0.34370067715644836, 'learning_rate': 1.3857910028531538e-05, 'epoch': 30.57}
- 61%|██████    | 3546/5800 [9:56:28<4:19:51,  6.92s/it]score1 tensor([[0.3945],
-        [0.5391],
-        [0.4551],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5508, 0.4473, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:05,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:43:05,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.28 | bwd_microstep: 4631.88 | bwd_inner_microstep: 4627.24 | bwd_allreduce_microstep: 4.56 | step_microstep: 43.57
-[2025-01-25 18:43:05,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.24 | bwd: 4631.90 | bwd_inner: 4627.23 | bwd_allreduce: 4.60 | step: 43.59
- 61%|██████    | 3547/5800 [9:56:35<4:19:38,  6.91s/it]                                                       {'loss': 0.0127, 'grad_norm': 3.543377637863159, 'learning_rate': 1.3847282555478013e-05, 'epoch': 30.58}
- 61%|██████    | 3547/5800 [9:56:35<4:19:38,  6.91s/it]score1 tensor([[0.4121],
-        [0.5391],
-        [0.4434],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5156, 0.4199, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:12,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:43:12,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.40 | bwd_microstep: 4633.06 | bwd_inner_microstep: 4628.17 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.74
-[2025-01-25 18:43:12,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.36 | bwd: 4633.08 | bwd_inner: 4628.17 | bwd_allreduce: 4.85 | step: 43.74
- 61%|██████    | 3548/5800 [9:56:42<4:19:28,  6.91s/it]                                                       {'loss': 0.0269, 'grad_norm': 7.962711811065674, 'learning_rate': 1.3836657000948553e-05, 'epoch': 30.59}
- 61%|██████    | 3548/5800 [9:56:42<4:19:28,  6.91s/it]score1 tensor([[0.4824],
-        [0.4805],
-        [0.4180],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4668, 0.3750, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:19,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:43:19,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.20 | bwd_microstep: 4641.80 | bwd_inner_microstep: 4636.53 | bwd_allreduce_microstep: 5.13 | step_microstep: 44.04
-[2025-01-25 18:43:19,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4641.83 | bwd_inner: 4636.53 | bwd_allreduce: 5.22 | step: 44.05
- 61%|██████    | 3549/5800 [9:56:49<4:19:34,  6.92s/it]                                                       {'loss': 0.0269, 'grad_norm': 7.662116050720215, 'learning_rate': 1.3826033368256376e-05, 'epoch': 30.59}
- 61%|██████    | 3549/5800 [9:56:49<4:19:34,  6.92s/it]score1 tensor([[0.4004],
-        [0.4707],
-        [0.6914],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4648, 0.6797, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:26,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 18:43:26,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4630.43 | bwd_inner_microstep: 4625.35 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.32
-[2025-01-25 18:43:26,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.17 | bwd: 4630.46 | bwd_inner: 4625.35 | bwd_allreduce: 5.02 | step: 42.32
- 61%|██████    | 3550/5800 [9:56:56<4:19:21,  6.92s/it]                                                       {'loss': 0.019, 'grad_norm': 8.024531364440918, 'learning_rate': 1.381541166071413e-05, 'epoch': 30.6}
- 61%|██████    | 3550/5800 [9:56:56<4:19:21,  6.92s/it]score1 tensor([[0.4590],
-        [0.5000],
-        [0.4551],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4922, 0.4551, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:33,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 18:43:33,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.94 | bwd_microstep: 4580.60 | bwd_inner_microstep: 4576.02 | bwd_allreduce_microstep: 4.50 | step_microstep: 43.19
-[2025-01-25 18:43:33,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.91 | bwd: 4580.63 | bwd_inner: 4576.02 | bwd_allreduce: 4.54 | step: 43.20
- 61%|██████    | 3551/5800 [9:57:03<4:18:38,  6.90s/it]                                                       {'loss': 0.0083, 'grad_norm': 2.094508409500122, 'learning_rate': 1.3804791881633832e-05, 'epoch': 30.61}
- 61%|██████    | 3551/5800 [9:57:03<4:18:38,  6.90s/it]score1 tensor([[0.5352],
-        [0.5859],
-        [0.4980],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6172, 0.5352, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:40,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 18:43:40,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.21 | bwd_microstep: 4641.59 | bwd_inner_microstep: 4636.37 | bwd_allreduce_microstep: 5.12 | step_microstep: 49.33
-[2025-01-25 18:43:40,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4641.62 | bwd_inner: 4636.37 | bwd_allreduce: 5.17 | step: 49.34
- 61%|██████    | 3552/5800 [9:57:10<4:18:46,  6.91s/it]                                                       {'loss': 0.022, 'grad_norm': 0.5080738067626953, 'learning_rate': 1.379417403432693e-05, 'epoch': 30.62}
- 61%|██████    | 3552/5800 [9:57:10<4:18:46,  6.91s/it]score1 tensor([[0.4668],
-        [0.3496],
-        [0.4785],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.3223, 0.4688, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:47,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.36
-[2025-01-25 18:43:47,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.73 | bwd_microstep: 4633.67 | bwd_inner_microstep: 4628.80 | bwd_allreduce_microstep: 4.78 | step_microstep: 48.20
-[2025-01-25 18:43:47,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.69 | bwd: 4633.70 | bwd_inner: 4628.80 | bwd_allreduce: 4.82 | step: 48.21
- 61%|██████▏   | 3553/5800 [9:57:17<4:18:47,  6.91s/it]                                                       {'loss': 0.0225, 'grad_norm': 7.399120330810547, 'learning_rate': 1.3783558122104244e-05, 'epoch': 30.63}
- 61%|██████▏   | 3553/5800 [9:57:17<4:18:47,  6.91s/it]score1 tensor([[0.4707],
-        [0.5742],
-        [0.5039],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.6016, 0.4883, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:43:54,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 18:43:54,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.60 | bwd_microstep: 4646.02 | bwd_inner_microstep: 4641.04 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.69
-[2025-01-25 18:43:54,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.56 | bwd: 4646.04 | bwd_inner: 4641.04 | bwd_allreduce: 4.93 | step: 43.70
- 61%|██████▏   | 3554/5800 [9:57:24<4:18:53,  6.92s/it]                                                       {'loss': 0.022, 'grad_norm': 3.7494869232177734, 'learning_rate': 1.3772944148275996e-05, 'epoch': 30.64}
- 61%|██████▏   | 3554/5800 [9:57:24<4:18:53,  6.92s/it]score1 tensor([[0.3926],
-        [0.5430],
-        [0.5430],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.2812, 0.5391, 0.5391, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0342, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:01,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 18:44:01,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.30 | bwd_microstep: 4644.70 | bwd_inner_microstep: 4639.81 | bwd_allreduce_microstep: 4.78 | step_microstep: 44.40
-[2025-01-25 18:44:01,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.27 | bwd: 4644.72 | bwd_inner: 4639.81 | bwd_allreduce: 4.84 | step: 44.41
- 61%|██████▏   | 3555/5800 [9:57:31<4:18:51,  6.92s/it]                                                       {'loss': 0.0342, 'grad_norm': 7.798098564147949, 'learning_rate': 1.3762332116151815e-05, 'epoch': 30.65}
- 61%|██████▏   | 3555/5800 [9:57:31<4:18:51,  6.92s/it]score1 tensor([[0.5234],
-        [0.4883],
-        [0.6133],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4941, 0.6445, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:08,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 18:44:08,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.09 | bwd_microstep: 4631.80 | bwd_inner_microstep: 4626.94 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.73
-[2025-01-25 18:44:08,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.05 | bwd: 4631.83 | bwd_inner: 4626.94 | bwd_allreduce: 4.81 | step: 42.75
- 61%|██████▏   | 3556/5800 [9:57:38<4:18:36,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 4.407682418823242, 'learning_rate': 1.3751722029040707e-05, 'epoch': 30.66}
- 61%|██████▏   | 3556/5800 [9:57:38<4:18:36,  6.91s/it]score1 tensor([[0.4570],
-        [0.4961],
-        [0.3906],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4941, 0.3809, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:14,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 18:44:14,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.24 | bwd_microstep: 4642.45 | bwd_inner_microstep: 4637.75 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.05
-[2025-01-25 18:44:14,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.21 | bwd: 4642.47 | bwd_inner: 4637.75 | bwd_allreduce: 4.65 | step: 43.06
- 61%|██████▏   | 3557/5800 [9:57:44<4:18:40,  6.92s/it]                                                       {'loss': 0.0117, 'grad_norm': 0.6772646307945251, 'learning_rate': 1.3741113890251093e-05, 'epoch': 30.66}
- 61%|██████▏   | 3557/5800 [9:57:44<4:18:40,  6.92s/it]score1 tensor([[0.6484],
-        [0.4648],
-        [0.5078],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4551, 0.5391, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:21,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 18:44:21,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.34 | bwd_microstep: 4602.02 | bwd_inner_microstep: 4597.08 | bwd_allreduce_microstep: 4.84 | step_microstep: 48.21
-[2025-01-25 18:44:21,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.31 | bwd: 4602.05 | bwd_inner: 4597.08 | bwd_allreduce: 4.89 | step: 48.22
- 61%|██████▏   | 3558/5800 [9:57:51<4:18:21,  6.91s/it]                                                       {'loss': 0.0122, 'grad_norm': 2.514335870742798, 'learning_rate': 1.3730507703090763e-05, 'epoch': 30.67}
- 61%|██████▏   | 3558/5800 [9:57:51<4:18:21,  6.91s/it]score1 tensor([[0.5469],
-        [0.4082],
-        [0.4922],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4004, 0.4863, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:28,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.41 | optimizer_step: 4.37
-[2025-01-25 18:44:28,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.44 | bwd_microstep: 4647.96 | bwd_inner_microstep: 4643.07 | bwd_allreduce_microstep: 4.77 | step_microstep: 44.71
-[2025-01-25 18:44:28,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.40 | bwd: 4647.98 | bwd_inner: 4643.07 | bwd_allreduce: 4.83 | step: 44.72
- 61%|██████▏   | 3559/5800 [9:57:58<4:18:35,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 0.537018895149231, 'learning_rate': 1.3719903470866917e-05, 'epoch': 30.68}
- 61%|██████▏   | 3559/5800 [9:57:58<4:18:35,  6.92s/it]score1 tensor([[0.4648],
-        [0.4297],
-        [0.5273],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4375, 0.5430, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:35,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 18:44:35,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.17 | bwd_microstep: 4643.08 | bwd_inner_microstep: 4637.87 | bwd_allreduce_microstep: 5.09 | step_microstep: 44.91
-[2025-01-25 18:44:35,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.13 | bwd: 4643.10 | bwd_inner: 4637.87 | bwd_allreduce: 5.15 | step: 44.92
- 61%|██████▏   | 3560/5800 [9:58:05<4:18:31,  6.92s/it]                                                       {'loss': 0.0137, 'grad_norm': 4.025040626525879, 'learning_rate': 1.3709301196886146e-05, 'epoch': 30.69}
- 61%|██████▏   | 3560/5800 [9:58:05<4:18:31,  6.92s/it]score1 tensor([[0.4473],
-        [0.4902],
-        [0.5234],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4824, 0.5312, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:42,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 18:44:42,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.52 | bwd_microstep: 4643.70 | bwd_inner_microstep: 4638.76 | bwd_allreduce_microstep: 4.84 | step_microstep: 45.08
-[2025-01-25 18:44:42,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.48 | bwd: 4643.73 | bwd_inner: 4638.77 | bwd_allreduce: 4.89 | step: 45.09
- 61%|██████▏   | 3561/5800 [9:58:12<4:18:29,  6.93s/it]                                                       {'loss': 0.0112, 'grad_norm': 0.5970805287361145, 'learning_rate': 1.3698700884454405e-05, 'epoch': 30.7}
- 61%|██████▏   | 3561/5800 [9:58:12<4:18:29,  6.93s/it]score1 tensor([[0.5117],
-        [0.4102],
-        [0.3496],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4023, 0.3730, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:49,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 18:44:49,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.44 | bwd_microstep: 4642.38 | bwd_inner_microstep: 4637.67 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.62
-[2025-01-25 18:44:49,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.40 | bwd: 4642.40 | bwd_inner: 4637.67 | bwd_allreduce: 4.67 | step: 43.64
- 61%|██████▏   | 3562/5800 [9:58:19<4:18:19,  6.93s/it]                                                       {'loss': 0.0151, 'grad_norm': 3.763503313064575, 'learning_rate': 1.3688102536877077e-05, 'epoch': 30.71}
- 61%|██████▏   | 3562/5800 [9:58:19<4:18:19,  6.93s/it]score1 tensor([[0.4512],
-        [0.4219],
-        [0.5898],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4297, 0.6016, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:44:56,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 18:44:56,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.07 | bwd_microstep: 4638.20 | bwd_inner_microstep: 4633.40 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.97
-[2025-01-25 18:44:56,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.02 | bwd: 4638.23 | bwd_inner: 4633.40 | bwd_allreduce: 4.75 | step: 43.98
- 61%|██████▏   | 3563/5800 [9:58:26<4:18:11,  6.93s/it]                                                       {'loss': 0.0098, 'grad_norm': 0.4604395627975464, 'learning_rate': 1.3677506157458887e-05, 'epoch': 30.72}
- 61%|██████▏   | 3563/5800 [9:58:26<4:18:11,  6.93s/it]score1 tensor([[0.4648],
-        [0.6016],
-        [0.6289],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.6016, 0.6211, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:03,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:45:03,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.82 | bwd_microstep: 4584.40 | bwd_inner_microstep: 4579.65 | bwd_allreduce_microstep: 4.60 | step_microstep: 43.63
-[2025-01-25 18:45:03,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.78 | bwd: 4584.43 | bwd_inner: 4579.65 | bwd_allreduce: 4.66 | step: 43.64
- 61%|██████▏   | 3564/5800 [9:58:33<4:17:31,  6.91s/it]                                                       {'loss': 0.0078, 'grad_norm': 2.508401393890381, 'learning_rate': 1.3666911749503991e-05, 'epoch': 30.72}
- 61%|██████▏   | 3564/5800 [9:58:33<4:17:31,  6.91s/it]score1 tensor([[0.5312],
-        [0.3984],
-        [0.5039],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4141, 0.4922, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:10,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 18:45:10,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.81 | bwd_microstep: 4640.79 | bwd_inner_microstep: 4635.98 | bwd_allreduce_microstep: 4.73 | step_microstep: 44.63
-[2025-01-25 18:45:10,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.77 | bwd: 4640.81 | bwd_inner: 4635.98 | bwd_allreduce: 4.77 | step: 44.63
- 61%|██████▏   | 3565/5800 [9:58:40<4:17:36,  6.92s/it]                                                       {'loss': 0.0146, 'grad_norm': 0.32934409379959106, 'learning_rate': 1.3656319316315903e-05, 'epoch': 30.73}
- 61%|██████▏   | 3565/5800 [9:58:40<4:17:36,  6.92s/it]score1 tensor([[0.4316],
-        [0.4316],
-        [0.4707],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4141, 0.4844, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:17,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.37
-[2025-01-25 18:45:17,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.09 | bwd_microstep: 4644.52 | bwd_inner_microstep: 4639.54 | bwd_allreduce_microstep: 4.90 | step_microstep: 44.53
-[2025-01-25 18:45:17,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.05 | bwd: 4644.55 | bwd_inner: 4639.54 | bwd_allreduce: 4.94 | step: 44.56
- 61%|██████▏   | 3566/5800 [9:58:47<4:17:40,  6.92s/it]                                                       {'loss': 0.0166, 'grad_norm': 4.011900424957275, 'learning_rate': 1.3645728861197512e-05, 'epoch': 30.74}
- 61%|██████▏   | 3566/5800 [9:58:47<4:17:40,  6.92s/it]score1 tensor([[0.4785],
-        [0.4570],
-        [0.4453],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4453, 0.4473, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:24,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 18:45:24,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.62 | bwd_microstep: 4642.80 | bwd_inner_microstep: 4637.60 | bwd_allreduce_microstep: 5.11 | step_microstep: 44.26
-[2025-01-25 18:45:24,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.60 | bwd: 4642.83 | bwd_inner: 4637.60 | bwd_allreduce: 5.16 | step: 44.27
- 62%|██████▏   | 3567/5800 [9:58:54<4:17:40,  6.92s/it]                                                       {'loss': 0.0093, 'grad_norm': 0.6502198576927185, 'learning_rate': 1.3635140387451129e-05, 'epoch': 30.75}
- 62%|██████▏   | 3567/5800 [9:58:54<4:17:40,  6.92s/it]score1 tensor([[0.6328],
-        [0.5156],
-        [0.4219],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5469, 0.4043, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:31,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.37
-[2025-01-25 18:45:31,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.75 | bwd_microstep: 4634.06 | bwd_inner_microstep: 4629.01 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.24
-[2025-01-25 18:45:31,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.72 | bwd: 4634.10 | bwd_inner: 4629.01 | bwd_allreduce: 4.99 | step: 42.25
- 62%|██████▏   | 3568/5800 [9:59:01<4:17:25,  6.92s/it]                                                       {'loss': 0.0161, 'grad_norm': 3.889557361602783, 'learning_rate': 1.3624553898378404e-05, 'epoch': 30.76}
- 62%|██████▏   | 3568/5800 [9:59:01<4:17:25,  6.92s/it]score1 tensor([[0.4668],
-        [0.4766],
-        [0.5742],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4727, 0.5781, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:38,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 18:45:38,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.97 | bwd_microstep: 4640.36 | bwd_inner_microstep: 4634.23 | bwd_allreduce_microstep: 5.99 | step_microstep: 47.41
-[2025-01-25 18:45:38,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.93 | bwd: 4640.39 | bwd_inner: 4634.23 | bwd_allreduce: 6.06 | step: 47.42
- 62%|██████▏   | 3569/5800 [9:59:08<4:17:19,  6.92s/it]                                                       {'loss': 0.0093, 'grad_norm': 0.6772372722625732, 'learning_rate': 1.3613969397280405e-05, 'epoch': 30.77}
- 62%|██████▏   | 3569/5800 [9:59:08<4:17:19,  6.92s/it]score1 tensor([[0.4941],
-        [0.5234],
-        [0.5156],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5312, 0.5078, 0.3613], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:44,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 18:45:44,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.25 | bwd_microstep: 4589.09 | bwd_inner_microstep: 4584.42 | bwd_allreduce_microstep: 4.58 | step_microstep: 45.40
-[2025-01-25 18:45:44,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.20 | bwd: 4589.11 | bwd_inner: 4584.42 | bwd_allreduce: 4.62 | step: 45.41
- 62%|██████▏   | 3570/5800 [9:59:14<4:16:44,  6.91s/it]                                                       {'loss': 0.0083, 'grad_norm': 1.7780030965805054, 'learning_rate': 1.3603386887457548e-05, 'epoch': 30.78}
- 62%|██████▏   | 3570/5800 [9:59:14<4:16:44,  6.91s/it]score1 tensor([[0.5508],
-        [0.5742],
-        [0.5156],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5508, 0.4961, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:51,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 18:45:51,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.50 | bwd_microstep: 4640.99 | bwd_inner_microstep: 4636.05 | bwd_allreduce_microstep: 4.81 | step_microstep: 44.14
-[2025-01-25 18:45:51,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.46 | bwd: 4641.02 | bwd_inner: 4636.05 | bwd_allreduce: 4.88 | step: 44.15
- 62%|██████▏   | 3571/5800 [9:59:21<4:16:51,  6.91s/it]                                                       {'loss': 0.0176, 'grad_norm': 8.208917617797852, 'learning_rate': 1.3592806372209656e-05, 'epoch': 30.78}
- 62%|██████▏   | 3571/5800 [9:59:21<4:16:51,  6.91s/it]score1 tensor([[0.4883],
-        [0.5547],
-        [0.4434],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.5625, 0.4551, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:45:58,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 18:45:58,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.20 | bwd_microstep: 4638.94 | bwd_inner_microstep: 4633.38 | bwd_allreduce_microstep: 5.47 | step_microstep: 48.25
-[2025-01-25 18:45:58,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.14 | bwd: 4638.97 | bwd_inner: 4633.38 | bwd_allreduce: 5.52 | step: 48.27
- 62%|██████▏   | 3572/5800 [9:59:28<4:16:52,  6.92s/it]                                                       {'loss': 0.0068, 'grad_norm': 7.887229919433594, 'learning_rate': 1.3582227854835914e-05, 'epoch': 30.79}
- 62%|██████▏   | 3572/5800 [9:59:28<4:16:52,  6.92s/it]score1 tensor([[0.4355],
-        [0.4570],
-        [0.5625],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4863, 0.5664, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:05,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 18:46:05,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.15 | bwd_microstep: 4637.03 | bwd_inner_microstep: 4631.67 | bwd_allreduce_microstep: 5.28 | step_microstep: 44.39
-[2025-01-25 18:46:05,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.09 | bwd: 4637.06 | bwd_inner: 4631.67 | bwd_allreduce: 5.32 | step: 44.39
- 62%|██████▏   | 3573/5800 [9:59:35<4:16:48,  6.92s/it]                                                       {'loss': 0.0166, 'grad_norm': 0.3900964856147766, 'learning_rate': 1.3571651338634877e-05, 'epoch': 30.8}
- 62%|██████▏   | 3573/5800 [9:59:35<4:16:48,  6.92s/it]score1 tensor([[0.5352],
-        [0.4805],
-        [0.6992],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4844, 0.6836, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:12,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 18:46:12,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.29 | bwd_microstep: 4644.78 | bwd_inner_microstep: 4639.60 | bwd_allreduce_microstep: 5.06 | step_microstep: 46.01
-[2025-01-25 18:46:12,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.24 | bwd: 4644.83 | bwd_inner: 4639.60 | bwd_allreduce: 5.12 | step: 46.02
- 62%|██████▏   | 3574/5800 [9:59:42<4:16:49,  6.92s/it]                                                       {'loss': 0.0088, 'grad_norm': 3.5345003604888916, 'learning_rate': 1.3561076826904503e-05, 'epoch': 30.81}
- 62%|██████▏   | 3574/5800 [9:59:42<4:16:49,  6.92s/it]score1 tensor([[0.5742],
-        [0.5352],
-        [0.6250],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5391, 0.6445, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:19,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:46:19,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.91 | bwd_microstep: 4638.19 | bwd_inner_microstep: 4632.97 | bwd_allreduce_microstep: 5.12 | step_microstep: 51.13
-[2025-01-25 18:46:19,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.88 | bwd: 4638.22 | bwd_inner: 4632.97 | bwd_allreduce: 5.17 | step: 51.14
- 62%|██████▏   | 3575/5800 [9:59:49<4:16:49,  6.93s/it]                                                       {'loss': 0.0146, 'grad_norm': 4.39705228805542, 'learning_rate': 1.3550504322942088e-05, 'epoch': 30.82}
- 62%|██████▏   | 3575/5800 [9:59:49<4:16:49,  6.93s/it]score1 tensor([[0.5352],
-        [0.5039],
-        [0.4727],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4805, 0.4648, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:26,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:46:26,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.46 | bwd_microstep: 4635.21 | bwd_inner_microstep: 4630.08 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.06
-[2025-01-25 18:46:26,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.42 | bwd: 4635.23 | bwd_inner: 4630.08 | bwd_allreduce: 5.08 | step: 45.07
- 62%|██████▏   | 3576/5800 [9:59:56<4:16:36,  6.92s/it]                                                       {'loss': 0.0107, 'grad_norm': 4.179659366607666, 'learning_rate': 1.353993383004434e-05, 'epoch': 30.83}
- 62%|██████▏   | 3576/5800 [9:59:56<4:16:36,  6.92s/it]score1 tensor([[0.5312],
-        [0.5781],
-        [0.4668],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5469, 0.4785, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:33,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 18:46:33,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.85 | bwd_microstep: 4636.59 | bwd_inner_microstep: 4631.14 | bwd_allreduce_microstep: 5.34 | step_microstep: 45.99
-[2025-01-25 18:46:33,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.80 | bwd: 4636.61 | bwd_inner: 4631.14 | bwd_allreduce: 5.39 | step: 46.00
- 62%|██████▏   | 3577/5800 [10:00:03<4:16:32,  6.92s/it]                                                        {'loss': 0.0137, 'grad_norm': 0.7673910856246948, 'learning_rate': 1.352936535150732e-05, 'epoch': 30.84}
- 62%|██████▏   | 3577/5800 [10:00:03<4:16:32,  6.92s/it]score1 tensor([[0.6211],
-        [0.7227],
-        [0.4688],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.7070, 0.4844, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:40,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 18:46:40,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.40 | bwd_microstep: 4590.33 | bwd_inner_microstep: 4584.87 | bwd_allreduce_microstep: 5.37 | step_microstep: 53.27
-[2025-01-25 18:46:40,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.35 | bwd: 4590.36 | bwd_inner: 4584.87 | bwd_allreduce: 5.42 | step: 53.29
- 62%|██████▏   | 3578/5800 [10:00:10<4:16:03,  6.91s/it]                                                        {'loss': 0.0112, 'grad_norm': 2.7134315967559814, 'learning_rate': 1.3518798890626448e-05, 'epoch': 30.84}
- 62%|██████▏   | 3578/5800 [10:00:10<4:16:03,  6.91s/it]score1 tensor([[0.5430],
-        [0.4160],
-        [0.5703],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4004, 0.5664, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:47,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 18:46:47,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.01 | bwd_microstep: 4650.37 | bwd_inner_microstep: 4645.56 | bwd_allreduce_microstep: 4.71 | step_microstep: 44.50
-[2025-01-25 18:46:47,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.96 | bwd: 4650.39 | bwd_inner: 4645.56 | bwd_allreduce: 4.76 | step: 44.51
- 62%|██████▏   | 3579/5800 [10:00:17<4:16:15,  6.92s/it]                                                        {'loss': 0.0127, 'grad_norm': 3.857592821121216, 'learning_rate': 1.3508234450696552e-05, 'epoch': 30.85}
- 62%|██████▏   | 3579/5800 [10:00:17<4:16:15,  6.92s/it]score1 tensor([[0.4668],
-        [0.6094],
-        [0.5898],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.6094, 0.5664, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:46:54,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.37
-[2025-01-25 18:46:54,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.26 | bwd_microstep: 4590.06 | bwd_inner_microstep: 4584.41 | bwd_allreduce_microstep: 5.53 | step_microstep: 44.60
-[2025-01-25 18:46:54,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.22 | bwd: 4590.09 | bwd_inner: 4584.41 | bwd_allreduce: 5.59 | step: 44.61
- 62%|██████▏   | 3580/5800 [10:00:24<4:15:37,  6.91s/it]                                                        {'loss': 0.0156, 'grad_norm': 2.3004212379455566, 'learning_rate': 1.3497672035011786e-05, 'epoch': 30.86}
- 62%|██████▏   | 3580/5800 [10:00:24<4:15:37,  6.91s/it]score1 tensor([[0.4941],
-        [0.3750],
-        [0.5039],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.3750, 0.4961, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:01,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 18:47:01,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.99 | bwd_microstep: 4593.35 | bwd_inner_microstep: 4588.00 | bwd_allreduce_microstep: 5.26 | step_microstep: 44.23
-[2025-01-25 18:47:01,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.96 | bwd: 4593.38 | bwd_inner: 4588.00 | bwd_allreduce: 5.31 | step: 44.24
- 62%|██████▏   | 3581/5800 [10:00:30<4:15:09,  6.90s/it]                                                        {'loss': 0.0044, 'grad_norm': 1.8331987857818604, 'learning_rate': 1.3487111646865714e-05, 'epoch': 30.87}
- 62%|██████▏   | 3581/5800 [10:00:30<4:15:09,  6.90s/it]score1 tensor([[0.5234],
-        [0.5820],
-        [0.4375],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5742, 0.4492, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:07,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 18:47:07,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.13 | bwd_microstep: 4637.35 | bwd_inner_microstep: 4632.01 | bwd_allreduce_microstep: 5.21 | step_microstep: 45.57
-[2025-01-25 18:47:07,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.10 | bwd: 4637.38 | bwd_inner: 4632.01 | bwd_allreduce: 5.27 | step: 45.56
- 62%|██████▏   | 3582/5800 [10:00:37<4:15:18,  6.91s/it]                                                        {'loss': 0.0156, 'grad_norm': 3.6149473190307617, 'learning_rate': 1.347655328955123e-05, 'epoch': 30.88}
- 62%|██████▏   | 3582/5800 [10:00:37<4:15:18,  6.91s/it]score1 tensor([[0.6250],
-        [0.4785],
-        [0.4277],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4805, 0.4570, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:14,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 18:47:14,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.55 | bwd_microstep: 4645.32 | bwd_inner_microstep: 4640.15 | bwd_allreduce_microstep: 5.04 | step_microstep: 39.58
-[2025-01-25 18:47:14,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.49 | bwd: 4645.35 | bwd_inner: 4640.15 | bwd_allreduce: 5.12 | step: 39.59
- 62%|██████▏   | 3583/5800 [10:00:44<4:15:33,  6.92s/it]                                                        {'loss': 0.0137, 'grad_norm': 8.443559646606445, 'learning_rate': 1.346599696636063e-05, 'epoch': 30.89}
- 62%|██████▏   | 3583/5800 [10:00:44<4:15:33,  6.92s/it]score1 tensor([[0.4062],
-        [0.5898],
-        [0.5352],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5898, 0.6055, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:21,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.28 | optimizer_step: 4.36
-[2025-01-25 18:47:21,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.40 | bwd_microstep: 4586.13 | bwd_inner_microstep: 4581.37 | bwd_allreduce_microstep: 4.68 | step_microstep: 51.07
-[2025-01-25 18:47:21,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.36 | bwd: 4586.16 | bwd_inner: 4581.37 | bwd_allreduce: 4.72 | step: 51.08
- 62%|██████▏   | 3584/5800 [10:00:51<4:15:04,  6.91s/it]                                                        {'loss': 0.0259, 'grad_norm': 5.860926151275635, 'learning_rate': 1.3455442680585554e-05, 'epoch': 30.9}
- 62%|██████▏   | 3584/5800 [10:00:51<4:15:04,  6.91s/it]score1 tensor([[0.4512],
-        [0.5430],
-        [0.5664],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5508, 0.5625, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:28,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 18:47:28,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.47 | bwd_microstep: 4645.64 | bwd_inner_microstep: 4640.00 | bwd_allreduce_microstep: 5.53 | step_microstep: 49.21
-[2025-01-25 18:47:28,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.41 | bwd: 4645.68 | bwd_inner: 4640.00 | bwd_allreduce: 5.58 | step: 49.22
- 62%|██████▏   | 3585/5800 [10:00:58<4:15:23,  6.92s/it]                                                        {'loss': 0.0107, 'grad_norm': 3.71980619430542, 'learning_rate': 1.3444890435516996e-05, 'epoch': 30.91}
- 62%|██████▏   | 3585/5800 [10:00:58<4:15:23,  6.92s/it]score1 tensor([[0.4707],
-        [0.4043],
-        [0.4844],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.4043, 0.5156, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:35,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.26 | optimizer_step: 4.37
-[2025-01-25 18:47:35,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.67 | bwd_microstep: 4582.21 | bwd_inner_microstep: 4576.79 | bwd_allreduce_microstep: 5.31 | step_microstep: 49.07
-[2025-01-25 18:47:35,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.60 | bwd: 4582.24 | bwd_inner: 4576.79 | bwd_allreduce: 5.37 | step: 49.08
- 62%|██████▏   | 3586/5800 [10:01:05<4:15:03,  6.91s/it]                                                        {'loss': 0.0166, 'grad_norm': 6.21143102645874, 'learning_rate': 1.343434023444535e-05, 'epoch': 30.91}
- 62%|██████▏   | 3586/5800 [10:01:05<4:15:03,  6.91s/it]score1 tensor([[0.6289],
-        [0.3691],
-        [0.3887],
-        [0.6836]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.3652, 0.3945, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:42,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:47:42,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.94 | bwd_microstep: 4584.68 | bwd_inner_microstep: 4579.20 | bwd_allreduce_microstep: 5.40 | step_microstep: 44.52
-[2025-01-25 18:47:42,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.90 | bwd: 4584.71 | bwd_inner: 4579.20 | bwd_allreduce: 5.44 | step: 44.53
- 62%|██████▏   | 3587/5800 [10:01:12<4:14:28,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 2.5401153564453125, 'learning_rate': 1.3423792080660337e-05, 'epoch': 30.92}
- 62%|██████▏   | 3587/5800 [10:01:12<4:14:28,  6.90s/it]score1 tensor([[0.5586],
-        [0.5586],
-        [0.4863],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5312, 0.4863, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:49,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 18:47:49,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.59 | bwd_microstep: 4593.85 | bwd_inner_microstep: 4588.47 | bwd_allreduce_microstep: 5.31 | step_microstep: 44.44
-[2025-01-25 18:47:49,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.54 | bwd: 4593.87 | bwd_inner: 4588.47 | bwd_allreduce: 5.34 | step: 44.45
- 62%|██████▏   | 3588/5800 [10:01:19<4:14:14,  6.90s/it]                                                        {'loss': 0.0137, 'grad_norm': 2.469818353652954, 'learning_rate': 1.3413245977451066e-05, 'epoch': 30.93}
- 62%|██████▏   | 3588/5800 [10:01:19<4:14:14,  6.90s/it]score1 tensor([[0.5977],
-        [0.4746],
-        [0.5430],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4688, 0.5625, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:47:56,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:47:56,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.08 | bwd_microstep: 4594.11 | bwd_inner_microstep: 4588.93 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.23
-[2025-01-25 18:47:56,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.05 | bwd: 4594.13 | bwd_inner: 4588.93 | bwd_allreduce: 5.14 | step: 45.23
- 62%|██████▏   | 3589/5800 [10:01:26<4:13:55,  6.89s/it]                                                        {'loss': 0.0093, 'grad_norm': 2.4599008560180664, 'learning_rate': 1.3402701928105981e-05, 'epoch': 30.94}
- 62%|██████▏   | 3589/5800 [10:01:26<4:13:55,  6.89s/it]score1 tensor([[0.5195],
-        [0.4688],
-        [0.5742],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4668, 0.5430, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:48:03,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 18:48:03,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.61 | bwd_microstep: 4581.46 | bwd_inner_microstep: 4576.24 | bwd_allreduce_microstep: 5.12 | step_microstep: 44.84
-[2025-01-25 18:48:03,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.56 | bwd: 4581.49 | bwd_inner: 4576.24 | bwd_allreduce: 5.17 | step: 44.84
- 62%|██████▏   | 3590/5800 [10:01:33<4:13:34,  6.88s/it]                                                        {'loss': 0.0107, 'grad_norm': 2.3980727195739746, 'learning_rate': 1.3392159935912919e-05, 'epoch': 30.95}
- 62%|██████▏   | 3590/5800 [10:01:33<4:13:34,  6.88s/it]score1 tensor([[0.6055],
-        [0.6250],
-        [0.5156],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.6133, 0.5039, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:48:10,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:48:10,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.35 | bwd_microstep: 4646.77 | bwd_inner_microstep: 4641.53 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.29
-[2025-01-25 18:48:10,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.30 | bwd: 4646.80 | bwd_inner: 4641.53 | bwd_allreduce: 5.19 | step: 43.30
- 62%|██████▏   | 3591/5800 [10:01:40<4:13:59,  6.90s/it]                                                        {'loss': 0.0132, 'grad_norm': 8.679993629455566, 'learning_rate': 1.3381620004159037e-05, 'epoch': 30.96}
- 62%|██████▏   | 3591/5800 [10:01:40<4:13:59,  6.90s/it]score1 tensor([[0.5312],
-        [0.6133],
-        [0.5820],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5820, 0.5586, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:48:16,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 18:48:16,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.62 | bwd_microstep: 4642.90 | bwd_inner_microstep: 4637.76 | bwd_allreduce_microstep: 5.03 | step_microstep: 49.80
-[2025-01-25 18:48:16,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.57 | bwd: 4642.92 | bwd_inner: 4637.76 | bwd_allreduce: 5.08 | step: 49.80
- 62%|██████▏   | 3592/5800 [10:01:46<4:14:14,  6.91s/it]                                                        {'loss': 0.0259, 'grad_norm': 8.759068489074707, 'learning_rate': 1.337108213613088e-05, 'epoch': 30.97}
- 62%|██████▏   | 3592/5800 [10:01:46<4:14:14,  6.91s/it]score1 tensor([[0.4473],
-        [0.5820],
-        [0.4590],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5508, 0.4512, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:48:23,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 18:48:23,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.57 | bwd_microstep: 4645.62 | bwd_inner_microstep: 4640.95 | bwd_allreduce_microstep: 4.59 | step_microstep: 43.73
-[2025-01-25 18:48:23,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.53 | bwd: 4645.64 | bwd_inner: 4640.95 | bwd_allreduce: 4.62 | step: 43.74
- 62%|██████▏   | 3593/5800 [10:01:53<4:14:16,  6.91s/it]                                                        {'loss': 0.0166, 'grad_norm': 7.894493103027344, 'learning_rate': 1.3360546335114331e-05, 'epoch': 30.97}
- 62%|██████▏   | 3593/5800 [10:01:53<4:14:16,  6.91s/it]score1 tensor([[0.4316],
-        [0.5469],
-        [0.4141],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5234, 0.4004, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:48:30,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:48:30,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.06 | bwd_microstep: 4639.71 | bwd_inner_microstep: 4634.69 | bwd_allreduce_microstep: 4.94 | step_microstep: 44.23
-[2025-01-25 18:48:30,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.01 | bwd: 4639.74 | bwd_inner: 4634.69 | bwd_allreduce: 4.98 | step: 44.24
- 62%|██████▏   | 3594/5800 [10:02:00<4:14:17,  6.92s/it]                                                        {'loss': 0.0122, 'grad_norm': 8.027538299560547, 'learning_rate': 1.3350012604394642e-05, 'epoch': 30.98}
- 62%|██████▏   | 3594/5800 [10:02:00<4:14:17,  6.92s/it]score1 tensor([[0.6641],
-        [0.5703],
-        [0.6211],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5586, 0.6133, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:48:37,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 18:48:37,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.24 | bwd_microstep: 4643.10 | bwd_inner_microstep: 4638.16 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.61
-[2025-01-25 18:48:37,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.18 | bwd: 4643.13 | bwd_inner: 4638.16 | bwd_allreduce: 4.87 | step: 42.62
- 62%|██████▏   | 3595/5800 [10:02:07<4:14:18,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.627591609954834, 'learning_rate': 1.3339480947256412e-05, 'epoch': 30.99}
- 62%|██████▏   | 3595/5800 [10:02:07<4:14:18,  6.92s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:48:41,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 18:48:41,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 573.05 | bwd_microstep: 1221.86 | bwd_inner_microstep: 1217.34 | bwd_allreduce_microstep: 4.43 | step_microstep: 44.13
-[2025-01-25 18:48:41,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 573.02 | bwd: 1221.88 | bwd_inner: 1217.34 | bwd_allreduce: 4.47 | step: 44.14
- 62%|██████▏   | 3596/5800 [10:02:11<3:44:32,  6.11s/it]                                                        {'loss': 0.0098, 'grad_norm': 7.102183818817139, 'learning_rate': 1.3328951366983594e-05, 'epoch': 31.0}
- 62%|██████▏   | 3596/5800 [10:02:11<3:44:32,  6.11s/it][2025-01-25 18:48:46,590] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:48:57,193] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:49:07,985] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 18:49:19,241] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5703],
-        [0.5195],
-        [0.5117],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5273, 0.5391, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:49:33,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 18:49:33,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.59 | bwd_microstep: 4570.46 | bwd_inner_microstep: 4565.78 | bwd_allreduce_microstep: 4.58 | step_microstep: 45.81
-[2025-01-25 18:49:33,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.54 | bwd: 4570.49 | bwd_inner: 4565.78 | bwd_allreduce: 4.63 | step: 45.81
- 62%|██████▏   | 3597/5800 [10:03:03<12:07:55, 19.83s/it]                                                         {'loss': 0.0127, 'grad_norm': 3.8324649333953857, 'learning_rate': 1.3318423866859482e-05, 'epoch': 31.01}
- 62%|██████▏   | 3597/5800 [10:03:03<12:07:55, 19.83s/it]score1 tensor([[0.4023],
-        [0.3633],
-        [0.4219],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.3398, 0.4219, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:49:40,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 18:49:40,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.97 | bwd_microstep: 4545.35 | bwd_inner_microstep: 4540.04 | bwd_allreduce_microstep: 5.15 | step_microstep: 44.50
-[2025-01-25 18:49:40,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.94 | bwd: 4545.38 | bwd_inner: 4540.04 | bwd_allreduce: 5.23 | step: 44.51
- 62%|██████▏   | 3598/5800 [10:03:10<9:44:10, 15.92s/it]                                                         {'loss': 0.0127, 'grad_norm': 1.8623000383377075, 'learning_rate': 1.3307898450166756e-05, 'epoch': 31.02}
- 62%|██████▏   | 3598/5800 [10:03:10<9:44:10, 15.92s/it]score1 tensor([[0.4473],
-        [0.4629],
-        [0.6289],
-        [0.3242]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4707, 0.6289, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:49:47,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 18:49:47,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.15 | bwd_microstep: 4537.37 | bwd_inner_microstep: 4531.96 | bwd_allreduce_microstep: 5.31 | step_microstep: 42.33
-[2025-01-25 18:49:47,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.11 | bwd: 4537.40 | bwd_inner: 4531.96 | bwd_allreduce: 5.37 | step: 42.34
- 62%|██████▏   | 3599/5800 [10:03:17<8:03:36, 13.18s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.694351315498352, 'learning_rate': 1.3297375120187401e-05, 'epoch': 31.03}
- 62%|██████▏   | 3599/5800 [10:03:17<8:03:36, 13.18s/it]score1 tensor([[0.6992],
-        [0.4746],
-        [0.4375],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.4512, 0.4629, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:49:54,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 18:49:54,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.55 | bwd_microstep: 4592.55 | bwd_inner_microstep: 4587.21 | bwd_allreduce_microstep: 5.23 | step_microstep: 47.04
-[2025-01-25 18:49:54,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.52 | bwd: 4592.57 | bwd_inner: 4587.22 | bwd_allreduce: 5.29 | step: 47.04
- 62%|██████▏   | 3600/5800 [10:03:24<6:53:46, 11.28s/it]                                                        {'loss': 0.0171, 'grad_norm': 4.184047222137451, 'learning_rate': 1.3286853880202798e-05, 'epoch': 31.03}
- 62%|██████▏   | 3600/5800 [10:03:24<6:53:46, 11.28s/it]score1 tensor([[0.3906],
-        [0.3359],
-        [0.5703],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.3516, 0.5664, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:01,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 18:50:01,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.73 | bwd_microstep: 4601.95 | bwd_inner_microstep: 4596.72 | bwd_allreduce_microstep: 5.11 | step_microstep: 43.13
-[2025-01-25 18:50:01,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.69 | bwd: 4601.98 | bwd_inner: 4596.72 | bwd_allreduce: 5.17 | step: 43.14
- 62%|██████▏   | 3601/5800 [10:03:31<6:04:58,  9.96s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.719279944896698, 'learning_rate': 1.3276334733493624e-05, 'epoch': 31.04}
- 62%|██████▏   | 3601/5800 [10:03:31<6:04:58,  9.96s/it]score1 tensor([[0.5000],
-        [0.4961],
-        [0.4492],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5039, 0.4941, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:07,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:50:07,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.45 | bwd_microstep: 4562.74 | bwd_inner_microstep: 4554.93 | bwd_allreduce_microstep: 7.71 | step_microstep: 44.54
-[2025-01-25 18:50:07,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.41 | bwd: 4562.77 | bwd_inner: 4554.93 | bwd_allreduce: 7.77 | step: 44.54
- 62%|██████▏   | 3602/5800 [10:03:37<5:30:22,  9.02s/it]                                                        {'loss': 0.0151, 'grad_norm': 5.915563583374023, 'learning_rate': 1.3265817683339955e-05, 'epoch': 31.05}
- 62%|██████▏   | 3602/5800 [10:03:37<5:30:22,  9.02s/it]score1 tensor([[0.4355],
-        [0.4023],
-        [0.6445],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4316, 0.6641, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:14,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 18:50:14,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.75 | bwd_microstep: 4560.43 | bwd_inner_microstep: 4555.47 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.57
-[2025-01-25 18:50:14,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.71 | bwd: 4560.46 | bwd_inner: 4555.47 | bwd_allreduce: 4.91 | step: 43.56
- 62%|██████▏   | 3603/5800 [10:03:44<5:06:10,  8.36s/it]                                                        {'loss': 0.0151, 'grad_norm': 6.024079322814941, 'learning_rate': 1.325530273302118e-05, 'epoch': 31.06}
- 62%|██████▏   | 3603/5800 [10:03:44<5:06:10,  8.36s/it]score1 tensor([[0.6406],
-        [0.4980],
-        [0.5234],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5156, 0.5430, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:21,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 18:50:21,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.80 | bwd_microstep: 4606.58 | bwd_inner_microstep: 4600.85 | bwd_allreduce_microstep: 5.62 | step_microstep: 47.31
-[2025-01-25 18:50:21,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.77 | bwd: 4606.60 | bwd_inner: 4600.85 | bwd_allreduce: 5.67 | step: 47.32
- 62%|██████▏   | 3604/5800 [10:03:51<4:49:43,  7.92s/it]                                                        {'loss': 0.0142, 'grad_norm': 3.5464186668395996, 'learning_rate': 1.3244789885816039e-05, 'epoch': 31.07}
- 62%|██████▏   | 3604/5800 [10:03:51<4:49:43,  7.92s/it]score1 tensor([[0.6836],
-        [0.5391],
-        [0.5000],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5469, 0.5117, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:28,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 18:50:28,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.84 | bwd_microstep: 4610.59 | bwd_inner_microstep: 4605.64 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.63
-[2025-01-25 18:50:28,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.81 | bwd: 4610.62 | bwd_inner: 4605.64 | bwd_allreduce: 4.91 | step: 44.64
- 62%|██████▏   | 3605/5800 [10:03:58<4:38:21,  7.61s/it]                                                        {'loss': 0.0117, 'grad_norm': 8.315693855285645, 'learning_rate': 1.3234279145002629e-05, 'epoch': 31.08}
- 62%|██████▏   | 3605/5800 [10:03:58<4:38:21,  7.61s/it]score1 tensor([[0.4766],
-        [0.3398],
-        [0.4824],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.3652, 0.4961, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:35,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 18:50:35,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.59 | bwd_microstep: 4560.06 | bwd_inner_microstep: 4554.19 | bwd_allreduce_microstep: 5.76 | step_microstep: 43.76
-[2025-01-25 18:50:35,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.56 | bwd: 4560.08 | bwd_inner: 4554.20 | bwd_allreduce: 5.81 | step: 43.77
- 62%|██████▏   | 3606/5800 [10:04:05<4:29:47,  7.38s/it]                                                        {'loss': 0.0156, 'grad_norm': 5.605062484741211, 'learning_rate': 1.322377051385837e-05, 'epoch': 31.09}
- 62%|██████▏   | 3606/5800 [10:04:05<4:29:47,  7.38s/it]score1 tensor([[0.6133],
-        [0.5117],
-        [0.6562],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5039, 0.6719, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:42,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:50:42,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.81 | bwd_microstep: 4568.13 | bwd_inner_microstep: 4562.89 | bwd_allreduce_microstep: 5.17 | step_microstep: 42.90
-[2025-01-25 18:50:42,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.77 | bwd: 4568.16 | bwd_inner: 4562.89 | bwd_allreduce: 5.21 | step: 42.91
- 62%|██████▏   | 3607/5800 [10:04:12<4:23:52,  7.22s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.9579170942306519, 'learning_rate': 1.3213263995660051e-05, 'epoch': 31.09}
- 62%|██████▏   | 3607/5800 [10:04:12<4:23:52,  7.22s/it]score1 tensor([[0.5000],
-        [0.4668],
-        [0.4746],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4844, 0.4941, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:49,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 18:50:49,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.48 | bwd_microstep: 4603.46 | bwd_inner_microstep: 4597.36 | bwd_allreduce_microstep: 5.95 | step_microstep: 51.17
-[2025-01-25 18:50:49,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.43 | bwd: 4603.48 | bwd_inner: 4597.36 | bwd_allreduce: 6.03 | step: 51.17
- 62%|██████▏   | 3608/5800 [10:04:19<4:20:06,  7.12s/it]                                                        {'loss': 0.0161, 'grad_norm': 8.078700065612793, 'learning_rate': 1.3202759593683774e-05, 'epoch': 31.1}
- 62%|██████▏   | 3608/5800 [10:04:19<4:20:06,  7.12s/it]score1 tensor([[0.3535],
-        [0.4629],
-        [0.6289],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.2812, 0.4727, 0.6094, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:50:55,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 18:50:55,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.86 | bwd_microstep: 4564.06 | bwd_inner_microstep: 4558.60 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.97
-[2025-01-25 18:50:55,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.81 | bwd: 4564.11 | bwd_inner: 4558.60 | bwd_allreduce: 5.42 | step: 44.98
- 62%|██████▏   | 3609/5800 [10:04:25<4:17:00,  7.04s/it]                                                        {'loss': 0.0254, 'grad_norm': 2.0804388523101807, 'learning_rate': 1.3192257311204999e-05, 'epoch': 31.11}
- 62%|██████▏   | 3609/5800 [10:04:25<4:17:00,  7.04s/it]score1 tensor([[0.4805],
-        [0.3945],
-        [0.3359],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4023, 0.3457, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:02,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.49 | optimizer_step: 4.36
-[2025-01-25 18:51:02,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.89 | bwd_microstep: 4607.11 | bwd_inner_microstep: 4601.68 | bwd_allreduce_microstep: 5.31 | step_microstep: 47.08
-[2025-01-25 18:51:02,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.85 | bwd: 4607.15 | bwd_inner: 4601.68 | bwd_allreduce: 5.37 | step: 47.08
- 62%|██████▏   | 3610/5800 [10:04:32<4:15:17,  6.99s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.632155179977417, 'learning_rate': 1.3181757151498518e-05, 'epoch': 31.12}
- 62%|██████▏   | 3610/5800 [10:04:32<4:15:17,  6.99s/it]score1 tensor([[0.3789],
-        [0.5234],
-        [0.4707],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.5195, 0.4766, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:09,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 18:51:09,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.53 | bwd_microstep: 4606.19 | bwd_inner_microstep: 4600.74 | bwd_allreduce_microstep: 5.33 | step_microstep: 43.48
-[2025-01-25 18:51:09,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.49 | bwd: 4606.22 | bwd_inner: 4600.74 | bwd_allreduce: 5.39 | step: 43.48
- 62%|██████▏   | 3611/5800 [10:04:39<4:14:05,  6.96s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.8253551125526428, 'learning_rate': 1.3171259117838465e-05, 'epoch': 31.13}
- 62%|██████▏   | 3611/5800 [10:04:39<4:14:05,  6.96s/it]score1 tensor([[0.5625],
-        [0.4238],
-        [0.4434],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4551, 0.4297, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:16,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 18:51:16,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.21 | bwd_microstep: 4620.61 | bwd_inner_microstep: 4614.33 | bwd_allreduce_microstep: 6.16 | step_microstep: 49.64
-[2025-01-25 18:51:16,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.15 | bwd: 4620.64 | bwd_inner: 4614.33 | bwd_allreduce: 6.23 | step: 49.64
- 62%|██████▏   | 3612/5800 [10:04:46<4:13:22,  6.95s/it]                                                        {'loss': 0.02, 'grad_norm': 4.1449666023254395, 'learning_rate': 1.3160763213498307e-05, 'epoch': 31.14}
- 62%|██████▏   | 3612/5800 [10:04:46<4:13:22,  6.95s/it]score1 tensor([[0.5078],
-        [0.5625],
-        [0.5977],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5430, 0.5977, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:23,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:51:23,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.16 | bwd_microstep: 4578.91 | bwd_inner_microstep: 4572.98 | bwd_allreduce_microstep: 5.78 | step_microstep: 47.69
-[2025-01-25 18:51:23,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.11 | bwd: 4578.93 | bwd_inner: 4572.98 | bwd_allreduce: 5.86 | step: 47.69
- 62%|██████▏   | 3613/5800 [10:04:53<4:12:18,  6.92s/it]                                                        {'loss': 0.0107, 'grad_norm': 5.909097194671631, 'learning_rate': 1.3150269441750849e-05, 'epoch': 31.15}
- 62%|██████▏   | 3613/5800 [10:04:53<4:12:18,  6.92s/it]score1 tensor([[0.5781],
-        [0.6211],
-        [0.7109],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6133, 0.6875, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:30,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 18:51:30,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.37 | bwd_microstep: 4579.35 | bwd_inner_microstep: 4574.47 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.14
-[2025-01-25 18:51:30,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.31 | bwd: 4579.37 | bwd_inner: 4574.47 | bwd_allreduce: 4.84 | step: 43.14
- 62%|██████▏   | 3614/5800 [10:05:00<4:11:30,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 6.740379333496094, 'learning_rate': 1.3139777805868235e-05, 'epoch': 31.16}
- 62%|██████▏   | 3614/5800 [10:05:00<4:11:30,  6.90s/it]score1 tensor([[0.3984],
-        [0.5000],
-        [0.4688],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.5000, 0.4531, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:37,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 18:51:37,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.37 | bwd_microstep: 4579.46 | bwd_inner_microstep: 4574.47 | bwd_allreduce_microstep: 4.86 | step_microstep: 44.19
-[2025-01-25 18:51:37,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.32 | bwd: 4579.49 | bwd_inner: 4574.47 | bwd_allreduce: 4.92 | step: 44.20
- 62%|██████▏   | 3615/5800 [10:05:07<4:10:57,  6.89s/it]                                                        {'loss': 0.0132, 'grad_norm': 6.0072197914123535, 'learning_rate': 1.3129288309121937e-05, 'epoch': 31.16}
- 62%|██████▏   | 3615/5800 [10:05:07<4:10:57,  6.89s/it]score1 tensor([[0.5391],
-        [0.4844],
-        [0.3926],
-        [0.3613]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4941, 0.3652, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:44,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 18:51:44,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.33 | bwd_microstep: 4628.98 | bwd_inner_microstep: 4623.60 | bwd_allreduce_microstep: 5.30 | step_microstep: 44.57
-[2025-01-25 18:51:44,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.29 | bwd: 4629.00 | bwd_inner: 4623.60 | bwd_allreduce: 5.34 | step: 44.57
- 62%|██████▏   | 3616/5800 [10:05:14<4:11:01,  6.90s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.4267805516719818, 'learning_rate': 1.3118800954782755e-05, 'epoch': 31.17}
- 62%|██████▏   | 3616/5800 [10:05:14<4:11:01,  6.90s/it]score1 tensor([[0.5586],
-        [0.4688],
-        [0.4531],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4629, 0.4551, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:51,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.19 | optimizer_step: 4.36
-[2025-01-25 18:51:51,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.94 | bwd_microstep: 4635.87 | bwd_inner_microstep: 4630.86 | bwd_allreduce_microstep: 4.85 | step_microstep: 45.33
-[2025-01-25 18:51:51,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.90 | bwd: 4635.90 | bwd_inner: 4630.86 | bwd_allreduce: 4.95 | step: 45.34
- 62%|██████▏   | 3617/5800 [10:05:21<4:11:17,  6.91s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.9937021732330322, 'learning_rate': 1.3108315746120846e-05, 'epoch': 31.18}
- 62%|██████▏   | 3617/5800 [10:05:21<4:11:17,  6.91s/it]score1 tensor([[0.5508],
-        [0.6133],
-        [0.6172],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6094, 0.6445, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:51:58,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:51:58,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.36 | bwd_microstep: 4639.02 | bwd_inner_microstep: 4633.49 | bwd_allreduce_microstep: 5.41 | step_microstep: 47.46
-[2025-01-25 18:51:58,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.31 | bwd: 4639.04 | bwd_inner: 4633.49 | bwd_allreduce: 5.48 | step: 47.46
- 62%|██████▏   | 3618/5800 [10:05:27<4:11:26,  6.91s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.6982657313346863, 'learning_rate': 1.3097832686405655e-05, 'epoch': 31.19}
- 62%|██████▏   | 3618/5800 [10:05:27<4:11:26,  6.91s/it]score1 tensor([[0.4277],
-        [0.4121],
-        [0.4590],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4199, 0.4531, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:04,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:52:04,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.71 | bwd_microstep: 4625.40 | bwd_inner_microstep: 4620.03 | bwd_allreduce_microstep: 5.27 | step_microstep: 46.43
-[2025-01-25 18:52:04,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.66 | bwd: 4625.43 | bwd_inner: 4620.03 | bwd_allreduce: 5.32 | step: 46.44
- 62%|██████▏   | 3619/5800 [10:05:34<4:11:22,  6.92s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.34231501817703247, 'learning_rate': 1.3087351778906006e-05, 'epoch': 31.2}
- 62%|██████▏   | 3619/5800 [10:05:34<4:11:22,  6.92s/it]score1 tensor([[0.4258],
-        [0.5547],
-        [0.3398],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.5391, 0.3438, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:11,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 18:52:11,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.85 | bwd_microstep: 4587.54 | bwd_inner_microstep: 4582.06 | bwd_allreduce_microstep: 5.38 | step_microstep: 47.13
-[2025-01-25 18:52:11,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.81 | bwd: 4587.57 | bwd_inner: 4582.06 | bwd_allreduce: 5.44 | step: 47.14
- 62%|██████▏   | 3620/5800 [10:05:41<4:10:48,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.368422269821167, 'learning_rate': 1.307687302689001e-05, 'epoch': 31.21}
- 62%|██████▏   | 3620/5800 [10:05:41<4:10:48,  6.90s/it]score1 tensor([[0.4277],
-        [0.4609],
-        [0.6797],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4883, 0.6797, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:18,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:52:18,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.83 | bwd_microstep: 4550.81 | bwd_inner_microstep: 4544.98 | bwd_allreduce_microstep: 5.73 | step_microstep: 47.09
-[2025-01-25 18:52:18,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.78 | bwd: 4550.84 | bwd_inner: 4544.98 | bwd_allreduce: 5.78 | step: 47.10
- 62%|██████▏   | 3621/5800 [10:05:48<4:09:59,  6.88s/it]                                                        {'loss': 0.0068, 'grad_norm': 1.9382680654525757, 'learning_rate': 1.3066396433625137e-05, 'epoch': 31.22}
- 62%|██████▏   | 3621/5800 [10:05:48<4:09:59,  6.88s/it]score1 tensor([[0.4121],
-        [0.4922],
-        [0.4355],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4824, 0.4375, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:25,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 18:52:25,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4628.71 | bwd_inner_microstep: 4623.19 | bwd_allreduce_microstep: 5.41 | step_microstep: 45.29
-[2025-01-25 18:52:25,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.28 | bwd: 4628.74 | bwd_inner: 4623.19 | bwd_allreduce: 5.47 | step: 45.29
- 62%|██████▏   | 3622/5800 [10:05:55<4:10:13,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.8436853885650635, 'learning_rate': 1.3055922002378165e-05, 'epoch': 31.22}
- 62%|██████▏   | 3622/5800 [10:05:55<4:10:13,  6.89s/it]score1 tensor([[0.4180],
-        [0.5508],
-        [0.5977],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.5664, 0.6250, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:32,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 18:52:32,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.94 | bwd_microstep: 4631.28 | bwd_inner_microstep: 4625.19 | bwd_allreduce_microstep: 5.95 | step_microstep: 47.41
-[2025-01-25 18:52:32,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.91 | bwd: 4631.30 | bwd_inner: 4625.19 | bwd_allreduce: 6.04 | step: 47.42
- 62%|██████▏   | 3623/5800 [10:06:02<4:10:21,  6.90s/it]                                                        {'loss': 0.0229, 'grad_norm': 4.879299640655518, 'learning_rate': 1.3045449736415197e-05, 'epoch': 31.23}
- 62%|██████▏   | 3623/5800 [10:06:02<4:10:21,  6.90s/it]score1 tensor([[0.6250],
-        [0.5820],
-        [0.5430],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6523, 0.5898, 0.5625, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:39,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 18:52:39,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4630.57 | bwd_inner_microstep: 4624.89 | bwd_allreduce_microstep: 5.58 | step_microstep: 46.63
-[2025-01-25 18:52:39,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4630.60 | bwd_inner: 4624.89 | bwd_allreduce: 5.63 | step: 46.64
- 62%|██████▏   | 3624/5800 [10:06:09<4:10:25,  6.91s/it]                                                        {'loss': 0.0156, 'grad_norm': 4.757456302642822, 'learning_rate': 1.303497963900168e-05, 'epoch': 31.24}
- 62%|██████▏   | 3624/5800 [10:06:09<4:10:25,  6.91s/it]score1 tensor([[0.5078],
-        [0.5430],
-        [0.5312],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5508, 0.5469, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:46,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 18:52:46,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.33 | bwd_microstep: 4629.64 | bwd_inner_microstep: 4623.42 | bwd_allreduce_microstep: 6.11 | step_microstep: 52.88
-[2025-01-25 18:52:46,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.28 | bwd: 4629.66 | bwd_inner: 4623.42 | bwd_allreduce: 6.17 | step: 52.89
- 62%|██████▎   | 3625/5800 [10:06:16<4:10:32,  6.91s/it]                                                        {'loss': 0.0122, 'grad_norm': 4.357447147369385, 'learning_rate': 1.3024511713402355e-05, 'epoch': 31.25}
- 62%|██████▎   | 3625/5800 [10:06:16<4:10:32,  6.91s/it]score1 tensor([[0.5273],
-        [0.5117],
-        [0.5547],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4883, 0.5508, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:52:53,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 18:52:53,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.43 | bwd_microstep: 4584.29 | bwd_inner_microstep: 4578.72 | bwd_allreduce_microstep: 5.47 | step_microstep: 50.49
-[2025-01-25 18:52:53,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.40 | bwd: 4584.35 | bwd_inner: 4578.72 | bwd_allreduce: 5.52 | step: 50.50
- 63%|██████▎   | 3626/5800 [10:06:23<4:10:00,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 2.2275350093841553, 'learning_rate': 1.3014045962881322e-05, 'epoch': 31.26}
- 63%|██████▎   | 3626/5800 [10:06:23<4:10:00,  6.90s/it]score1 tensor([[0.4492],
-        [0.6016],
-        [0.4883],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.6172, 0.4883, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:00,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:53:00,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.40 | bwd_microstep: 4584.05 | bwd_inner_microstep: 4578.40 | bwd_allreduce_microstep: 5.54 | step_microstep: 46.89
-[2025-01-25 18:53:00,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.36 | bwd: 4584.07 | bwd_inner: 4578.40 | bwd_allreduce: 5.59 | step: 46.90
- 63%|██████▎   | 3627/5800 [10:06:30<4:09:38,  6.89s/it]                                                        {'loss': 0.0103, 'grad_norm': 2.3502345085144043, 'learning_rate': 1.3003582390701973e-05, 'epoch': 31.27}
- 63%|██████▎   | 3627/5800 [10:06:30<4:09:38,  6.89s/it]score1 tensor([[0.6523],
-        [0.4941],
-        [0.5078],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4980, 0.4785, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:06,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 18:53:06,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.29 | bwd_microstep: 4632.24 | bwd_inner_microstep: 4626.31 | bwd_allreduce_microstep: 5.82 | step_microstep: 49.82
-[2025-01-25 18:53:06,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.26 | bwd: 4632.28 | bwd_inner: 4626.31 | bwd_allreduce: 5.86 | step: 49.83
- 63%|██████▎   | 3628/5800 [10:06:36<4:09:54,  6.90s/it]                                                        {'loss': 0.0122, 'grad_norm': 0.5906041264533997, 'learning_rate': 1.2993121000127031e-05, 'epoch': 31.28}
- 63%|██████▎   | 3628/5800 [10:06:36<4:09:54,  6.90s/it]score1 tensor([[0.5117],
-        [0.4316],
-        [0.4648],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4004, 0.4453, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:13,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 18:53:13,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.96 | bwd_microstep: 4639.31 | bwd_inner_microstep: 4632.59 | bwd_allreduce_microstep: 6.63 | step_microstep: 48.86
-[2025-01-25 18:53:13,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.93 | bwd: 4639.34 | bwd_inner: 4632.59 | bwd_allreduce: 6.68 | step: 48.87
- 63%|██████▎   | 3629/5800 [10:06:43<4:10:03,  6.91s/it]                                                        {'loss': 0.02, 'grad_norm': 7.718921184539795, 'learning_rate': 1.298266179441854e-05, 'epoch': 31.28}
- 63%|██████▎   | 3629/5800 [10:06:43<4:10:03,  6.91s/it]score1 tensor([[0.5469],
-        [0.5508],
-        [0.5469],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5508, 0.5273, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:20,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 18:53:20,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.63 | bwd_microstep: 4585.37 | bwd_inner_microstep: 4579.96 | bwd_allreduce_microstep: 5.30 | step_microstep: 48.84
-[2025-01-25 18:53:20,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.60 | bwd: 4585.40 | bwd_inner: 4579.96 | bwd_allreduce: 5.35 | step: 48.85
- 63%|██████▎   | 3630/5800 [10:06:50<4:09:41,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.0307528972625732, 'learning_rate': 1.2972204776837865e-05, 'epoch': 31.29}
- 63%|██████▎   | 3630/5800 [10:06:50<4:09:41,  6.90s/it]score1 tensor([[0.5273],
-        [0.4375],
-        [0.4922],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4277, 0.4785, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:27,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 18:53:27,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.56 | bwd_microstep: 4635.19 | bwd_inner_microstep: 4630.21 | bwd_allreduce_microstep: 4.89 | step_microstep: 43.47
-[2025-01-25 18:53:27,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.51 | bwd: 4635.21 | bwd_inner: 4630.21 | bwd_allreduce: 4.94 | step: 43.49
- 63%|██████▎   | 3631/5800 [10:06:57<4:09:50,  6.91s/it]                                                        {'loss': 0.0127, 'grad_norm': 8.00152587890625, 'learning_rate': 1.2961749950645684e-05, 'epoch': 31.3}
- 63%|██████▎   | 3631/5800 [10:06:57<4:09:50,  6.91s/it]score1 tensor([[0.5703],
-        [0.5508],
-        [0.5234],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5469, 0.5039, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:34,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 18:53:34,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.88 | bwd_microstep: 4631.95 | bwd_inner_microstep: 4626.85 | bwd_allreduce_microstep: 4.99 | step_microstep: 44.26
-[2025-01-25 18:53:34,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.85 | bwd: 4631.98 | bwd_inner: 4626.85 | bwd_allreduce: 5.05 | step: 44.27
- 63%|██████▎   | 3632/5800 [10:07:04<4:09:50,  6.91s/it]                                                        {'loss': 0.0117, 'grad_norm': 8.390115737915039, 'learning_rate': 1.2951297319101985e-05, 'epoch': 31.31}
- 63%|██████▎   | 3632/5800 [10:07:04<4:09:50,  6.91s/it]score1 tensor([[0.6914],
-        [0.4531],
-        [0.6094],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.4453, 0.6211, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:41,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 18:53:41,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.07 | bwd_microstep: 4642.86 | bwd_inner_microstep: 4637.79 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.64
-[2025-01-25 18:53:41,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.04 | bwd: 4642.88 | bwd_inner: 4637.79 | bwd_allreduce: 5.02 | step: 44.65
- 63%|██████▎   | 3633/5800 [10:07:11<4:09:54,  6.92s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.217959403991699, 'learning_rate': 1.2940846885466099e-05, 'epoch': 31.32}
- 63%|██████▎   | 3633/5800 [10:07:11<4:09:54,  6.92s/it]score1 tensor([[0.4941],
-        [0.4395],
-        [0.5625],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4141, 0.5469, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:48,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 18:53:48,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.24 | bwd_microstep: 4641.04 | bwd_inner_microstep: 4635.80 | bwd_allreduce_microstep: 5.13 | step_microstep: 43.61
-[2025-01-25 18:53:48,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.20 | bwd: 4641.06 | bwd_inner: 4635.80 | bwd_allreduce: 5.18 | step: 43.62
- 63%|██████▎   | 3634/5800 [10:07:18<4:09:49,  6.92s/it]                                                        {'loss': 0.0151, 'grad_norm': 7.911653518676758, 'learning_rate': 1.2930398652996639e-05, 'epoch': 31.33}
- 63%|██████▎   | 3634/5800 [10:07:18<4:09:49,  6.92s/it]score1 tensor([[0.4922],
-        [0.5078],
-        [0.4180],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5195, 0.4180, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:53:55,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.16 | optimizer_step: 4.37
-[2025-01-25 18:53:55,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.51 | bwd_microstep: 4580.31 | bwd_inner_microstep: 4574.89 | bwd_allreduce_microstep: 5.35 | step_microstep: 43.93
-[2025-01-25 18:53:55,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.48 | bwd: 4580.34 | bwd_inner: 4574.89 | bwd_allreduce: 5.39 | step: 43.93
- 63%|██████▎   | 3635/5800 [10:07:25<4:09:06,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 6.332437992095947, 'learning_rate': 1.2919952624951542e-05, 'epoch': 31.34}
- 63%|██████▎   | 3635/5800 [10:07:25<4:09:06,  6.90s/it]score1 tensor([[0.5430],
-        [0.4980],
-        [0.5352],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5117, 0.5586, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:02,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 18:54:02,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4633.67 | bwd_inner_microstep: 4627.71 | bwd_allreduce_microstep: 5.85 | step_microstep: 45.46
-[2025-01-25 18:54:02,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.59 | bwd: 4633.70 | bwd_inner: 4627.71 | bwd_allreduce: 5.90 | step: 45.45
- 63%|██████▎   | 3636/5800 [10:07:32<4:09:13,  6.91s/it]                                                        {'loss': 0.0273, 'grad_norm': 4.529596328735352, 'learning_rate': 1.2909508804588071e-05, 'epoch': 31.34}
- 63%|██████▎   | 3636/5800 [10:07:32<4:09:13,  6.91s/it]score1 tensor([[0.4688],
-        [0.4688],
-        [0.4258],
-        [0.3379]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4551, 0.4355, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:09,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 18:54:09,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.43 | bwd_microstep: 4632.45 | bwd_inner_microstep: 4626.78 | bwd_allreduce_microstep: 5.54 | step_microstep: 46.16
-[2025-01-25 18:54:09,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.40 | bwd: 4632.48 | bwd_inner: 4626.78 | bwd_allreduce: 5.60 | step: 46.18
- 63%|██████▎   | 3637/5800 [10:07:39<4:09:23,  6.92s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.6327628493309021, 'learning_rate': 1.289906719516278e-05, 'epoch': 31.35}
- 63%|██████▎   | 3637/5800 [10:07:39<4:09:23,  6.92s/it]score1 tensor([[0.4531],
-        [0.5039],
-        [0.5820],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5117, 0.6172, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:16,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 18:54:16,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.05 | bwd_microstep: 4632.23 | bwd_inner_microstep: 4626.47 | bwd_allreduce_microstep: 5.65 | step_microstep: 44.86
-[2025-01-25 18:54:16,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.02 | bwd: 4632.26 | bwd_inner: 4626.47 | bwd_allreduce: 5.70 | step: 44.86
- 63%|██████▎   | 3638/5800 [10:07:46<4:09:18,  6.92s/it]                                                        {'loss': 0.0161, 'grad_norm': 8.138400077819824, 'learning_rate': 1.2888627799931555e-05, 'epoch': 31.36}
- 63%|██████▎   | 3638/5800 [10:07:46<4:09:18,  6.92s/it]score1 tensor([[0.5312],
-        [0.4297],
-        [0.4727],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4277, 0.4746, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:23,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:54:23,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.83 | bwd_microstep: 4636.76 | bwd_inner_microstep: 4631.33 | bwd_allreduce_microstep: 5.29 | step_microstep: 46.90
-[2025-01-25 18:54:23,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.79 | bwd: 4636.79 | bwd_inner: 4631.33 | bwd_allreduce: 5.35 | step: 46.91
- 63%|██████▎   | 3639/5800 [10:07:53<4:09:15,  6.92s/it]                                                        {'loss': 0.0029, 'grad_norm': 0.45817720890045166, 'learning_rate': 1.2878190622149566e-05, 'epoch': 31.37}
- 63%|██████▎   | 3639/5800 [10:07:53<4:09:15,  6.92s/it]score1 tensor([[0.4941],
-        [0.5977],
-        [0.5938],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6133, 0.6094, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:30,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 18:54:30,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.89 | bwd_microstep: 4636.71 | bwd_inner_microstep: 4631.27 | bwd_allreduce_microstep: 5.35 | step_microstep: 46.26
-[2025-01-25 18:54:30,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.85 | bwd: 4636.74 | bwd_inner: 4631.27 | bwd_allreduce: 5.39 | step: 46.27
- 63%|██████▎   | 3640/5800 [10:07:59<4:09:11,  6.92s/it]                                                        {'loss': 0.0112, 'grad_norm': 4.692072868347168, 'learning_rate': 1.2867755665071328e-05, 'epoch': 31.38}
- 63%|██████▎   | 3640/5800 [10:07:59<4:09:11,  6.92s/it]score1 tensor([[0.4980],
-        [0.4785],
-        [0.4766],
-        [0.3379]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4492, 0.4688, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:36,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 18:54:36,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.64 | bwd_microstep: 4633.27 | bwd_inner_microstep: 4628.28 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.14
-[2025-01-25 18:54:36,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.60 | bwd: 4633.30 | bwd_inner: 4628.28 | bwd_allreduce: 4.95 | step: 43.14
- 63%|██████▎   | 3641/5800 [10:08:06<4:09:06,  6.92s/it]                                                        {'loss': 0.0176, 'grad_norm': 3.5434012413024902, 'learning_rate': 1.2857322931950624e-05, 'epoch': 31.39}
- 63%|██████▎   | 3641/5800 [10:08:06<4:09:06,  6.92s/it]score1 tensor([[0.4492],
-        [0.5586],
-        [0.6133],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5664, 0.6211, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:43,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.49 | optimizer_step: 4.37
-[2025-01-25 18:54:43,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.02 | bwd_microstep: 4639.59 | bwd_inner_microstep: 4634.24 | bwd_allreduce_microstep: 5.27 | step_microstep: 53.99
-[2025-01-25 18:54:43,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.97 | bwd: 4639.61 | bwd_inner: 4634.24 | bwd_allreduce: 5.31 | step: 54.00
- 63%|██████▎   | 3642/5800 [10:08:13<4:09:07,  6.93s/it]                                                        {'loss': 0.0073, 'grad_norm': 8.39931583404541, 'learning_rate': 1.2846892426040563e-05, 'epoch': 31.4}
- 63%|██████▎   | 3642/5800 [10:08:13<4:09:07,  6.93s/it]score1 tensor([[0.5508],
-        [0.5508],
-        [0.5039],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5508, 0.5156, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:50,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 18:54:50,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.22 | bwd_microstep: 4580.85 | bwd_inner_microstep: 4575.35 | bwd_allreduce_microstep: 5.38 | step_microstep: 46.98
-[2025-01-25 18:54:50,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.18 | bwd: 4580.87 | bwd_inner: 4575.35 | bwd_allreduce: 5.45 | step: 46.98
- 63%|██████▎   | 3643/5800 [10:08:20<4:08:25,  6.91s/it]                                                        {'loss': 0.0186, 'grad_norm': 6.379092693328857, 'learning_rate': 1.2836464150593564e-05, 'epoch': 31.41}
- 63%|██████▎   | 3643/5800 [10:08:20<4:08:25,  6.91s/it]score1 tensor([[0.5977],
-        [0.6289],
-        [0.4805],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6445, 0.4766, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:54:57,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 18:54:57,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.03 | bwd_microstep: 4641.74 | bwd_inner_microstep: 4636.48 | bwd_allreduce_microstep: 5.16 | step_microstep: 46.17
-[2025-01-25 18:54:57,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.99 | bwd: 4641.77 | bwd_inner: 4636.48 | bwd_allreduce: 5.21 | step: 46.18
- 63%|██████▎   | 3644/5800 [10:08:27<4:08:34,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.759146213531494, 'learning_rate': 1.2826038108861335e-05, 'epoch': 31.41}
- 63%|██████▎   | 3644/5800 [10:08:27<4:08:34,  6.92s/it]score1 tensor([[0.6016],
-        [0.5625],
-        [0.6328],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5781, 0.6367, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:04,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 18:55:04,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.76 | bwd_microstep: 4643.10 | bwd_inner_microstep: 4637.62 | bwd_allreduce_microstep: 5.39 | step_microstep: 45.29
-[2025-01-25 18:55:04,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.72 | bwd: 4643.13 | bwd_inner: 4637.62 | bwd_allreduce: 5.44 | step: 45.30
- 63%|██████▎   | 3645/5800 [10:08:34<4:08:38,  6.92s/it]                                                        {'loss': 0.0117, 'grad_norm': 8.890583038330078, 'learning_rate': 1.2815614304094913e-05, 'epoch': 31.42}
- 63%|██████▎   | 3645/5800 [10:08:34<4:08:38,  6.92s/it]score1 tensor([[0.6641],
-        [0.5430],
-        [0.4961],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5312, 0.4805, 0.3711], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:11,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 18:55:11,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.08 | bwd_microstep: 4643.86 | bwd_inner_microstep: 4634.54 | bwd_allreduce_microstep: 9.24 | step_microstep: 48.10
-[2025-01-25 18:55:11,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.04 | bwd: 4643.88 | bwd_inner: 4634.53 | bwd_allreduce: 9.28 | step: 48.11
- 63%|██████▎   | 3646/5800 [10:08:41<4:08:37,  6.93s/it]                                                        {'loss': 0.0112, 'grad_norm': 8.237866401672363, 'learning_rate': 1.2805192739544602e-05, 'epoch': 31.43}
- 63%|██████▎   | 3646/5800 [10:08:41<4:08:37,  6.93s/it]score1 tensor([[0.5000],
-        [0.5898],
-        [0.5352],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5742, 0.5273, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:18,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:55:18,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.29 | bwd_microstep: 4637.82 | bwd_inner_microstep: 4632.77 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.30
-[2025-01-25 18:55:18,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.26 | bwd: 4637.84 | bwd_inner: 4632.77 | bwd_allreduce: 5.00 | step: 45.31
- 63%|██████▎   | 3647/5800 [10:08:48<4:08:36,  6.93s/it]                                                        {'loss': 0.0176, 'grad_norm': 8.263937950134277, 'learning_rate': 1.279477341846004e-05, 'epoch': 31.44}
- 63%|██████▎   | 3647/5800 [10:08:48<4:08:36,  6.93s/it]score1 tensor([[0.4980],
-        [0.7070],
-        [0.4844],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6953, 0.4648, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:25,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.83 | optimizer_step: 4.36
-[2025-01-25 18:55:25,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.11 | bwd_microstep: 4629.52 | bwd_inner_microstep: 4624.60 | bwd_allreduce_microstep: 4.80 | step_microstep: 49.62
-[2025-01-25 18:55:25,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.09 | bwd: 4629.55 | bwd_inner: 4624.60 | bwd_allreduce: 4.86 | step: 49.63
- 63%|██████▎   | 3648/5800 [10:08:55<4:08:27,  6.93s/it]                                                        {'loss': 0.0122, 'grad_norm': 8.521627426147461, 'learning_rate': 1.2784356344090145e-05, 'epoch': 31.45}
- 63%|██████▎   | 3648/5800 [10:08:55<4:08:27,  6.93s/it]score1 tensor([[0.5625],
-        [0.4238],
-        [0.4414],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.3262, 0.3945, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0400, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:32,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 18:55:32,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.62 | bwd_microstep: 4639.47 | bwd_inner_microstep: 4634.03 | bwd_allreduce_microstep: 5.33 | step_microstep: 47.54
-[2025-01-25 18:55:32,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.53 | bwd: 4639.50 | bwd_inner: 4634.04 | bwd_allreduce: 5.38 | step: 47.54
- 63%|██████▎   | 3649/5800 [10:09:02<4:08:22,  6.93s/it]                                                        {'loss': 0.04, 'grad_norm': 11.5612211227417, 'learning_rate': 1.2773941519683142e-05, 'epoch': 31.46}
- 63%|██████▎   | 3649/5800 [10:09:02<4:08:22,  6.93s/it]score1 tensor([[0.4414],
-        [0.5000],
-        [0.5938],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4785, 0.6016, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:39,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:55:39,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.18 | bwd_microstep: 4633.46 | bwd_inner_microstep: 4628.55 | bwd_allreduce_microstep: 4.80 | step_microstep: 49.09
-[2025-01-25 18:55:39,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.15 | bwd: 4633.49 | bwd_inner: 4628.55 | bwd_allreduce: 4.86 | step: 49.10
- 63%|██████▎   | 3650/5800 [10:09:09<4:08:18,  6.93s/it]                                                        {'loss': 0.0176, 'grad_norm': 3.557309150695801, 'learning_rate': 1.2763528948486563e-05, 'epoch': 31.47}
- 63%|██████▎   | 3650/5800 [10:09:09<4:08:18,  6.93s/it]score1 tensor([[0.4609],
-        [0.6602],
-        [0.4336],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.6484, 0.4004, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:46,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 18:55:46,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.61 | bwd_microstep: 4633.05 | bwd_inner_microstep: 4627.98 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.27
-[2025-01-25 18:55:46,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.56 | bwd: 4633.08 | bwd_inner: 4627.98 | bwd_allreduce: 5.03 | step: 43.28
- 63%|██████▎   | 3651/5800 [10:09:16<4:08:05,  6.93s/it]                                                        {'loss': 0.0225, 'grad_norm': 7.85680627822876, 'learning_rate': 1.2753118633747214e-05, 'epoch': 31.47}
- 63%|██████▎   | 3651/5800 [10:09:16<4:08:05,  6.93s/it]score1 tensor([[0.4336],
-        [0.4355],
-        [0.4609],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.3906, 0.4473, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0244, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:53,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 18:55:53,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.50 | bwd_microstep: 4641.70 | bwd_inner_microstep: 4636.85 | bwd_allreduce_microstep: 4.77 | step_microstep: 44.17
-[2025-01-25 18:55:53,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.47 | bwd: 4641.73 | bwd_inner: 4636.85 | bwd_allreduce: 4.81 | step: 44.17
- 63%|██████▎   | 3652/5800 [10:09:23<4:07:58,  6.93s/it]                                                        {'loss': 0.0244, 'grad_norm': 7.474110126495361, 'learning_rate': 1.2742710578711226e-05, 'epoch': 31.48}
- 63%|██████▎   | 3652/5800 [10:09:23<4:07:58,  6.93s/it]score1 tensor([[0.5820],
-        [0.5859],
-        [0.4570],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5938, 0.4453, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:55:59,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 18:55:59,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.81 | bwd_microstep: 4589.88 | bwd_inner_microstep: 4585.13 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.07
-[2025-01-25 18:55:59,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.75 | bwd: 4589.90 | bwd_inner: 4585.13 | bwd_allreduce: 4.70 | step: 42.08
- 63%|██████▎   | 3653/5800 [10:09:29<4:07:14,  6.91s/it]                                                        {'loss': 0.0078, 'grad_norm': 1.7063014507293701, 'learning_rate': 1.2732304786624005e-05, 'epoch': 31.49}
- 63%|██████▎   | 3653/5800 [10:09:29<4:07:14,  6.91s/it]score1 tensor([[0.5586],
-        [0.5508],
-        [0.5273],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5508, 0.5312, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:06,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 18:56:06,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.24 | bwd_microstep: 4575.41 | bwd_inner_microstep: 4570.54 | bwd_allreduce_microstep: 4.76 | step_microstep: 46.04
-[2025-01-25 18:56:06,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.20 | bwd: 4575.43 | bwd_inner: 4570.54 | bwd_allreduce: 4.81 | step: 46.05
- 63%|██████▎   | 3654/5800 [10:09:36<4:06:35,  6.89s/it]                                                        {'loss': 0.0156, 'grad_norm': 6.12371826171875, 'learning_rate': 1.2721901260730252e-05, 'epoch': 31.5}
- 63%|██████▎   | 3654/5800 [10:09:36<4:06:35,  6.89s/it]score1 tensor([[0.5195],
-        [0.5039],
-        [0.5312],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5156, 0.5547, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:13,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 18:56:13,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.42 | bwd_microstep: 4636.20 | bwd_inner_microstep: 4631.62 | bwd_allreduce_microstep: 4.51 | step_microstep: 44.10
-[2025-01-25 18:56:13,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4636.22 | bwd_inner: 4631.61 | bwd_allreduce: 4.55 | step: 44.11
- 63%|██████▎   | 3655/5800 [10:09:43<4:06:47,  6.90s/it]                                                        {'loss': 0.0151, 'grad_norm': 4.1396613121032715, 'learning_rate': 1.2711500004273978e-05, 'epoch': 31.51}
- 63%|██████▎   | 3655/5800 [10:09:43<4:06:47,  6.90s/it]score1 tensor([[0.4863],
-        [0.4082],
-        [0.4453],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4043, 0.4609, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:20,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 18:56:20,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.66 | bwd_microstep: 4652.55 | bwd_inner_microstep: 4646.37 | bwd_allreduce_microstep: 6.10 | step_microstep: 50.98
-[2025-01-25 18:56:20,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.60 | bwd: 4652.58 | bwd_inner: 4646.37 | bwd_allreduce: 6.14 | step: 50.98
- 63%|██████▎   | 3656/5800 [10:09:50<4:07:10,  6.92s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.44515398144721985, 'learning_rate': 1.2701101020498459e-05, 'epoch': 31.52}
- 63%|██████▎   | 3656/5800 [10:09:50<4:07:10,  6.92s/it]score1 tensor([[0.6094],
-        [0.5273],
-        [0.4395],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5508, 0.4668, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:27,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:56:27,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.42 | bwd_microstep: 4577.34 | bwd_inner_microstep: 4572.55 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.20
-[2025-01-25 18:56:27,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4577.37 | bwd_inner: 4572.55 | bwd_allreduce: 4.73 | step: 42.21
- 63%|██████▎   | 3657/5800 [10:09:57<4:06:28,  6.90s/it]                                                        {'loss': 0.0137, 'grad_norm': 5.719315528869629, 'learning_rate': 1.2690704312646298e-05, 'epoch': 31.53}
- 63%|██████▎   | 3657/5800 [10:09:57<4:06:28,  6.90s/it]score1 tensor([[0.5781],
-        [0.5078],
-        [0.6055],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5234, 0.6211, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:34,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 18:56:34,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4632.98 | bwd_inner_microstep: 4627.92 | bwd_allreduce_microstep: 4.98 | step_microstep: 42.56
-[2025-01-25 18:56:34,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.86 | bwd: 4633.00 | bwd_inner: 4627.92 | bwd_allreduce: 5.01 | step: 42.57
- 63%|██████▎   | 3658/5800 [10:10:04<4:06:29,  6.90s/it]                                                        {'loss': 0.0205, 'grad_norm': 8.599835395812988, 'learning_rate': 1.2680309883959361e-05, 'epoch': 31.53}
- 63%|██████▎   | 3658/5800 [10:10:04<4:06:29,  6.90s/it]score1 tensor([[0.3750],
-        [0.3652],
-        [0.5312],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.3672, 0.5547, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:41,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 18:56:41,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.20 | bwd_microstep: 4632.66 | bwd_inner_microstep: 4627.48 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.27
-[2025-01-25 18:56:41,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.17 | bwd: 4632.69 | bwd_inner: 4627.49 | bwd_allreduce: 5.13 | step: 44.27
- 63%|██████▎   | 3659/5800 [10:10:11<4:06:31,  6.91s/it]                                                        {'loss': 0.0103, 'grad_norm': 3.982584238052368, 'learning_rate': 1.26699177376788e-05, 'epoch': 31.54}
- 63%|██████▎   | 3659/5800 [10:10:11<4:06:31,  6.91s/it]score1 tensor([[0.5781],
-        [0.5859],
-        [0.4414],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.6211, 0.4512, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:48,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 18:56:48,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.75 | bwd_microstep: 4628.20 | bwd_inner_microstep: 4623.03 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.34
-[2025-01-25 18:56:48,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.72 | bwd: 4628.23 | bwd_inner: 4623.03 | bwd_allreduce: 5.12 | step: 44.35
- 63%|██████▎   | 3660/5800 [10:10:18<4:06:28,  6.91s/it]                                                        {'loss': 0.0195, 'grad_norm': 8.219254493713379, 'learning_rate': 1.2659527877045087e-05, 'epoch': 31.55}
- 63%|██████▎   | 3660/5800 [10:10:18<4:06:28,  6.91s/it]score1 tensor([[0.4824],
-        [0.6602],
-        [0.5391],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6602, 0.5430, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:56:55,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 18:56:55,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.56 | bwd_microstep: 4583.82 | bwd_inner_microstep: 4578.60 | bwd_allreduce_microstep: 5.09 | step_microstep: 42.49
-[2025-01-25 18:56:55,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4583.84 | bwd_inner: 4578.60 | bwd_allreduce: 5.16 | step: 42.50
- 63%|██████▎   | 3661/5800 [10:10:25<4:05:51,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 5.723668575286865, 'learning_rate': 1.264914030529794e-05, 'epoch': 31.56}
- 63%|██████▎   | 3661/5800 [10:10:25<4:05:51,  6.90s/it]score1 tensor([[0.4570],
-        [0.5078],
-        [0.4980],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4902, 0.5039, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:02,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 18:57:02,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.48 | bwd_microstep: 4593.39 | bwd_inner_microstep: 4588.28 | bwd_allreduce_microstep: 5.01 | step_microstep: 42.30
-[2025-01-25 18:57:02,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.43 | bwd: 4593.43 | bwd_inner: 4588.28 | bwd_allreduce: 5.06 | step: 42.30
- 63%|██████▎   | 3662/5800 [10:10:32<4:05:32,  6.89s/it]                                                        {'loss': 0.0098, 'grad_norm': 1.9397162199020386, 'learning_rate': 1.2638755025676401e-05, 'epoch': 31.57}
- 63%|██████▎   | 3662/5800 [10:10:32<4:05:32,  6.89s/it]score1 tensor([[0.4883],
-        [0.4277],
-        [0.4863],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4160, 0.4805, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:08,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.18 | optimizer_step: 4.36
-[2025-01-25 18:57:08,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.94 | bwd_microstep: 4637.86 | bwd_inner_microstep: 4632.74 | bwd_allreduce_microstep: 5.03 | step_microstep: 43.39
-[2025-01-25 18:57:08,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.90 | bwd: 4637.88 | bwd_inner: 4632.74 | bwd_allreduce: 5.07 | step: 43.40
- 63%|██████▎   | 3663/5800 [10:10:38<4:05:49,  6.90s/it]                                                        {'loss': 0.0151, 'grad_norm': 8.046173095703125, 'learning_rate': 1.2628372041418769e-05, 'epoch': 31.58}
- 63%|██████▎   | 3663/5800 [10:10:38<4:05:49,  6.90s/it]score1 tensor([[0.4824],
-        [0.4668],
-        [0.5195],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4492, 0.4844, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:15,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 18:57:15,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.80 | bwd_microstep: 4634.05 | bwd_inner_microstep: 4629.06 | bwd_allreduce_microstep: 4.91 | step_microstep: 42.37
-[2025-01-25 18:57:15,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.76 | bwd: 4634.08 | bwd_inner: 4629.06 | bwd_allreduce: 4.95 | step: 42.38
- 63%|██████▎   | 3664/5800 [10:10:45<4:05:52,  6.91s/it]                                                        {'loss': 0.0166, 'grad_norm': 7.734158992767334, 'learning_rate': 1.2617991355762644e-05, 'epoch': 31.59}
- 63%|██████▎   | 3664/5800 [10:10:45<4:05:52,  6.91s/it]score1 tensor([[0.4375],
-        [0.5508],
-        [0.6172],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5391, 0.6133, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:22,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:57:22,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.23 | bwd_microstep: 4638.00 | bwd_inner_microstep: 4629.26 | bwd_allreduce_microstep: 8.62 | step_microstep: 43.53
-[2025-01-25 18:57:22,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.20 | bwd: 4638.03 | bwd_inner: 4629.26 | bwd_allreduce: 8.69 | step: 43.53
- 63%|██████▎   | 3665/5800 [10:10:52<4:05:56,  6.91s/it]                                                        {'loss': 0.0156, 'grad_norm': 8.214564323425293, 'learning_rate': 1.2607612971944912e-05, 'epoch': 31.59}
- 63%|██████▎   | 3665/5800 [10:10:52<4:05:56,  6.91s/it]score1 tensor([[0.5781],
-        [0.5781],
-        [0.4668],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5625, 0.4648, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:29,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:57:29,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.04 | bwd_microstep: 4639.82 | bwd_inner_microstep: 4634.01 | bwd_allreduce_microstep: 5.72 | step_microstep: 52.36
-[2025-01-25 18:57:29,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.00 | bwd: 4639.85 | bwd_inner: 4634.01 | bwd_allreduce: 5.77 | step: 52.36
- 63%|██████▎   | 3666/5800 [10:10:59<4:06:00,  6.92s/it]                                                        {'loss': 0.0122, 'grad_norm': 8.234245300292969, 'learning_rate': 1.2597236893201712e-05, 'epoch': 31.6}
- 63%|██████▎   | 3666/5800 [10:10:59<4:06:00,  6.92s/it]score1 tensor([[0.4199],
-        [0.5625],
-        [0.4688],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3887, 0.5703, 0.4707, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:36,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 18:57:36,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.58 | bwd_microstep: 4639.11 | bwd_inner_microstep: 4633.80 | bwd_allreduce_microstep: 5.19 | step_microstep: 44.18
-[2025-01-25 18:57:36,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.53 | bwd: 4639.13 | bwd_inner: 4633.80 | bwd_allreduce: 5.24 | step: 44.18
- 63%|██████▎   | 3667/5800 [10:11:06<4:06:06,  6.92s/it]                                                        {'loss': 0.0122, 'grad_norm': 0.41694873571395874, 'learning_rate': 1.2586863122768507e-05, 'epoch': 31.61}
- 63%|██████▎   | 3667/5800 [10:11:06<4:06:06,  6.92s/it]score1 tensor([[0.5234],
-        [0.5430],
-        [0.5898],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5117, 0.6016, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:43,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 18:57:43,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.30 | bwd_microstep: 4634.34 | bwd_inner_microstep: 4629.36 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.12
-[2025-01-25 18:57:43,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.27 | bwd: 4634.36 | bwd_inner: 4629.36 | bwd_allreduce: 4.93 | step: 44.13
- 63%|██████▎   | 3668/5800 [10:11:13<4:05:57,  6.92s/it]                                                        {'loss': 0.0195, 'grad_norm': 4.115731239318848, 'learning_rate': 1.2576491663880002e-05, 'epoch': 31.62}
- 63%|██████▎   | 3668/5800 [10:11:13<4:05:57,  6.92s/it]score1 tensor([[0.5430],
-        [0.6055],
-        [0.5820],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6133, 0.5547, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:50,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 18:57:50,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.73 | bwd_microstep: 4581.07 | bwd_inner_microstep: 4575.85 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.14
-[2025-01-25 18:57:50,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.69 | bwd: 4581.10 | bwd_inner: 4575.85 | bwd_allreduce: 5.18 | step: 45.15
- 63%|██████▎   | 3669/5800 [10:11:20<4:05:12,  6.90s/it]                                                        {'loss': 0.0122, 'grad_norm': 1.9561238288879395, 'learning_rate': 1.2566122519770215e-05, 'epoch': 31.63}
- 63%|██████▎   | 3669/5800 [10:11:20<4:05:12,  6.90s/it]score1 tensor([[0.6602],
-        [0.3262],
-        [0.4590],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6328, 0.3086, 0.4551, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:57:57,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.37
-[2025-01-25 18:57:57,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.15 | bwd_microstep: 4640.68 | bwd_inner_microstep: 4635.52 | bwd_allreduce_microstep: 5.05 | step_microstep: 43.13
-[2025-01-25 18:57:57,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.11 | bwd: 4640.70 | bwd_inner: 4635.52 | bwd_allreduce: 5.10 | step: 43.13
- 63%|██████▎   | 3670/5800 [10:11:27<4:05:25,  6.91s/it]                                                        {'loss': 0.0132, 'grad_norm': 3.822488784790039, 'learning_rate': 1.2555755693672404e-05, 'epoch': 31.64}
- 63%|██████▎   | 3670/5800 [10:11:27<4:05:25,  6.91s/it]score1 tensor([[0.4668],
-        [0.6250],
-        [0.3809],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6289, 0.3809, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:04,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 18:58:04,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4592.77 | bwd_inner_microstep: 4587.13 | bwd_allreduce_microstep: 5.52 | step_microstep: 45.88
-[2025-01-25 18:58:04,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.79 | bwd: 4592.80 | bwd_inner: 4587.13 | bwd_allreduce: 5.58 | step: 45.89
- 63%|██████▎   | 3671/5800 [10:11:34<4:04:58,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.186594009399414, 'learning_rate': 1.2545391188819145e-05, 'epoch': 31.65}
- 63%|██████▎   | 3671/5800 [10:11:34<4:04:58,  6.90s/it]score1 tensor([[0.5156],
-        [0.4668],
-        [0.4746],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4629, 0.4863, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:11,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 18:58:11,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.51 | bwd_microstep: 4634.80 | bwd_inner_microstep: 4629.54 | bwd_allreduce_microstep: 5.13 | step_microstep: 42.96
-[2025-01-25 18:58:11,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.48 | bwd: 4634.85 | bwd_inner: 4629.54 | bwd_allreduce: 5.18 | step: 42.97
- 63%|██████▎   | 3672/5800 [10:11:41<4:04:55,  6.91s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.8111774921417236, 'learning_rate': 1.2535029008442266e-05, 'epoch': 31.66}
- 63%|██████▎   | 3672/5800 [10:11:41<4:04:55,  6.91s/it]score1 tensor([[0.6211],
-        [0.5859],
-        [0.6055],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5781, 0.6094, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:18,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 18:58:18,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.49 | bwd_microstep: 4629.58 | bwd_inner_microstep: 4624.66 | bwd_allreduce_microstep: 4.84 | step_microstep: 42.73
-[2025-01-25 18:58:18,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.45 | bwd: 4629.61 | bwd_inner: 4624.66 | bwd_allreduce: 4.87 | step: 42.74
- 63%|██████▎   | 3673/5800 [10:11:48<4:04:53,  6.91s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.6041336059570312, 'learning_rate': 1.2524669155772858e-05, 'epoch': 31.66}
- 63%|██████▎   | 3673/5800 [10:11:48<4:04:53,  6.91s/it]score1 tensor([[0.4023],
-        [0.5352],
-        [0.4883],
-        [0.7148]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5547, 0.5352, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:25,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 18:58:25,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.43 | bwd_microstep: 4636.99 | bwd_inner_microstep: 4631.60 | bwd_allreduce_microstep: 5.31 | step_microstep: 49.12
-[2025-01-25 18:58:25,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.39 | bwd: 4637.01 | bwd_inner: 4631.60 | bwd_allreduce: 5.35 | step: 49.13
- 63%|██████▎   | 3674/5800 [10:11:55<4:04:55,  6.91s/it]                                                        {'loss': 0.021, 'grad_norm': 3.3858232498168945, 'learning_rate': 1.2514311634041325e-05, 'epoch': 31.67}
- 63%|██████▎   | 3674/5800 [10:11:55<4:04:55,  6.91s/it]score1 tensor([[0.4688],
-        [0.5195],
-        [0.5977],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5352, 0.6250, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:31,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:58:31,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.57 | bwd_microstep: 4638.02 | bwd_inner_microstep: 4632.68 | bwd_allreduce_microstep: 5.22 | step_microstep: 41.93
-[2025-01-25 18:58:31,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4638.04 | bwd_inner: 4632.68 | bwd_allreduce: 5.28 | step: 41.94
- 63%|██████▎   | 3675/5800 [10:12:01<4:04:51,  6.91s/it]                                                        {'loss': 0.0161, 'grad_norm': 0.708885908126831, 'learning_rate': 1.2503956446477306e-05, 'epoch': 31.68}
- 63%|██████▎   | 3675/5800 [10:12:01<4:04:51,  6.91s/it]score1 tensor([[0.5273],
-        [0.3770],
-        [0.5469],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.3730, 0.5664, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:38,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 18:58:38,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4637.81 | bwd_inner_microstep: 4633.09 | bwd_allreduce_microstep: 4.65 | step_microstep: 44.47
-[2025-01-25 18:58:38,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4637.83 | bwd_inner: 4633.09 | bwd_allreduce: 4.68 | step: 44.48
- 63%|██████▎   | 3676/5800 [10:12:08<4:04:51,  6.92s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.429490089416504, 'learning_rate': 1.2493603596309743e-05, 'epoch': 31.69}
- 63%|██████▎   | 3676/5800 [10:12:08<4:04:51,  6.92s/it]score1 tensor([[0.5508],
-        [0.5508],
-        [0.5859],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5664, 0.5977, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:45,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 18:58:45,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4640.18 | bwd_inner_microstep: 4635.13 | bwd_allreduce_microstep: 4.97 | step_microstep: 41.66
-[2025-01-25 18:58:45,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4640.21 | bwd_inner: 4635.13 | bwd_allreduce: 5.01 | step: 41.66
- 63%|██████▎   | 3677/5800 [10:12:15<4:04:48,  6.92s/it]                                                        {'loss': 0.0142, 'grad_norm': 4.036227226257324, 'learning_rate': 1.2483253086766822e-05, 'epoch': 31.7}
- 63%|██████▎   | 3677/5800 [10:12:15<4:04:48,  6.92s/it]score1 tensor([[0.3906],
-        [0.6641],
-        [0.5273],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.6875, 0.5273, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:52,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 18:58:52,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.14 | bwd_microstep: 4582.83 | bwd_inner_microstep: 4578.12 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.41
-[2025-01-25 18:58:52,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.10 | bwd: 4582.86 | bwd_inner: 4578.12 | bwd_allreduce: 4.67 | step: 42.44
- 63%|██████▎   | 3678/5800 [10:12:22<4:04:15,  6.91s/it]                                                        {'loss': 0.0127, 'grad_norm': 6.093481540679932, 'learning_rate': 1.247290492107601e-05, 'epoch': 31.71}
- 63%|██████▎   | 3678/5800 [10:12:22<4:04:15,  6.91s/it]score1 tensor([[0.5781],
-        [0.4590],
-        [0.5039],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4609, 0.5195, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:58:59,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 18:58:59,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.11 | bwd_microstep: 4576.15 | bwd_inner_microstep: 4571.10 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.35
-[2025-01-25 18:58:59,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4576.17 | bwd_inner: 4571.10 | bwd_allreduce: 5.00 | step: 44.36
- 63%|██████▎   | 3679/5800 [10:12:29<4:03:39,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 5.690494060516357, 'learning_rate': 1.2462559102464057e-05, 'epoch': 31.72}
- 63%|██████▎   | 3679/5800 [10:12:29<4:03:39,  6.89s/it]score1 tensor([[0.4375],
-        [0.4785],
-        [0.6328],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4961, 0.6484, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 18:59:06,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 18:59:06,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4637.60 | bwd_inner_microstep: 4632.72 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.09
-[2025-01-25 18:59:06,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.67 | bwd: 4637.63 | bwd_inner: 4632.72 | bwd_allreduce: 4.84 | step: 44.10
- 63%|██████▎   | 3680/5800 [10:12:36<4:03:51,  6.90s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.391592025756836, 'learning_rate': 1.2452215634156954e-05, 'epoch': 31.72}
- 63%|██████▎   | 3680/5800 [10:12:36<4:03:51,  6.90s/it]evaluate!
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6562]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1562, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1855, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6484]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1172, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6523]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4160]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1816, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0918, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6503903611823723
-PLCC_score: 0.6529828572837242
-KRCC_score: 0.4684891841702203
-SRCC_level: 0.6503903611823723
-PLCC_level: 0.6529828572837242
-KRCC_level: 0.4684891841702203
-score1 tensor([[0.5781],
-        [0.4883],
-        [0.5000],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5156, 0.5078, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:09:15,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.30 | optimizer_step: 4.37
-[2025-01-25 19:09:15,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.64 | bwd_microstep: 4545.86 | bwd_inner_microstep: 4540.64 | bwd_allreduce_microstep: 5.14 | step_microstep: 44.73
-[2025-01-25 19:09:15,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.61 | bwd: 4545.88 | bwd_inner: 4540.63 | bwd_allreduce: 5.17 | step: 44.74
- 63%|██████▎   | 3681/5800 [10:22:45<110:26:45, 187.64s/it]                                                           {'loss': 0.0127, 'grad_norm': 1.8475288152694702, 'learning_rate': 1.2441874519379989e-05, 'epoch': 31.73}
- 63%|██████▎   | 3681/5800 [10:22:45<110:26:45, 187.64s/it]score1 tensor([[0.4414],
-        [0.5039],
-        [0.2061],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4844, 0.1787, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:09:22,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:09:22,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.51 | bwd_microstep: 4577.38 | bwd_inner_microstep: 4572.57 | bwd_allreduce_microstep: 4.74 | step_microstep: 41.76
-[2025-01-25 19:09:22,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.47 | bwd: 4577.41 | bwd_inner: 4572.57 | bwd_allreduce: 4.78 | step: 41.78
- 63%|██████▎   | 3682/5800 [10:22:52<78:28:53, 133.40s/it]                                                           {'loss': 0.0161, 'grad_norm': 7.530764102935791, 'learning_rate': 1.243153576135769e-05, 'epoch': 31.74}
- 63%|██████▎   | 3682/5800 [10:22:52<78:28:53, 133.40s/it]score1 tensor([[0.5234],
-        [0.5312],
-        [0.5547],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4961, 0.5352, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:09:29,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:09:29,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.12 | bwd_microstep: 4588.95 | bwd_inner_microstep: 4584.39 | bwd_allreduce_microstep: 4.46 | step_microstep: 42.07
-[2025-01-25 19:09:29,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.08 | bwd: 4588.98 | bwd_inner: 4584.39 | bwd_allreduce: 4.51 | step: 42.08
- 64%|██████▎   | 3683/5800 [10:22:59<56:07:07, 95.43s/it]                                                          {'loss': 0.022, 'grad_norm': 0.5505063533782959, 'learning_rate': 1.2421199363313866e-05, 'epoch': 31.75}
- 64%|██████▎   | 3683/5800 [10:22:59<56:07:07, 95.43s/it]score1 tensor([[0.5352],
-        [0.3945],
-        [0.3574],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.3789, 0.3340, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:09:36,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.37
-[2025-01-25 19:09:36,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.28 | bwd_microstep: 4584.10 | bwd_inner_microstep: 4579.32 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.92
-[2025-01-25 19:09:36,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.25 | bwd: 4584.12 | bwd_inner: 4579.32 | bwd_allreduce: 4.74 | step: 42.93
- 64%|██████▎   | 3684/5800 [10:23:06<40:28:14, 68.85s/it]                                                         {'loss': 0.0186, 'grad_norm': 7.392186164855957, 'learning_rate': 1.2410865328471589e-05, 'epoch': 31.76}
- 64%|██████▎   | 3684/5800 [10:23:06<40:28:14, 68.85s/it]score1 tensor([[0.4102],
-        [0.5898],
-        [0.5391],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.5703, 0.5039, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:09:43,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 19:09:43,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.08 | bwd_microstep: 4598.99 | bwd_inner_microstep: 4594.28 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.45
-[2025-01-25 19:09:43,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.04 | bwd: 4599.02 | bwd_inner: 4594.28 | bwd_allreduce: 4.67 | step: 43.46
- 64%|██████▎   | 3685/5800 [10:23:13<29:31:32, 50.26s/it]                                                         {'loss': 0.0176, 'grad_norm': 8.393800735473633, 'learning_rate': 1.2400533660053178e-05, 'epoch': 31.77}
- 64%|██████▎   | 3685/5800 [10:23:13<29:31:32, 50.26s/it]score1 tensor([[0.5898],
-        [0.5078],
-        [0.5781],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4961, 0.5391, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:09:50,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 19:09:50,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.28 | bwd_microstep: 4607.13 | bwd_inner_microstep: 4602.74 | bwd_allreduce_microstep: 4.31 | step_microstep: 42.57
-[2025-01-25 19:09:50,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.25 | bwd: 4607.16 | bwd_inner: 4602.74 | bwd_allreduce: 4.36 | step: 42.58
- 64%|██████▎   | 3686/5800 [10:23:20<21:52:06, 37.24s/it]                                                         {'loss': 0.0234, 'grad_norm': 8.428089141845703, 'learning_rate': 1.239020436128024e-05, 'epoch': 31.78}
- 64%|██████▎   | 3686/5800 [10:23:20<21:52:06, 37.24s/it]score1 tensor([[0.4551],
-        [0.5039],
-        [0.5820],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4941, 0.5664, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:09:56,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 19:09:56,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.52 | bwd_microstep: 4604.17 | bwd_inner_microstep: 4599.06 | bwd_allreduce_microstep: 5.02 | step_microstep: 41.80
-[2025-01-25 19:09:56,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.50 | bwd: 4604.20 | bwd_inner: 4599.06 | bwd_allreduce: 5.07 | step: 41.80
- 64%|██████▎   | 3687/5800 [10:23:26<16:30:32, 28.13s/it]                                                         {'loss': 0.0078, 'grad_norm': 7.996181488037109, 'learning_rate': 1.2379877435373615e-05, 'epoch': 31.78}
- 64%|██████▎   | 3687/5800 [10:23:26<16:30:32, 28.13s/it]score1 tensor([[0.5586],
-        [0.3613],
-        [0.5195],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3613, 0.5117, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:03,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 19:10:03,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.80 | bwd_microstep: 4551.94 | bwd_inner_microstep: 4547.10 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.67
-[2025-01-25 19:10:03,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.77 | bwd: 4551.96 | bwd_inner: 4547.10 | bwd_allreduce: 4.79 | step: 43.68
- 64%|██████▎   | 3688/5800 [10:23:33<12:45:01, 21.73s/it]                                                         {'loss': 0.0063, 'grad_norm': 1.8201779127120972, 'learning_rate': 1.2369552885553437e-05, 'epoch': 31.79}
- 64%|██████▎   | 3688/5800 [10:23:33<12:45:01, 21.73s/it]score1 tensor([[0.4824],
-        [0.4473],
-        [0.4785],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4492, 0.4570, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:10,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.28 | optimizer_step: 4.37
-[2025-01-25 19:10:10,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.82 | bwd_microstep: 4600.33 | bwd_inner_microstep: 4595.38 | bwd_allreduce_microstep: 4.85 | step_microstep: 40.82
-[2025-01-25 19:10:10,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.79 | bwd: 4600.35 | bwd_inner: 4595.38 | bwd_allreduce: 4.90 | step: 40.83
- 64%|██████▎   | 3689/5800 [10:23:40<10:07:43, 17.27s/it]                                                         {'loss': 0.0137, 'grad_norm': 4.080055236816406, 'learning_rate': 1.2359230715039068e-05, 'epoch': 31.8}
- 64%|██████▎   | 3689/5800 [10:23:40<10:07:43, 17.27s/it]score1 tensor([[0.5742],
-        [0.5273],
-        [0.5508],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5195, 0.5391, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:17,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:10:17,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.98 | bwd_microstep: 4607.18 | bwd_inner_microstep: 4602.61 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.66
-[2025-01-25 19:10:17,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.94 | bwd: 4607.21 | bwd_inner: 4602.61 | bwd_allreduce: 4.53 | step: 42.66
- 64%|██████▎   | 3690/5800 [10:23:47<8:17:42, 14.15s/it]                                                         {'loss': 0.0103, 'grad_norm': 4.004147529602051, 'learning_rate': 1.2348910927049133e-05, 'epoch': 31.81}
- 64%|██████▎   | 3690/5800 [10:23:47<8:17:42, 14.15s/it]score1 tensor([[0.4316],
-        [0.4785],
-        [0.5352],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5039, 0.5430, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:24,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:10:24,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.25 | bwd_microstep: 4602.05 | bwd_inner_microstep: 4597.30 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.28
-[2025-01-25 19:10:24,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.23 | bwd: 4602.08 | bwd_inner: 4597.30 | bwd_allreduce: 4.71 | step: 42.29
- 64%|██████▎   | 3691/5800 [10:23:54<7:00:43, 11.97s/it]                                                        {'loss': 0.0137, 'grad_norm': 8.108399391174316, 'learning_rate': 1.2338593524801534e-05, 'epoch': 31.82}
- 64%|██████▎   | 3691/5800 [10:23:54<7:00:43, 11.97s/it]score1 tensor([[0.4043],
-        [0.4141],
-        [0.5000],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4297, 0.5039, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:31,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 19:10:31,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.21 | bwd_microstep: 4615.21 | bwd_inner_microstep: 4610.66 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.05
-[2025-01-25 19:10:31,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.19 | bwd: 4615.28 | bwd_inner: 4610.66 | bwd_allreduce: 4.50 | step: 41.06
- 64%|██████▎   | 3692/5800 [10:24:01<6:06:54, 10.44s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.3508996963500977, 'learning_rate': 1.2328278511513407e-05, 'epoch': 31.83}
- 64%|██████▎   | 3692/5800 [10:24:01<6:06:54, 10.44s/it]score1 tensor([[0.4727],
-        [0.5430],
-        [0.4336],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6055, 0.4473, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:38,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 19:10:38,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.12 | bwd_microstep: 4558.85 | bwd_inner_microstep: 4553.93 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.03
-[2025-01-25 19:10:38,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.08 | bwd: 4558.88 | bwd_inner: 4553.93 | bwd_allreduce: 4.88 | step: 43.03
- 64%|██████▎   | 3693/5800 [10:24:08<5:28:38,  9.36s/it]                                                        {'loss': 0.0215, 'grad_norm': 5.680731773376465, 'learning_rate': 1.2317965890401162e-05, 'epoch': 31.84}
- 64%|██████▎   | 3693/5800 [10:24:08<5:28:38,  9.36s/it]score1 tensor([[0.4941],
-        [0.5977],
-        [0.6211],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6055, 0.6406, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:44,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 19:10:44,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.17 | bwd_microstep: 4560.91 | bwd_inner_microstep: 4556.40 | bwd_allreduce_microstep: 4.41 | step_microstep: 41.60
-[2025-01-25 19:10:44,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.12 | bwd: 4560.94 | bwd_inner: 4556.40 | bwd_allreduce: 4.46 | step: 41.61
- 64%|██████▎   | 3694/5800 [10:24:14<5:01:54,  8.60s/it]                                                        {'loss': 0.0146, 'grad_norm': 6.890247821807861, 'learning_rate': 1.230765566468044e-05, 'epoch': 31.84}
- 64%|██████▎   | 3694/5800 [10:24:14<5:01:54,  8.60s/it]score1 tensor([[0.5273],
-        [0.4180],
-        [0.4512],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4258, 0.4844, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:51,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 19:10:51,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.04 | bwd_microstep: 4617.15 | bwd_inner_microstep: 4612.11 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.11
-[2025-01-25 19:10:51,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.01 | bwd: 4617.18 | bwd_inner: 4612.11 | bwd_allreduce: 5.00 | step: 46.12
- 64%|██████▎   | 3695/5800 [10:24:21<4:43:46,  8.09s/it]                                                        {'loss': 0.0142, 'grad_norm': 0.7031601071357727, 'learning_rate': 1.2297347837566163e-05, 'epoch': 31.85}
- 64%|██████▎   | 3695/5800 [10:24:21<4:43:46,  8.09s/it]score1 tensor([[0.4141],
-        [0.3711],
-        [0.4102],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.3750, 0.4277, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:10:58,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 19:10:58,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.50 | bwd_microstep: 4617.31 | bwd_inner_microstep: 4612.51 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.10
-[2025-01-25 19:10:58,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.47 | bwd: 4617.33 | bwd_inner: 4612.51 | bwd_allreduce: 4.75 | step: 42.11
- 64%|██████▎   | 3696/5800 [10:24:28<4:31:03,  7.73s/it]                                                        {'loss': 0.0146, 'grad_norm': 7.180191516876221, 'learning_rate': 1.2287042412272482e-05, 'epoch': 31.86}
- 64%|██████▎   | 3696/5800 [10:24:28<4:31:03,  7.73s/it]score1 tensor([[0.5078],
-        [0.4707],
-        [0.6250],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4785, 0.6367, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:05,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 19:11:05,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.21 | bwd_microstep: 4625.76 | bwd_inner_microstep: 4621.16 | bwd_allreduce_microstep: 4.53 | step_microstep: 45.63
-[2025-01-25 19:11:05,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.17 | bwd: 4625.79 | bwd_inner: 4621.16 | bwd_allreduce: 4.57 | step: 45.63
- 64%|██████▎   | 3697/5800 [10:24:35<4:22:14,  7.48s/it]                                                        {'loss': 0.0068, 'grad_norm': 8.144054412841797, 'learning_rate': 1.2276739392012805e-05, 'epoch': 31.87}
- 64%|██████▎   | 3697/5800 [10:24:35<4:22:14,  7.48s/it]score1 tensor([[0.4219],
-        [0.4043],
-        [0.4395],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4160, 0.4414, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:12,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 19:11:12,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.07 | bwd_microstep: 4634.92 | bwd_inner_microstep: 4630.44 | bwd_allreduce_microstep: 4.40 | step_microstep: 42.26
-[2025-01-25 19:11:12,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.04 | bwd: 4634.94 | bwd_inner: 4630.44 | bwd_allreduce: 4.44 | step: 42.27
- 64%|██████▍   | 3698/5800 [10:24:42<4:16:05,  7.31s/it]                                                        {'loss': 0.0112, 'grad_norm': 7.606608867645264, 'learning_rate': 1.2266438779999797e-05, 'epoch': 31.88}
- 64%|██████▍   | 3698/5800 [10:24:42<4:16:05,  7.31s/it]score1 tensor([[0.5234],
-        [0.5469],
-        [0.4922],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5469, 0.4922, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:19,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 19:11:19,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.03 | bwd_microstep: 4537.09 | bwd_inner_microstep: 4532.20 | bwd_allreduce_microstep: 4.81 | step_microstep: 41.28
-[2025-01-25 19:11:19,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.00 | bwd: 4537.12 | bwd_inner: 4532.20 | bwd_allreduce: 4.85 | step: 41.29
- 64%|██████▍   | 3699/5800 [10:24:49<4:10:43,  7.16s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.505207061767578, 'learning_rate': 1.2256140579445363e-05, 'epoch': 31.89}
- 64%|██████▍   | 3699/5800 [10:24:49<4:10:43,  7.16s/it]score1 tensor([[0.5430],
-        [0.4844],
-        [0.4160],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4922, 0.4238, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:26,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.25 | optimizer_step: 4.37
-[2025-01-25 19:11:26,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4639.24 | bwd_inner_microstep: 4634.21 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.49
-[2025-01-25 19:11:26,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.27 | bwd: 4639.26 | bwd_inner: 4634.21 | bwd_allreduce: 4.99 | step: 42.49
- 64%|██████▍   | 3700/5800 [10:24:56<4:08:07,  7.09s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.548354148864746, 'learning_rate': 1.2245844793560666e-05, 'epoch': 31.9}
- 64%|██████▍   | 3700/5800 [10:24:56<4:08:07,  7.09s/it]score1 tensor([[0.4766],
-        [0.4492],
-        [0.6680],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4375, 0.6406, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:33,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 19:11:33,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4635.77 | bwd_inner_microstep: 4630.89 | bwd_allreduce_microstep: 4.81 | step_microstep: 47.75
-[2025-01-25 19:11:33,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4635.79 | bwd_inner: 4630.89 | bwd_allreduce: 4.84 | step: 47.76
- 64%|██████▍   | 3701/5800 [10:25:03<4:06:12,  7.04s/it]                                                        {'loss': 0.0176, 'grad_norm': 8.040803909301758, 'learning_rate': 1.2235551425556112e-05, 'epoch': 31.91}
- 64%|██████▍   | 3701/5800 [10:25:03<4:06:12,  7.04s/it]score1 tensor([[0.4941],
-        [0.5117],
-        [0.4688],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5078, 0.4609, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:40,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 19:11:40,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.42 | bwd_microstep: 4637.45 | bwd_inner_microstep: 4632.93 | bwd_allreduce_microstep: 4.43 | step_microstep: 42.10
-[2025-01-25 19:11:40,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.37 | bwd: 4637.48 | bwd_inner: 4632.93 | bwd_allreduce: 4.48 | step: 42.11
- 64%|██████▍   | 3702/5800 [10:25:10<4:04:46,  7.00s/it]                                                        {'loss': 0.0122, 'grad_norm': 3.9781177043914795, 'learning_rate': 1.2225260478641343e-05, 'epoch': 31.91}
- 64%|██████▍   | 3702/5800 [10:25:10<4:04:46,  7.00s/it]score1 tensor([[0.4453],
-        [0.5000],
-        [0.5352],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4980, 0.5430, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:46,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 19:11:46,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.63 | bwd_microstep: 4632.53 | bwd_inner_microstep: 4627.84 | bwd_allreduce_microstep: 4.62 | step_microstep: 40.67
-[2025-01-25 19:11:46,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.60 | bwd: 4632.56 | bwd_inner: 4627.84 | bwd_allreduce: 4.65 | step: 40.67
- 64%|██████▍   | 3703/5800 [10:25:16<4:03:42,  6.97s/it]                                                        {'loss': 0.0039, 'grad_norm': 3.672635078430176, 'learning_rate': 1.221497195602527e-05, 'epoch': 31.92}
- 64%|██████▍   | 3703/5800 [10:25:16<4:03:42,  6.97s/it]score1 tensor([[0.4414],
-        [0.5117],
-        [0.4375],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4785, 0.4180, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0278, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:11:53,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:11:53,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4636.49 | bwd_inner_microstep: 4631.59 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.05
-[2025-01-25 19:11:53,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.69 | bwd: 4636.52 | bwd_inner: 4631.59 | bwd_allreduce: 4.85 | step: 42.05
- 64%|██████▍   | 3704/5800 [10:25:23<4:03:02,  6.96s/it]                                                        {'loss': 0.0278, 'grad_norm': 7.722806453704834, 'learning_rate': 1.220468586091601e-05, 'epoch': 31.93}
- 64%|██████▍   | 3704/5800 [10:25:23<4:03:02,  6.96s/it]score1 tensor([[0.4141],
-        [0.4961],
-        [0.5039],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4844, 0.4805, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:00,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 19:12:00,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.83 | bwd_microstep: 4643.08 | bwd_inner_microstep: 4636.60 | bwd_allreduce_microstep: 6.40 | step_microstep: 41.27
-[2025-01-25 19:12:00,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.80 | bwd: 4643.10 | bwd_inner: 4636.60 | bwd_allreduce: 6.44 | step: 41.27
- 64%|██████▍   | 3705/5800 [10:25:30<4:02:32,  6.95s/it]                                                        {'loss': 0.0171, 'grad_norm': 8.065499305725098, 'learning_rate': 1.2194402196520971e-05, 'epoch': 31.94}
- 64%|██████▍   | 3705/5800 [10:25:30<4:02:32,  6.95s/it]score1 tensor([[0.4746],
-        [0.6484],
-        [0.6250],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.6172, 0.6211, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:07,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 19:12:07,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.24 | bwd_microstep: 4635.26 | bwd_inner_microstep: 4629.94 | bwd_allreduce_microstep: 5.20 | step_microstep: 45.81
-[2025-01-25 19:12:07,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.22 | bwd: 4635.28 | bwd_inner: 4629.94 | bwd_allreduce: 5.26 | step: 45.82
- 64%|██████▍   | 3706/5800 [10:25:37<4:02:07,  6.94s/it]                                                        {'loss': 0.0156, 'grad_norm': 8.800084114074707, 'learning_rate': 1.2184120966046755e-05, 'epoch': 31.95}
- 64%|██████▍   | 3706/5800 [10:25:37<4:02:07,  6.94s/it]score1 tensor([[0.3555],
-        [0.5078],
-        [0.6016],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.5000, 0.5625, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0269, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:15,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 19:12:15,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.51 | bwd_microstep: 4637.27 | bwd_inner_microstep: 4632.64 | bwd_allreduce_microstep: 4.55 | step_microstep: 540.36
-[2025-01-25 19:12:15,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.47 | bwd: 4637.29 | bwd_inner: 4632.64 | bwd_allreduce: 4.58 | step: 540.37
- 64%|██████▍   | 3707/5800 [10:25:45<4:06:57,  7.08s/it]                                                        {'loss': 0.0269, 'grad_norm': 8.013275146484375, 'learning_rate': 1.2173842172699249e-05, 'epoch': 31.96}
- 64%|██████▍   | 3707/5800 [10:25:45<4:06:57,  7.08s/it]score1 tensor([[0.6055],
-        [0.4883],
-        [0.6367],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4668, 0.6094, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:22,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 19:12:22,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4640.78 | bwd_inner_microstep: 4634.71 | bwd_allreduce_microstep: 5.99 | step_microstep: 47.42
-[2025-01-25 19:12:22,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4640.80 | bwd_inner: 4634.71 | bwd_allreduce: 6.03 | step: 47.43
- 64%|██████▍   | 3708/5800 [10:25:52<4:05:11,  7.03s/it]                                                        {'loss': 0.02, 'grad_norm': 8.767982482910156, 'learning_rate': 1.2163565819683547e-05, 'epoch': 31.97}
- 64%|██████▍   | 3708/5800 [10:25:52<4:05:11,  7.03s/it]score1 tensor([[0.6680],
-        [0.5430],
-        [0.5781],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5312, 0.5508, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:28,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.37
-[2025-01-25 19:12:28,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.15 | bwd_microstep: 4634.57 | bwd_inner_microstep: 4629.64 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.27
-[2025-01-25 19:12:28,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.12 | bwd: 4634.60 | bwd_inner: 4629.64 | bwd_allreduce: 4.87 | step: 42.28
- 64%|██████▍   | 3709/5800 [10:25:58<4:03:50,  7.00s/it]                                                        {'loss': 0.0229, 'grad_norm': 8.769231796264648, 'learning_rate': 1.215329191020399e-05, 'epoch': 31.97}
- 64%|██████▍   | 3709/5800 [10:25:58<4:03:50,  7.00s/it]score1 tensor([[0.5430],
-        [0.6367],
-        [0.4395],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.6094, 0.4180, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:35,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 19:12:35,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.33 | bwd_microstep: 4633.49 | bwd_inner_microstep: 4628.44 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.70
-[2025-01-25 19:12:35,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.30 | bwd: 4633.52 | bwd_inner: 4628.44 | bwd_allreduce: 4.99 | step: 43.72
- 64%|██████▍   | 3710/5800 [10:26:05<4:02:49,  6.97s/it]                                                        {'loss': 0.0215, 'grad_norm': 8.304008483886719, 'learning_rate': 1.2143020447464177e-05, 'epoch': 31.98}
- 64%|██████▍   | 3710/5800 [10:26:05<4:02:49,  6.97s/it]score1 tensor([[0.6055],
-        [0.6914],
-        [0.5547],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.6562, 0.5586, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:42,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 19:12:42,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.48 | bwd_microstep: 4638.04 | bwd_inner_microstep: 4633.05 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.16
-[2025-01-25 19:12:42,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.45 | bwd: 4638.06 | bwd_inner: 4633.05 | bwd_allreduce: 4.95 | step: 43.17
- 64%|██████▍   | 3711/5800 [10:26:12<4:02:02,  6.95s/it]                                                        {'loss': 0.0117, 'grad_norm': 0.8243961334228516, 'learning_rate': 1.213275143466691e-05, 'epoch': 31.99}
- 64%|██████▍   | 3711/5800 [10:26:12<4:02:02,  6.95s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:12:47,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:12:47,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 573.61 | bwd_microstep: 1222.29 | bwd_inner_microstep: 1217.59 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.94
-[2025-01-25 19:12:47,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 573.57 | bwd: 1222.32 | bwd_inner: 1217.59 | bwd_allreduce: 4.67 | step: 42.96
- 64%|██████▍   | 3712/5800 [10:26:17<3:33:51,  6.15s/it]                                                        {'loss': 0.002, 'grad_norm': 7.757216930389404, 'learning_rate': 1.2122484875014261e-05, 'epoch': 32.0}
- 64%|██████▍   | 3712/5800 [10:26:17<3:33:51,  6.15s/it][2025-01-25 19:12:51,648] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:13:01,784] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:13:11,216] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:13:20,722] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5430],
-        [0.3945],
-        [0.4258],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4004, 0.4277, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:13:36,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 10.10 | optimizer_step: 4.36
-[2025-01-25 19:13:36,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.27 | bwd_microstep: 4574.93 | bwd_inner_microstep: 4568.97 | bwd_allreduce_microstep: 5.86 | step_microstep: 52.09
-[2025-01-25 19:13:36,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.24 | bwd: 4574.96 | bwd_inner: 4568.97 | bwd_allreduce: 5.91 | step: 52.10
- 64%|██████▍   | 3713/5800 [10:27:06<11:07:00, 19.18s/it]                                                         {'loss': 0.0107, 'grad_norm': 7.463339805603027, 'learning_rate': 1.2112220771707505e-05, 'epoch': 32.01}
- 64%|██████▍   | 3713/5800 [10:27:06<11:07:00, 19.18s/it]score1 tensor([[0.4824],
-        [0.6211],
-        [0.4590],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.6445, 0.4883, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:13:43,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 19:13:43,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.71 | bwd_microstep: 4583.42 | bwd_inner_microstep: 4578.34 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.84
-[2025-01-25 19:13:43,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.68 | bwd: 4583.44 | bwd_inner: 4578.34 | bwd_allreduce: 5.03 | step: 44.84
- 64%|██████▍   | 3714/5800 [10:27:13<8:58:02, 15.48s/it]                                                         {'loss': 0.0186, 'grad_norm': 8.14030647277832, 'learning_rate': 1.210195912794719e-05, 'epoch': 32.02}
- 64%|██████▍   | 3714/5800 [10:27:13<8:58:02, 15.48s/it]score1 tensor([[0.5664],
-        [0.5391],
-        [0.4902],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5586, 0.4844, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:13:50,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 19:13:50,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.01 | bwd_microstep: 4596.28 | bwd_inner_microstep: 4591.28 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.90
-[2025-01-25 19:13:50,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.93 | bwd: 4596.30 | bwd_inner: 4591.28 | bwd_allreduce: 4.96 | step: 44.92
- 64%|██████▍   | 3715/5800 [10:27:20<7:28:03, 12.89s/it]                                                        {'loss': 0.0239, 'grad_norm': 4.383686542510986, 'learning_rate': 1.2091699946933055e-05, 'epoch': 32.03}
- 64%|██████▍   | 3715/5800 [10:27:20<7:28:03, 12.89s/it]score1 tensor([[0.5508],
-        [0.4531],
-        [0.4863],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4688, 0.5117, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:13:57,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 19:13:57,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.61 | bwd_microstep: 4604.87 | bwd_inner_microstep: 4600.28 | bwd_allreduce_microstep: 4.51 | step_microstep: 46.55
-[2025-01-25 19:13:57,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.57 | bwd: 4604.90 | bwd_inner: 4600.28 | bwd_allreduce: 4.55 | step: 46.56
- 64%|██████▍   | 3716/5800 [10:27:27<6:25:12, 11.09s/it]                                                        {'loss': 0.019, 'grad_norm': 8.052835464477539, 'learning_rate': 1.2081443231864099e-05, 'epoch': 32.03}
- 64%|██████▍   | 3716/5800 [10:27:27<6:25:12, 11.09s/it]score1 tensor([[0.4512],
-        [0.4648],
-        [0.4922],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4648, 0.5156, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:04,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 19:14:04,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.27 | bwd_microstep: 4552.45 | bwd_inner_microstep: 4547.18 | bwd_allreduce_microstep: 5.17 | step_microstep: 43.22
-[2025-01-25 19:14:04,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.24 | bwd: 4552.48 | bwd_inner: 4547.18 | bwd_allreduce: 5.22 | step: 43.23
- 64%|██████▍   | 3717/5800 [10:27:34<5:40:50,  9.82s/it]                                                        {'loss': 0.0151, 'grad_norm': 6.109273433685303, 'learning_rate': 1.207118898593854e-05, 'epoch': 32.04}
- 64%|██████▍   | 3717/5800 [10:27:34<5:40:50,  9.82s/it]score1 tensor([[0.5391],
-        [0.5742],
-        [0.5781],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6016, 0.6133, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:10,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:14:10,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.47 | bwd_microstep: 4600.32 | bwd_inner_microstep: 4595.53 | bwd_allreduce_microstep: 4.68 | step_microstep: 46.43
-[2025-01-25 19:14:10,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.42 | bwd: 4600.35 | bwd_inner: 4595.53 | bwd_allreduce: 4.74 | step: 46.44
- 64%|██████▍   | 3718/5800 [10:27:40<5:10:08,  8.94s/it]                                                        {'loss': 0.0259, 'grad_norm': 8.333703994750977, 'learning_rate': 1.2060937212353834e-05, 'epoch': 32.05}
- 64%|██████▍   | 3718/5800 [10:27:40<5:10:08,  8.94s/it]score1 tensor([[0.6445],
-        [0.5195],
-        [0.5977],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5312, 0.6016, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:17,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 19:14:17,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.31 | bwd_microstep: 4592.18 | bwd_inner_microstep: 4587.56 | bwd_allreduce_microstep: 4.53 | step_microstep: 42.82
-[2025-01-25 19:14:17,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.27 | bwd: 4592.21 | bwd_inner: 4587.56 | bwd_allreduce: 4.58 | step: 42.83
- 64%|██████▍   | 3719/5800 [10:27:47<4:48:21,  8.31s/it]                                                        {'loss': 0.0078, 'grad_norm': 8.671680450439453, 'learning_rate': 1.2050687914306658e-05, 'epoch': 32.06}
- 64%|██████▍   | 3719/5800 [10:27:47<4:48:21,  8.31s/it]score1 tensor([[0.6016],
-        [0.4707],
-        [0.4668],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4570, 0.4414, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:24,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 19:14:24,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.61 | bwd_microstep: 4593.47 | bwd_inner_microstep: 4588.24 | bwd_allreduce_microstep: 5.03 | step_microstep: 42.81
-[2025-01-25 19:14:24,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.58 | bwd: 4593.50 | bwd_inner: 4588.24 | bwd_allreduce: 5.11 | step: 42.82
- 64%|██████▍   | 3720/5800 [10:27:54<4:33:12,  7.88s/it]                                                        {'loss': 0.0112, 'grad_norm': 7.929393291473389, 'learning_rate': 1.204044109499293e-05, 'epoch': 32.07}
- 64%|██████▍   | 3720/5800 [10:27:54<4:33:12,  7.88s/it]score1 tensor([[0.5117],
-        [0.3906],
-        [0.5000],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.3750, 0.5156, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:31,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 19:14:31,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.34 | bwd_microstep: 4601.65 | bwd_inner_microstep: 4596.41 | bwd_allreduce_microstep: 5.13 | step_microstep: 53.13
-[2025-01-25 19:14:31,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.31 | bwd: 4601.67 | bwd_inner: 4596.41 | bwd_allreduce: 5.18 | step: 53.14
- 64%|██████▍   | 3721/5800 [10:28:01<4:22:47,  7.58s/it]                                                        {'loss': 0.0156, 'grad_norm': 0.5050615072250366, 'learning_rate': 1.2030196757607769e-05, 'epoch': 32.08}
- 64%|██████▍   | 3721/5800 [10:28:01<4:22:47,  7.58s/it]score1 tensor([[0.4648],
-        [0.5508],
-        [0.5469],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5586, 0.5430, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:38,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 19:14:38,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.45 | bwd_microstep: 4601.71 | bwd_inner_microstep: 4597.10 | bwd_allreduce_microstep: 4.52 | step_microstep: 43.55
-[2025-01-25 19:14:38,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.42 | bwd: 4601.73 | bwd_inner: 4597.10 | bwd_allreduce: 4.56 | step: 43.57
- 64%|██████▍   | 3722/5800 [10:28:08<4:15:16,  7.37s/it]                                                        {'loss': 0.0093, 'grad_norm': 3.846342086791992, 'learning_rate': 1.2019954905345554e-05, 'epoch': 32.09}
- 64%|██████▍   | 3722/5800 [10:28:08<4:15:16,  7.37s/it]score1 tensor([[0.5898],
-        [0.5742],
-        [0.5508],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5742, 0.5664, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:45,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 19:14:45,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.02 | bwd_microstep: 4551.48 | bwd_inner_microstep: 4546.18 | bwd_allreduce_microstep: 5.19 | step_microstep: 44.72
-[2025-01-25 19:14:45,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.99 | bwd: 4551.50 | bwd_inner: 4546.18 | bwd_allreduce: 5.24 | step: 44.75
- 64%|██████▍   | 3723/5800 [10:28:15<4:09:29,  7.21s/it]                                                        {'loss': 0.0137, 'grad_norm': 6.319055557250977, 'learning_rate': 1.200971554139986e-05, 'epoch': 32.09}
- 64%|██████▍   | 3723/5800 [10:28:15<4:09:29,  7.21s/it]score1 tensor([[0.5664],
-        [0.3906],
-        [0.5781],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3750, 0.5898, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:52,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 19:14:52,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.50 | bwd_microstep: 4555.27 | bwd_inner_microstep: 4550.28 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.35
-[2025-01-25 19:14:52,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.47 | bwd: 4555.30 | bwd_inner: 4550.28 | bwd_allreduce: 4.95 | step: 43.35
- 64%|██████▍   | 3724/5800 [10:28:22<4:05:27,  7.09s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.6278488636016846, 'learning_rate': 1.1999478668963509e-05, 'epoch': 32.1}
- 64%|██████▍   | 3724/5800 [10:28:22<4:05:27,  7.09s/it]score1 tensor([[0.6211],
-        [0.4473],
-        [0.4180],
-        [0.3438]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4180, 0.4004, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:14:58,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 19:14:58,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.83 | bwd_microstep: 4606.74 | bwd_inner_microstep: 4601.97 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.19
-[2025-01-25 19:14:58,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.79 | bwd: 4606.76 | bwd_inner: 4601.97 | bwd_allreduce: 4.70 | step: 43.19
- 64%|██████▍   | 3725/5800 [10:28:28<4:03:00,  7.03s/it]                                                        {'loss': 0.0181, 'grad_norm': 7.615180015563965, 'learning_rate': 1.1989244291228524e-05, 'epoch': 32.11}
- 64%|██████▍   | 3725/5800 [10:28:28<4:03:00,  7.03s/it]score1 tensor([[0.4941],
-        [0.4883],
-        [0.4336],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4941, 0.4141, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:05,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 19:15:05,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.03 | bwd_microstep: 4605.04 | bwd_inner_microstep: 4600.36 | bwd_allreduce_microstep: 4.59 | step_microstep: 45.57
-[2025-01-25 19:15:05,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.99 | bwd: 4605.08 | bwd_inner: 4600.36 | bwd_allreduce: 4.63 | step: 45.58
- 64%|██████▍   | 3726/5800 [10:28:35<4:01:25,  6.98s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.8376646041870117, 'learning_rate': 1.1979012411386175e-05, 'epoch': 32.12}
- 64%|██████▍   | 3726/5800 [10:28:35<4:01:25,  6.98s/it]score1 tensor([[0.3789],
-        [0.3555],
-        [0.5078],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.3398, 0.5039, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:12,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 19:15:12,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.74 | bwd_microstep: 4553.32 | bwd_inner_microstep: 4548.48 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.66
-[2025-01-25 19:15:12,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.70 | bwd: 4553.35 | bwd_inner: 4548.48 | bwd_allreduce: 4.77 | step: 41.66
- 64%|██████▍   | 3727/5800 [10:28:42<3:59:42,  6.94s/it]                                                        {'loss': 0.0068, 'grad_norm': 5.879308700561523, 'learning_rate': 1.1968783032626931e-05, 'epoch': 32.13}
- 64%|██████▍   | 3727/5800 [10:28:42<3:59:42,  6.94s/it]score1 tensor([[0.4336],
-        [0.3750],
-        [0.4883],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.3789, 0.4922, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:19,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 19:15:19,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.84 | bwd_microstep: 4606.97 | bwd_inner_microstep: 4602.52 | bwd_allreduce_microstep: 4.36 | step_microstep: 40.78
-[2025-01-25 19:15:19,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.80 | bwd: 4607.00 | bwd_inner: 4602.52 | bwd_allreduce: 4.41 | step: 40.79
- 64%|██████▍   | 3728/5800 [10:28:49<3:58:56,  6.92s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.4755003452301025, 'learning_rate': 1.1958556158140486e-05, 'epoch': 32.14}
- 64%|██████▍   | 3728/5800 [10:28:49<3:58:56,  6.92s/it]score1 tensor([[0.4434],
-        [0.6055],
-        [0.5547],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.6055, 0.5391, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:26,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 19:15:26,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.18 | bwd_microstep: 4562.16 | bwd_inner_microstep: 4556.73 | bwd_allreduce_microstep: 5.30 | step_microstep: 43.75
-[2025-01-25 19:15:26,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.16 | bwd: 4562.19 | bwd_inner: 4556.73 | bwd_allreduce: 5.36 | step: 43.76
- 64%|██████▍   | 3729/5800 [10:28:56<3:58:03,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.232603073120117, 'learning_rate': 1.1948331791115768e-05, 'epoch': 32.15}
- 64%|██████▍   | 3729/5800 [10:28:56<3:58:03,  6.90s/it]score1 tensor([[0.5625],
-        [0.4805],
-        [0.5195],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5000, 0.5391, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:33,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 19:15:33,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.17 | bwd_microstep: 4620.04 | bwd_inner_microstep: 4614.37 | bwd_allreduce_microstep: 5.50 | step_microstep: 44.36
-[2025-01-25 19:15:33,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.14 | bwd: 4620.07 | bwd_inner: 4614.37 | bwd_allreduce: 5.56 | step: 44.37
- 64%|██████▍   | 3730/5800 [10:29:03<3:57:56,  6.90s/it]                                                        {'loss': 0.0137, 'grad_norm': 4.507059574127197, 'learning_rate': 1.1938109934740898e-05, 'epoch': 32.16}
- 64%|██████▍   | 3730/5800 [10:29:03<3:57:56,  6.90s/it]score1 tensor([[0.6172],
-        [0.4746],
-        [0.5469],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4785, 0.5664, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:40,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 19:15:40,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.27 | bwd_microstep: 4616.68 | bwd_inner_microstep: 4611.36 | bwd_allreduce_microstep: 5.24 | step_microstep: 41.30
-[2025-01-25 19:15:40,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.23 | bwd: 4616.71 | bwd_inner: 4611.36 | bwd_allreduce: 5.28 | step: 41.31
- 64%|██████▍   | 3731/5800 [10:29:10<3:57:48,  6.90s/it]                                                        {'loss': 0.0127, 'grad_norm': 4.363120079040527, 'learning_rate': 1.1927890592203247e-05, 'epoch': 32.16}
- 64%|██████▍   | 3731/5800 [10:29:10<3:57:48,  6.90s/it]score1 tensor([[0.4707],
-        [0.5742],
-        [0.5977],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5938, 0.6133, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:47,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 19:15:47,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.55 | bwd_microstep: 4611.61 | bwd_inner_microstep: 4606.93 | bwd_allreduce_microstep: 4.61 | step_microstep: 44.49
-[2025-01-25 19:15:47,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.52 | bwd: 4611.64 | bwd_inner: 4606.93 | bwd_allreduce: 4.65 | step: 44.51
- 64%|██████▍   | 3732/5800 [10:29:17<3:57:45,  6.90s/it]                                                        {'loss': 0.0151, 'grad_norm': 4.552183628082275, 'learning_rate': 1.1917673766689362e-05, 'epoch': 32.17}
- 64%|██████▍   | 3732/5800 [10:29:17<3:57:45,  6.90s/it]score1 tensor([[0.5000],
-        [0.5000],
-        [0.5664],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5195, 0.5664, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:15:53,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 19:15:53,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.75 | bwd_microstep: 4572.83 | bwd_inner_microstep: 4567.63 | bwd_allreduce_microstep: 5.10 | step_microstep: 46.20
-[2025-01-25 19:15:53,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.72 | bwd: 4572.86 | bwd_inner: 4567.63 | bwd_allreduce: 5.15 | step: 46.21
- 64%|██████▍   | 3733/5800 [10:29:23<3:57:16,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.140822410583496, 'learning_rate': 1.1907459461385047e-05, 'epoch': 32.18}
- 64%|██████▍   | 3733/5800 [10:29:23<3:57:16,  6.89s/it]score1 tensor([[0.4082],
-        [0.4570],
-        [0.5469],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4551, 0.5195, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:00,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 19:16:00,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.04 | bwd_microstep: 4610.68 | bwd_inner_microstep: 4605.85 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.99
-[2025-01-25 19:16:00,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.01 | bwd: 4610.70 | bwd_inner: 4605.85 | bwd_allreduce: 4.79 | step: 43.00
- 64%|██████▍   | 3734/5800 [10:29:30<3:57:14,  6.89s/it]                                                        {'loss': 0.0117, 'grad_norm': 0.49373480677604675, 'learning_rate': 1.1897247679475284e-05, 'epoch': 32.19}
- 64%|██████▍   | 3734/5800 [10:29:30<3:57:14,  6.89s/it]score1 tensor([[0.5391],
-        [0.6133],
-        [0.5508],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6484, 0.5625, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:07,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 19:16:07,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.63 | bwd_microstep: 4610.72 | bwd_inner_microstep: 4605.64 | bwd_allreduce_microstep: 4.97 | step_microstep: 41.88
-[2025-01-25 19:16:07,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.59 | bwd: 4610.74 | bwd_inner: 4605.64 | bwd_allreduce: 5.02 | step: 41.89
- 64%|██████▍   | 3735/5800 [10:29:37<3:57:09,  6.89s/it]                                                        {'loss': 0.0137, 'grad_norm': 0.7577407956123352, 'learning_rate': 1.1887038424144288e-05, 'epoch': 32.2}
- 64%|██████▍   | 3735/5800 [10:29:37<3:57:09,  6.89s/it]score1 tensor([[0.6523],
-        [0.4355],
-        [0.4961],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4180, 0.4922, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:14,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.36
-[2025-01-25 19:16:14,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.64 | bwd_microstep: 4621.40 | bwd_inner_microstep: 4616.54 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.17
-[2025-01-25 19:16:14,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.60 | bwd: 4621.43 | bwd_inner: 4616.54 | bwd_allreduce: 4.79 | step: 41.18
- 64%|██████▍   | 3736/5800 [10:29:44<3:57:10,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 8.028589248657227, 'learning_rate': 1.1876831698575486e-05, 'epoch': 32.21}
- 64%|██████▍   | 3736/5800 [10:29:44<3:57:10,  6.89s/it]score1 tensor([[0.4648],
-        [0.6133],
-        [0.4922],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6094, 0.5000, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:21,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 19:16:21,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.37 | bwd_microstep: 4612.04 | bwd_inner_microstep: 4606.72 | bwd_allreduce_microstep: 5.21 | step_microstep: 43.77
-[2025-01-25 19:16:21,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.33 | bwd: 4612.07 | bwd_inner: 4606.72 | bwd_allreduce: 5.27 | step: 43.78
- 64%|██████▍   | 3737/5800 [10:29:51<3:57:04,  6.89s/it]                                                        {'loss': 0.0117, 'grad_norm': 4.371932506561279, 'learning_rate': 1.1866627505951512e-05, 'epoch': 32.22}
- 64%|██████▍   | 3737/5800 [10:29:51<3:57:04,  6.89s/it]score1 tensor([[0.4629],
-        [0.5430],
-        [0.4473],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5352, 0.4531, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:28,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 19:16:28,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.83 | bwd_microstep: 4617.83 | bwd_inner_microstep: 4612.83 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.81
-[2025-01-25 19:16:28,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.80 | bwd: 4617.86 | bwd_inner: 4612.83 | bwd_allreduce: 4.96 | step: 45.82
- 64%|██████▍   | 3738/5800 [10:29:58<3:57:01,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.5830117464065552, 'learning_rate': 1.1856425849454203e-05, 'epoch': 32.22}
- 64%|██████▍   | 3738/5800 [10:29:58<3:57:01,  6.90s/it]score1 tensor([[0.5039],
-        [0.5234],
-        [0.4668],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5312, 0.4473, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:35,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 19:16:35,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.16 | bwd_microstep: 4528.69 | bwd_inner_microstep: 4523.49 | bwd_allreduce_microstep: 5.07 | step_microstep: 46.66
-[2025-01-25 19:16:35,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.13 | bwd: 4528.72 | bwd_inner: 4523.49 | bwd_allreduce: 5.15 | step: 46.66
- 64%|██████▍   | 3739/5800 [10:30:05<3:56:02,  6.87s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.38534948229789734, 'learning_rate': 1.1846226732264623e-05, 'epoch': 32.23}
- 64%|██████▍   | 3739/5800 [10:30:05<3:56:02,  6.87s/it]score1 tensor([[0.6602],
-        [0.3984],
-        [0.5430],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.4004, 0.5312, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:42,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 19:16:42,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.87 | bwd_microstep: 4577.20 | bwd_inner_microstep: 4572.10 | bwd_allreduce_microstep: 5.02 | step_microstep: 46.19
-[2025-01-25 19:16:42,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.84 | bwd: 4577.22 | bwd_inner: 4572.10 | bwd_allreduce: 5.06 | step: 46.20
- 64%|██████▍   | 3740/5800 [10:30:12<3:55:48,  6.87s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.6467037200927734, 'learning_rate': 1.1836030157563021e-05, 'epoch': 32.24}
- 64%|██████▍   | 3740/5800 [10:30:12<3:55:48,  6.87s/it]score1 tensor([[0.4746],
-        [0.3633],
-        [0.6094],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.3652, 0.6094, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:49,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 19:16:49,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.40 | bwd_microstep: 4584.18 | bwd_inner_microstep: 4579.14 | bwd_allreduce_microstep: 4.93 | step_microstep: 45.17
-[2025-01-25 19:16:49,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.34 | bwd: 4584.21 | bwd_inner: 4579.14 | bwd_allreduce: 4.99 | step: 45.18
- 64%|██████▍   | 3741/5800 [10:30:18<3:55:37,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 5.573938369750977, 'learning_rate': 1.1825836128528882e-05, 'epoch': 32.25}
- 64%|██████▍   | 3741/5800 [10:30:18<3:55:37,  6.87s/it]score1 tensor([[0.5352],
-        [0.4199],
-        [0.3770],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4043, 0.3652, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:16:55,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 19:16:55,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.54 | bwd_microstep: 4594.71 | bwd_inner_microstep: 4589.69 | bwd_allreduce_microstep: 4.93 | step_microstep: 41.98
-[2025-01-25 19:16:55,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.50 | bwd: 4594.73 | bwd_inner: 4589.69 | bwd_allreduce: 4.98 | step: 41.99
- 65%|██████▍   | 3742/5800 [10:30:25<3:55:43,  6.87s/it]                                                        {'loss': 0.0117, 'grad_norm': 5.621870994567871, 'learning_rate': 1.1815644648340862e-05, 'epoch': 32.26}
- 65%|██████▍   | 3742/5800 [10:30:25<3:55:43,  6.87s/it]score1 tensor([[0.6328],
-        [0.4414],
-        [0.5352],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4395, 0.5078, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:02,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 19:17:02,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.96 | bwd_microstep: 4633.10 | bwd_inner_microstep: 4628.05 | bwd_allreduce_microstep: 4.94 | step_microstep: 46.78
-[2025-01-25 19:17:02,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.92 | bwd: 4633.12 | bwd_inner: 4628.05 | bwd_allreduce: 5.00 | step: 46.78
- 65%|██████▍   | 3743/5800 [10:30:32<3:56:10,  6.89s/it]                                                        {'loss': 0.0142, 'grad_norm': 4.203585147857666, 'learning_rate': 1.1805455720176862e-05, 'epoch': 32.27}
- 65%|██████▍   | 3743/5800 [10:30:32<3:56:10,  6.89s/it]score1 tensor([[0.3262],
-        [0.4043],
-        [0.5586],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.4121, 0.5508, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:09,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 19:17:09,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.74 | bwd_microstep: 4643.36 | bwd_inner_microstep: 4638.41 | bwd_allreduce_microstep: 4.87 | step_microstep: 42.90
-[2025-01-25 19:17:09,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.69 | bwd: 4643.38 | bwd_inner: 4638.41 | bwd_allreduce: 4.91 | step: 42.90
- 65%|██████▍   | 3744/5800 [10:30:39<3:56:37,  6.91s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.2068735361099243, 'learning_rate': 1.1795269347213947e-05, 'epoch': 32.28}
- 65%|██████▍   | 3744/5800 [10:30:39<3:56:37,  6.91s/it]score1 tensor([[0.4648],
-        [0.4121],
-        [0.3672],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4258, 0.3711, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:16,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 19:17:16,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.87 | bwd_microstep: 4643.41 | bwd_inner_microstep: 4638.35 | bwd_allreduce_microstep: 4.94 | step_microstep: 47.08
-[2025-01-25 19:17:16,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.84 | bwd: 4643.43 | bwd_inner: 4638.35 | bwd_allreduce: 5.00 | step: 47.08
- 65%|██████▍   | 3745/5800 [10:30:46<3:56:52,  6.92s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.6649326086044312, 'learning_rate': 1.178508553262842e-05, 'epoch': 32.28}
- 65%|██████▍   | 3745/5800 [10:30:46<3:56:52,  6.92s/it]score1 tensor([[0.5273],
-        [0.4766],
-        [0.4941],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4766, 0.4863, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:23,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 19:17:23,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.46 | bwd_microstep: 4541.78 | bwd_inner_microstep: 4536.68 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.86
-[2025-01-25 19:17:23,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.42 | bwd: 4541.81 | bwd_inner: 4536.68 | bwd_allreduce: 5.04 | step: 43.87
- 65%|██████▍   | 3746/5800 [10:30:53<3:55:53,  6.89s/it]                                                        {'loss': 0.0117, 'grad_norm': 0.33548280596733093, 'learning_rate': 1.1774904279595766e-05, 'epoch': 32.29}
- 65%|██████▍   | 3746/5800 [10:30:53<3:55:53,  6.89s/it]score1 tensor([[0.4902],
-        [0.4199],
-        [0.4727],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4180, 0.4707, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:30,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 19:17:30,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.28 | bwd_microstep: 4633.37 | bwd_inner_microstep: 4628.09 | bwd_allreduce_microstep: 5.18 | step_microstep: 46.56
-[2025-01-25 19:17:30,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.25 | bwd: 4633.40 | bwd_inner: 4628.09 | bwd_allreduce: 5.23 | step: 46.56
- 65%|██████▍   | 3747/5800 [10:31:00<3:56:06,  6.90s/it]                                                        {'loss': 0.0034, 'grad_norm': 4.004431247711182, 'learning_rate': 1.1764725591290669e-05, 'epoch': 32.3}
- 65%|██████▍   | 3747/5800 [10:31:00<3:56:06,  6.90s/it]score1 tensor([[0.5430],
-        [0.4199],
-        [0.5430],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.3945, 0.5391, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:37,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 19:17:37,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.12 | bwd_microstep: 4636.10 | bwd_inner_microstep: 4630.44 | bwd_allreduce_microstep: 5.53 | step_microstep: 51.15
-[2025-01-25 19:17:37,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.08 | bwd: 4636.13 | bwd_inner: 4630.44 | bwd_allreduce: 5.60 | step: 51.15
- 65%|██████▍   | 3748/5800 [10:31:07<3:56:21,  6.91s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.37218597531318665, 'learning_rate': 1.1754549470887033e-05, 'epoch': 32.31}
- 65%|██████▍   | 3748/5800 [10:31:07<3:56:21,  6.91s/it]score1 tensor([[0.6094],
-        [0.4473],
-        [0.4199],
-        [0.7031]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4434, 0.4180, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:44,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 19:17:44,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.24 | bwd_microstep: 4636.59 | bwd_inner_microstep: 4631.21 | bwd_allreduce_microstep: 5.26 | step_microstep: 43.66
-[2025-01-25 19:17:44,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.21 | bwd: 4636.61 | bwd_inner: 4631.21 | bwd_allreduce: 5.32 | step: 43.67
- 65%|██████▍   | 3749/5800 [10:31:14<3:56:19,  6.91s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.930732488632202, 'learning_rate': 1.1744375921557937e-05, 'epoch': 32.32}
- 65%|██████▍   | 3749/5800 [10:31:14<3:56:19,  6.91s/it]score1 tensor([[0.4238],
-        [0.3574],
-        [0.3516],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3457, 0.3555, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:51,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 19:17:51,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.97 | bwd_microstep: 4637.71 | bwd_inner_microstep: 4632.89 | bwd_allreduce_microstep: 4.75 | step_microstep: 47.14
-[2025-01-25 19:17:51,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.94 | bwd: 4637.74 | bwd_inner: 4632.89 | bwd_allreduce: 4.79 | step: 47.15
- 65%|██████▍   | 3750/5800 [10:31:21<3:56:18,  6.92s/it]                                                        {'loss': 0.0151, 'grad_norm': 3.86133074760437, 'learning_rate': 1.1734204946475685e-05, 'epoch': 32.33}
- 65%|██████▍   | 3750/5800 [10:31:21<3:56:18,  6.92s/it]score1 tensor([[0.4688],
-        [0.4746],
-        [0.5039],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4863, 0.5117, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:17:58,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 19:17:58,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.40 | bwd_microstep: 4580.94 | bwd_inner_microstep: 4576.14 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.92
-[2025-01-25 19:17:58,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.35 | bwd: 4580.99 | bwd_inner: 4576.14 | bwd_allreduce: 4.76 | step: 43.92
- 65%|██████▍   | 3751/5800 [10:31:28<3:55:41,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 5.891229629516602, 'learning_rate': 1.1724036548811749e-05, 'epoch': 32.34}
- 65%|██████▍   | 3751/5800 [10:31:28<3:55:41,  6.90s/it]score1 tensor([[0.5117],
-        [0.5117],
-        [0.5352],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5430, 0.5547, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:05,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 19:18:05,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.43 | bwd_microstep: 4640.91 | bwd_inner_microstep: 4636.37 | bwd_allreduce_microstep: 4.46 | step_microstep: 42.52
-[2025-01-25 19:18:05,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.39 | bwd: 4640.93 | bwd_inner: 4636.37 | bwd_allreduce: 4.50 | step: 42.53
- 65%|██████▍   | 3752/5800 [10:31:35<3:55:47,  6.91s/it]                                                        {'loss': 0.0225, 'grad_norm': 8.279775619506836, 'learning_rate': 1.1713870731736816e-05, 'epoch': 32.34}
- 65%|██████▍   | 3752/5800 [10:31:35<3:55:47,  6.91s/it]score1 tensor([[0.4062],
-        [0.4531],
-        [0.4512],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.4609, 0.4609, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:11,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.37
-[2025-01-25 19:18:11,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.17 | bwd_microstep: 4643.65 | bwd_inner_microstep: 4639.30 | bwd_allreduce_microstep: 4.28 | step_microstep: 42.33
-[2025-01-25 19:18:11,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.14 | bwd: 4643.68 | bwd_inner: 4639.30 | bwd_allreduce: 4.31 | step: 42.34
- 65%|██████▍   | 3753/5800 [10:31:41<3:55:48,  6.91s/it]                                                        {'loss': 0.0132, 'grad_norm': 7.5825090408325195, 'learning_rate': 1.1703707498420768e-05, 'epoch': 32.35}
- 65%|██████▍   | 3753/5800 [10:31:41<3:55:48,  6.91s/it]score1 tensor([[0.6797],
-        [0.4707],
-        [0.4961],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.4922, 0.5039, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:18,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:18:18,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.56 | bwd_microstep: 4589.26 | bwd_inner_microstep: 4584.31 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.64
-[2025-01-25 19:18:18,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.53 | bwd: 4589.29 | bwd_inner: 4584.31 | bwd_allreduce: 4.89 | step: 43.64
- 65%|██████▍   | 3754/5800 [10:31:48<3:55:13,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 2.228472948074341, 'learning_rate': 1.1693546852032671e-05, 'epoch': 32.36}
- 65%|██████▍   | 3754/5800 [10:31:48<3:55:13,  6.90s/it]score1 tensor([[0.5117],
-        [0.4531],
-        [0.5234],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4473, 0.5391, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:25,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 19:18:25,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.85 | bwd_microstep: 4597.53 | bwd_inner_microstep: 4592.83 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.32
-[2025-01-25 19:18:25,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.82 | bwd: 4597.56 | bwd_inner: 4592.83 | bwd_allreduce: 4.65 | step: 43.33
- 65%|██████▍   | 3755/5800 [10:31:55<3:55:00,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 2.2302041053771973, 'learning_rate': 1.1683388795740802e-05, 'epoch': 32.37}
- 65%|██████▍   | 3755/5800 [10:31:55<3:55:00,  6.90s/it]score1 tensor([[0.4004],
-        [0.5508],
-        [0.6523],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.5547, 0.6562, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:32,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 19:18:32,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.72 | bwd_microstep: 4638.77 | bwd_inner_microstep: 4633.98 | bwd_allreduce_microstep: 4.68 | step_microstep: 48.23
-[2025-01-25 19:18:32,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.68 | bwd: 4638.80 | bwd_inner: 4633.98 | bwd_allreduce: 4.73 | step: 48.24
- 65%|██████▍   | 3756/5800 [10:32:02<3:55:15,  6.91s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.938727855682373, 'learning_rate': 1.1673233332712606e-05, 'epoch': 32.38}
- 65%|██████▍   | 3756/5800 [10:32:02<3:55:15,  6.91s/it]score1 tensor([[0.4180],
-        [0.4590],
-        [0.5508],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4512, 0.5508, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:39,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 19:18:39,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.69 | bwd_microstep: 4591.15 | bwd_inner_microstep: 4586.39 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.00
-[2025-01-25 19:18:39,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.65 | bwd: 4591.17 | bwd_inner: 4586.39 | bwd_allreduce: 4.72 | step: 42.01
- 65%|██████▍   | 3757/5800 [10:32:09<3:54:51,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 5.882211208343506, 'learning_rate': 1.1663080466114747e-05, 'epoch': 32.39}
- 65%|██████▍   | 3757/5800 [10:32:09<3:54:51,  6.90s/it]score1 tensor([[0.4922],
-        [0.6953],
-        [0.4375],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6875, 0.4121, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:46,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 19:18:46,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.14 | bwd_microstep: 4646.60 | bwd_inner_microstep: 4641.19 | bwd_allreduce_microstep: 5.28 | step_microstep: 45.43
-[2025-01-25 19:18:46,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.11 | bwd: 4646.62 | bwd_inner: 4641.19 | bwd_allreduce: 5.34 | step: 45.44
- 65%|██████▍   | 3758/5800 [10:32:16<3:55:04,  6.91s/it]                                                        {'loss': 0.0112, 'grad_norm': 4.386487007141113, 'learning_rate': 1.1652930199113055e-05, 'epoch': 32.4}
- 65%|██████▍   | 3758/5800 [10:32:16<3:55:04,  6.91s/it]score1 tensor([[0.4844],
-        [0.5391],
-        [0.5586],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5312, 0.5781, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:18:53,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:18:53,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.57 | bwd_microstep: 4641.08 | bwd_inner_microstep: 4636.11 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.20
-[2025-01-25 19:18:53,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.53 | bwd: 4641.10 | bwd_inner: 4636.11 | bwd_allreduce: 4.92 | step: 43.21
- 65%|██████▍   | 3759/5800 [10:32:23<3:55:08,  6.91s/it]                                                        {'loss': 0.0127, 'grad_norm': 3.8393616676330566, 'learning_rate': 1.1642782534872552e-05, 'epoch': 32.41}
- 65%|██████▍   | 3759/5800 [10:32:23<3:55:08,  6.91s/it]score1 tensor([[0.5781],
-        [0.6172],
-        [0.4434],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.6133, 0.4336, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:00,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.36
-[2025-01-25 19:19:00,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2170.37 | bwd_microstep: 4648.10 | bwd_inner_microstep: 4642.91 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.60
-[2025-01-25 19:19:00,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2170.32 | bwd: 4648.13 | bwd_inner: 4642.91 | bwd_allreduce: 5.14 | step: 43.60
- 65%|██████▍   | 3760/5800 [10:32:30<3:55:21,  6.92s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.8159682750701904, 'learning_rate': 1.1632637476557476e-05, 'epoch': 32.41}
- 65%|██████▍   | 3760/5800 [10:32:30<3:55:21,  6.92s/it]score1 tensor([[0.5312],
-        [0.4219],
-        [0.6992],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4023, 0.7070, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:07,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 19:19:07,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.77 | bwd_microstep: 4602.16 | bwd_inner_microstep: 4596.82 | bwd_allreduce_microstep: 5.19 | step_microstep: 45.87
-[2025-01-25 19:19:07,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.73 | bwd: 4602.19 | bwd_inner: 4596.82 | bwd_allreduce: 5.25 | step: 45.88
- 65%|██████▍   | 3761/5800 [10:32:37<3:54:54,  6.91s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.804413080215454, 'learning_rate': 1.1622495027331211e-05, 'epoch': 32.42}
- 65%|██████▍   | 3761/5800 [10:32:37<3:54:54,  6.91s/it]score1 tensor([[0.4102],
-        [0.4512],
-        [0.3262],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4336, 0.3086, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:14,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 19:19:14,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.09 | bwd_microstep: 4635.96 | bwd_inner_microstep: 4630.80 | bwd_allreduce_microstep: 5.05 | step_microstep: 42.06
-[2025-01-25 19:19:14,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.06 | bwd: 4636.04 | bwd_inner: 4630.80 | bwd_allreduce: 5.10 | step: 42.07
- 65%|██████▍   | 3762/5800 [10:32:44<3:54:56,  6.92s/it]                                                        {'loss': 0.0132, 'grad_norm': 7.296665191650391, 'learning_rate': 1.1612355190356368e-05, 'epoch': 32.43}
- 65%|██████▍   | 3762/5800 [10:32:44<3:54:56,  6.92s/it]score1 tensor([[0.5703],
-        [0.4590],
-        [0.4668],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4473, 0.4473, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:21,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.37
-[2025-01-25 19:19:21,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.02 | bwd_microstep: 4640.06 | bwd_inner_microstep: 4635.09 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.36
-[2025-01-25 19:19:21,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.97 | bwd: 4640.08 | bwd_inner: 4635.09 | bwd_allreduce: 4.93 | step: 42.36
- 65%|██████▍   | 3763/5800 [10:32:51<3:54:51,  6.92s/it]                                                        {'loss': 0.0122, 'grad_norm': 7.956539630889893, 'learning_rate': 1.160221796879471e-05, 'epoch': 32.44}
- 65%|██████▍   | 3763/5800 [10:32:51<3:54:51,  6.92s/it]score1 tensor([[0.5703],
-        [0.6055],
-        [0.4941],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6094, 0.4980, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:27,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 19:19:27,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.35 | bwd_microstep: 4640.17 | bwd_inner_microstep: 4635.31 | bwd_allreduce_microstep: 4.78 | step_microstep: 44.27
-[2025-01-25 19:19:27,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.32 | bwd: 4640.19 | bwd_inner: 4635.31 | bwd_allreduce: 4.81 | step: 44.28
- 65%|██████▍   | 3764/5800 [10:32:57<3:54:49,  6.92s/it]                                                        {'loss': 0.0166, 'grad_norm': 4.347016334533691, 'learning_rate': 1.1592083365807208e-05, 'epoch': 32.45}
- 65%|██████▍   | 3764/5800 [10:32:57<3:54:49,  6.92s/it]score1 tensor([[0.4258],
-        [0.5234],
-        [0.3887],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5273, 0.3750, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:34,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 19:19:34,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.93 | bwd_microstep: 4639.85 | bwd_inner_microstep: 4634.53 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.44
-[2025-01-25 19:19:34,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.88 | bwd: 4639.87 | bwd_inner: 4634.53 | bwd_allreduce: 5.27 | step: 45.44
- 65%|██████▍   | 3765/5800 [10:33:04<3:54:48,  6.92s/it]                                                        {'loss': 0.0083, 'grad_norm': 3.6025876998901367, 'learning_rate': 1.1581951384554003e-05, 'epoch': 32.46}
- 65%|██████▍   | 3765/5800 [10:33:04<3:54:48,  6.92s/it]score1 tensor([[0.5312],
-        [0.6094],
-        [0.5508],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5898, 0.5469, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:41,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 19:19:41,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.18 | bwd_microstep: 4595.90 | bwd_inner_microstep: 4591.54 | bwd_allreduce_microstep: 4.28 | step_microstep: 42.46
-[2025-01-25 19:19:41,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.15 | bwd: 4595.93 | bwd_inner: 4591.54 | bwd_allreduce: 4.32 | step: 42.47
- 65%|██████▍   | 3766/5800 [10:33:11<3:54:14,  6.91s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.436620712280273, 'learning_rate': 1.157182202819441e-05, 'epoch': 32.47}
- 65%|██████▍   | 3766/5800 [10:33:11<3:54:14,  6.91s/it]score1 tensor([[0.5469],
-        [0.6562],
-        [0.5078],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6641, 0.5117, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:48,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.07 | optimizer_step: 4.37
-[2025-01-25 19:19:48,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4587.37 | bwd_inner_microstep: 4582.59 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.42
-[2025-01-25 19:19:48,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.46 | bwd: 4587.40 | bwd_inner: 4582.59 | bwd_allreduce: 4.75 | step: 43.42
- 65%|██████▍   | 3767/5800 [10:33:18<3:53:45,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 2.1533329486846924, 'learning_rate': 1.1561695299886964e-05, 'epoch': 32.47}
- 65%|██████▍   | 3767/5800 [10:33:18<3:53:45,  6.90s/it]score1 tensor([[0.4727],
-        [0.4199],
-        [0.5156],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4141, 0.5117, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:19:55,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:19:55,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.65 | bwd_microstep: 4638.49 | bwd_inner_microstep: 4633.71 | bwd_allreduce_microstep: 4.71 | step_microstep: 44.80
-[2025-01-25 19:19:55,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.62 | bwd: 4638.52 | bwd_inner: 4633.71 | bwd_allreduce: 4.75 | step: 44.81
- 65%|██████▍   | 3768/5800 [10:33:25<3:53:57,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.307673931121826, 'learning_rate': 1.155157120278933e-05, 'epoch': 32.48}
- 65%|██████▍   | 3768/5800 [10:33:25<3:53:57,  6.91s/it]score1 tensor([[0.4688],
-        [0.4180],
-        [0.4727],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4238, 0.4941, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:02,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 19:20:02,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4639.15 | bwd_inner_microstep: 4632.86 | bwd_allreduce_microstep: 6.17 | step_microstep: 43.05
-[2025-01-25 19:20:02,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.28 | bwd: 4639.18 | bwd_inner: 4632.86 | bwd_allreduce: 6.24 | step: 43.06
- 65%|██████▍   | 3769/5800 [10:33:32<3:53:59,  6.91s/it]                                                        {'loss': 0.0156, 'grad_norm': 7.814321517944336, 'learning_rate': 1.1541449740058397e-05, 'epoch': 32.49}
- 65%|██████▍   | 3769/5800 [10:33:32<3:53:59,  6.91s/it]score1 tensor([[0.4844],
-        [0.5469],
-        [0.4570],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.5625, 0.4512, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:09,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 19:20:09,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.45 | bwd_microstep: 4634.82 | bwd_inner_microstep: 4629.35 | bwd_allreduce_microstep: 5.39 | step_microstep: 49.92
-[2025-01-25 19:20:09,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.41 | bwd: 4634.85 | bwd_inner: 4629.35 | bwd_allreduce: 5.44 | step: 49.92
- 65%|██████▌   | 3770/5800 [10:33:39<3:53:57,  6.92s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.8542126417160034, 'learning_rate': 1.1531330914850204e-05, 'epoch': 32.5}
- 65%|██████▌   | 3770/5800 [10:33:39<3:53:57,  6.92s/it]score1 tensor([[0.4336],
-        [0.5781],
-        [0.5625],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.5781, 0.5664, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:16,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 19:20:16,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.28 | bwd_microstep: 4588.47 | bwd_inner_microstep: 4583.26 | bwd_allreduce_microstep: 5.13 | step_microstep: 41.42
-[2025-01-25 19:20:16,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.23 | bwd: 4588.49 | bwd_inner: 4583.26 | bwd_allreduce: 5.17 | step: 41.43
- 65%|██████▌   | 3771/5800 [10:33:46<3:53:22,  6.90s/it]                                                        {'loss': 0.0044, 'grad_norm': 5.82795524597168, 'learning_rate': 1.152121473031997e-05, 'epoch': 32.51}
- 65%|██████▌   | 3771/5800 [10:33:46<3:53:22,  6.90s/it]score1 tensor([[0.4746],
-        [0.4844],
-        [0.6289],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4785, 0.6562, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:23,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 19:20:23,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.98 | bwd_microstep: 4633.40 | bwd_inner_microstep: 4628.18 | bwd_allreduce_microstep: 5.14 | step_microstep: 52.31
-[2025-01-25 19:20:23,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.95 | bwd: 4633.42 | bwd_inner: 4628.18 | bwd_allreduce: 5.18 | step: 52.32
- 65%|██████▌   | 3772/5800 [10:33:53<3:53:28,  6.91s/it]                                                        {'loss': 0.0127, 'grad_norm': 0.5716001391410828, 'learning_rate': 1.1511101189622105e-05, 'epoch': 32.52}
- 65%|██████▌   | 3772/5800 [10:33:53<3:53:28,  6.91s/it]score1 tensor([[0.4570],
-        [0.5547],
-        [0.4707],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5625, 0.4785, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:30,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 19:20:30,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.64 | bwd_microstep: 4595.20 | bwd_inner_microstep: 4590.55 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.65
-[2025-01-25 19:20:30,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.59 | bwd: 4595.22 | bwd_inner: 4590.55 | bwd_allreduce: 4.61 | step: 43.65
- 65%|██████▌   | 3773/5800 [10:34:00<3:53:06,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.238319158554077, 'learning_rate': 1.1500990295910182e-05, 'epoch': 32.53}
- 65%|██████▌   | 3773/5800 [10:34:00<3:53:06,  6.90s/it]score1 tensor([[0.6719],
-        [0.3340],
-        [0.5977],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.3340, 0.6055, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:36,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 19:20:36,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.98 | bwd_microstep: 4550.00 | bwd_inner_microstep: 4545.06 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.74
-[2025-01-25 19:20:36,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.94 | bwd: 4550.02 | bwd_inner: 4545.06 | bwd_allreduce: 4.90 | step: 42.75
- 65%|██████▌   | 3774/5800 [10:34:06<3:52:24,  6.88s/it]                                                        {'loss': 0.0039, 'grad_norm': 4.691009998321533, 'learning_rate': 1.149088205233696e-05, 'epoch': 32.53}
- 65%|██████▌   | 3774/5800 [10:34:06<3:52:24,  6.88s/it]score1 tensor([[0.4941],
-        [0.3516],
-        [0.3652],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.3477, 0.3730, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:43,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 19:20:43,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.59 | bwd_microstep: 4588.67 | bwd_inner_microstep: 4583.48 | bwd_allreduce_microstep: 5.05 | step_microstep: 41.95
-[2025-01-25 19:20:43,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.56 | bwd: 4588.70 | bwd_inner: 4583.48 | bwd_allreduce: 5.10 | step: 41.96
- 65%|██████▌   | 3775/5800 [10:34:13<3:52:15,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 2.0645971298217773, 'learning_rate': 1.1480776462054353e-05, 'epoch': 32.54}
- 65%|██████▌   | 3775/5800 [10:34:13<3:52:15,  6.88s/it]score1 tensor([[0.5195],
-        [0.4336],
-        [0.3633],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4277, 0.3613, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:50,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:20:50,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4641.36 | bwd_inner_microstep: 4635.16 | bwd_allreduce_microstep: 6.08 | step_microstep: 52.00
-[2025-01-25 19:20:50,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4641.38 | bwd_inner: 4635.16 | bwd_allreduce: 6.14 | step: 52.01
- 65%|██████▌   | 3776/5800 [10:34:20<3:52:41,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 7.720816612243652, 'learning_rate': 1.1470673528213476e-05, 'epoch': 32.55}
- 65%|██████▌   | 3776/5800 [10:34:20<3:52:41,  6.90s/it]score1 tensor([[0.6602],
-        [0.5352],
-        [0.4922],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5430, 0.4941, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:20:57,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:20:57,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.82 | bwd_microstep: 4645.82 | bwd_inner_microstep: 4640.45 | bwd_allreduce_microstep: 5.26 | step_microstep: 42.11
-[2025-01-25 19:20:57,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.78 | bwd: 4645.84 | bwd_inner: 4640.45 | bwd_allreduce: 5.31 | step: 42.13
- 65%|██████▌   | 3777/5800 [10:34:27<3:52:59,  6.91s/it]                                                        {'loss': 0.0151, 'grad_norm': 4.289787292480469, 'learning_rate': 1.1460573253964577e-05, 'epoch': 32.56}
- 65%|██████▌   | 3777/5800 [10:34:27<3:52:59,  6.91s/it]score1 tensor([[0.4512],
-        [0.5078],
-        [0.6680],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5117, 0.6641, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:04,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:21:04,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.17 | bwd_microstep: 4635.38 | bwd_inner_microstep: 4630.11 | bwd_allreduce_microstep: 5.16 | step_microstep: 45.54
-[2025-01-25 19:21:04,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.14 | bwd: 4635.40 | bwd_inner: 4630.11 | bwd_allreduce: 5.22 | step: 45.54
- 65%|██████▌   | 3778/5800 [10:34:34<3:53:03,  6.92s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.8423449993133545, 'learning_rate': 1.1450475642457124e-05, 'epoch': 32.57}
- 65%|██████▌   | 3778/5800 [10:34:34<3:53:03,  6.92s/it]score1 tensor([[0.4141],
-        [0.5391],
-        [0.5156],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5664, 0.5039, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:11,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 19:21:11,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4638.79 | bwd_inner_microstep: 4633.65 | bwd_allreduce_microstep: 5.05 | step_microstep: 43.19
-[2025-01-25 19:21:11,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.60 | bwd: 4638.81 | bwd_inner: 4633.65 | bwd_allreduce: 5.09 | step: 43.20
- 65%|██████▌   | 3779/5800 [10:34:41<3:52:55,  6.92s/it]                                                        {'loss': 0.0127, 'grad_norm': 0.47226887941360474, 'learning_rate': 1.14403806968397e-05, 'epoch': 32.58}
- 65%|██████▌   | 3779/5800 [10:34:41<3:52:55,  6.92s/it]score1 tensor([[0.6367],
-        [0.4473],
-        [0.5469],
-        [0.7422]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4551, 0.5469, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:18,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 19:21:18,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.76 | bwd_microstep: 4581.39 | bwd_inner_microstep: 4576.59 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.22
-[2025-01-25 19:21:18,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.73 | bwd: 4581.41 | bwd_inner: 4576.59 | bwd_allreduce: 4.74 | step: 43.22
- 65%|██████▌   | 3780/5800 [10:34:48<3:52:17,  6.90s/it]                                                        {'loss': 0.0137, 'grad_norm': 1.7456095218658447, 'learning_rate': 1.1430288420260106e-05, 'epoch': 32.59}
- 65%|██████▌   | 3780/5800 [10:34:48<3:52:17,  6.90s/it]score1 tensor([[0.5469],
-        [0.4609],
-        [0.5156],
-        [0.3262]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4492, 0.5078, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:25,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:21:25,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.67 | bwd_microstep: 4646.93 | bwd_inner_microstep: 4642.07 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.33
-[2025-01-25 19:21:25,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.62 | bwd: 4646.95 | bwd_inner: 4642.07 | bwd_allreduce: 4.82 | step: 41.34
- 65%|██████▌   | 3781/5800 [10:34:55<3:52:33,  6.91s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.476722717285156, 'learning_rate': 1.1420198815865273e-05, 'epoch': 32.59}
- 65%|██████▌   | 3781/5800 [10:34:55<3:52:33,  6.91s/it]score1 tensor([[0.6016],
-        [0.6562],
-        [0.4355],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6484, 0.4082, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:32,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 19:21:32,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.56 | bwd_microstep: 4639.72 | bwd_inner_microstep: 4634.61 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.68
-[2025-01-25 19:21:32,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.52 | bwd: 4639.75 | bwd_inner: 4634.61 | bwd_allreduce: 5.04 | step: 43.70
- 65%|██████▌   | 3782/5800 [10:35:02<3:52:40,  6.92s/it]                                                        {'loss': 0.0166, 'grad_norm': 4.580428123474121, 'learning_rate': 1.141011188680133e-05, 'epoch': 32.6}
- 65%|██████▌   | 3782/5800 [10:35:02<3:52:40,  6.92s/it]score1 tensor([[0.4902],
-        [0.6211],
-        [0.5234],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5898, 0.5156, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:39,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 19:21:39,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.51 | bwd_microstep: 4592.33 | bwd_inner_microstep: 4587.32 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.48
-[2025-01-25 19:21:39,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.48 | bwd: 4592.35 | bwd_inner: 4587.32 | bwd_allreduce: 4.96 | step: 44.49
- 65%|██████▌   | 3783/5800 [10:35:09<3:52:09,  6.91s/it]                                                        {'loss': 0.0122, 'grad_norm': 6.332176208496094, 'learning_rate': 1.1400027636213538e-05, 'epoch': 32.61}
- 65%|██████▌   | 3783/5800 [10:35:09<3:52:09,  6.91s/it]score1 tensor([[0.4609],
-        [0.4590],
-        [0.5625],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4297, 0.5625, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:46,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 19:21:46,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4590.60 | bwd_inner_microstep: 4585.64 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.25
-[2025-01-25 19:21:46,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4590.63 | bwd_inner: 4585.64 | bwd_allreduce: 4.92 | step: 43.26
- 65%|██████▌   | 3784/5800 [10:35:16<3:51:47,  6.90s/it]                                                        {'loss': 0.0132, 'grad_norm': 1.961508870124817, 'learning_rate': 1.1389946067246362e-05, 'epoch': 32.62}
- 65%|██████▌   | 3784/5800 [10:35:16<3:51:47,  6.90s/it]score1 tensor([[0.5508],
-        [0.6094],
-        [0.4434],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6094, 0.4473, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:52,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 19:21:52,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.61 | bwd_microstep: 4579.78 | bwd_inner_microstep: 4574.79 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.96
-[2025-01-25 19:21:52,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.57 | bwd: 4579.80 | bwd_inner: 4574.79 | bwd_allreduce: 4.94 | step: 43.97
- 65%|██████▌   | 3785/5800 [10:35:22<3:51:22,  6.89s/it]                                                        {'loss': 0.0132, 'grad_norm': 2.3466339111328125, 'learning_rate': 1.1379867183043389e-05, 'epoch': 32.63}
- 65%|██████▌   | 3785/5800 [10:35:22<3:51:22,  6.89s/it]score1 tensor([[0.4902],
-        [0.5234],
-        [0.4609],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5039, 0.4648, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:21:59,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 19:21:59,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.69 | bwd_microstep: 4641.14 | bwd_inner_microstep: 4636.11 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.25
-[2025-01-25 19:21:59,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.66 | bwd: 4641.16 | bwd_inner: 4636.11 | bwd_allreduce: 4.99 | step: 43.26
- 65%|██████▌   | 3786/5800 [10:35:29<3:51:37,  6.90s/it]                                                        {'loss': 0.0142, 'grad_norm': 0.9234222173690796, 'learning_rate': 1.1369790986747412e-05, 'epoch': 32.64}
- 65%|██████▌   | 3786/5800 [10:35:29<3:51:37,  6.90s/it]score1 tensor([[0.5781],
-        [0.3574],
-        [0.3809],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.3730, 0.4004, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:06,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 19:22:06,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.78 | bwd_microstep: 4640.25 | bwd_inner_microstep: 4634.79 | bwd_allreduce_microstep: 5.34 | step_microstep: 46.15
-[2025-01-25 19:22:06,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.73 | bwd: 4640.27 | bwd_inner: 4634.79 | bwd_allreduce: 5.40 | step: 46.16
- 65%|██████▌   | 3787/5800 [10:35:36<3:51:46,  6.91s/it]                                                        {'loss': 0.0176, 'grad_norm': 0.9702911376953125, 'learning_rate': 1.1359717481500343e-05, 'epoch': 32.65}
- 65%|██████▌   | 3787/5800 [10:35:36<3:51:46,  6.91s/it]score1 tensor([[0.4023],
-        [0.4453],
-        [0.4414],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3906, 0.4512, 0.4414, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:13,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:22:13,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.29 | bwd_microstep: 4586.90 | bwd_inner_microstep: 4581.69 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.73
-[2025-01-25 19:22:13,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4586.92 | bwd_inner: 4581.69 | bwd_allreduce: 5.15 | step: 43.74
- 65%|██████▌   | 3788/5800 [10:35:43<3:51:18,  6.90s/it]                                                        {'loss': 0.0083, 'grad_norm': 2.0891501903533936, 'learning_rate': 1.1349646670443293e-05, 'epoch': 32.66}
- 65%|██████▌   | 3788/5800 [10:35:43<3:51:18,  6.90s/it]score1 tensor([[0.4375],
-        [0.3633],
-        [0.4844],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.2812, 0.4844, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0259, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:20,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 19:22:20,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.40 | bwd_microstep: 4579.95 | bwd_inner_microstep: 4575.34 | bwd_allreduce_microstep: 4.54 | step_microstep: 43.45
-[2025-01-25 19:22:20,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.36 | bwd: 4579.97 | bwd_inner: 4575.34 | bwd_allreduce: 4.57 | step: 43.46
- 65%|██████▌   | 3789/5800 [10:35:50<3:50:54,  6.89s/it]                                                        {'loss': 0.0259, 'grad_norm': 5.91884708404541, 'learning_rate': 1.1339578556716506e-05, 'epoch': 32.66}
- 65%|██████▌   | 3789/5800 [10:35:50<3:50:54,  6.89s/it]score1 tensor([[0.5625],
-        [0.4863],
-        [0.5039],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4941, 0.5273, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:27,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 19:22:27,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.36 | bwd_microstep: 4641.00 | bwd_inner_microstep: 4636.12 | bwd_allreduce_microstep: 4.79 | step_microstep: 51.98
-[2025-01-25 19:22:27,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.31 | bwd: 4641.03 | bwd_inner: 4636.12 | bwd_allreduce: 4.83 | step: 52.00
- 65%|██████▌   | 3790/5800 [10:35:57<3:51:14,  6.90s/it]                                                        {'loss': 0.0254, 'grad_norm': 8.134085655212402, 'learning_rate': 1.1329513143459391e-05, 'epoch': 32.67}
- 65%|██████▌   | 3790/5800 [10:35:57<3:51:14,  6.90s/it]score1 tensor([[0.5898],
-        [0.5469],
-        [0.4082],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6055, 0.4258, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:34,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.37
-[2025-01-25 19:22:34,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.09 | bwd_microstep: 4632.82 | bwd_inner_microstep: 4627.84 | bwd_allreduce_microstep: 4.88 | step_microstep: 41.87
-[2025-01-25 19:22:34,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.04 | bwd: 4632.85 | bwd_inner: 4627.84 | bwd_allreduce: 4.93 | step: 41.88
- 65%|██████▌   | 3791/5800 [10:36:04<3:51:20,  6.91s/it]                                                        {'loss': 0.0249, 'grad_norm': 8.427865982055664, 'learning_rate': 1.131945043381053e-05, 'epoch': 32.68}
- 65%|██████▌   | 3791/5800 [10:36:04<3:51:20,  6.91s/it]score1 tensor([[0.4570],
-        [0.4199],
-        [0.5586],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4453, 0.5664, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:41,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 19:22:41,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.48 | bwd_microstep: 4633.10 | bwd_inner_microstep: 4628.06 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.80
-[2025-01-25 19:22:41,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.44 | bwd: 4633.12 | bwd_inner: 4628.06 | bwd_allreduce: 4.98 | step: 42.81
- 65%|██████▌   | 3792/5800 [10:36:11<3:51:15,  6.91s/it]                                                        {'loss': 0.0137, 'grad_norm': 7.90949010848999, 'learning_rate': 1.130939043090764e-05, 'epoch': 32.69}
- 65%|██████▌   | 3792/5800 [10:36:11<3:51:15,  6.91s/it]score1 tensor([[0.4531],
-        [0.4609],
-        [0.5703],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4941, 0.5469, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:48,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:22:48,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.94 | bwd_microstep: 4630.41 | bwd_inner_microstep: 4625.59 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.88
-[2025-01-25 19:22:48,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.89 | bwd: 4630.44 | bwd_inner: 4625.59 | bwd_allreduce: 4.78 | step: 43.89
- 65%|██████▌   | 3793/5800 [10:36:18<3:51:09,  6.91s/it]                                                        {'loss': 0.0239, 'grad_norm': 3.7489004135131836, 'learning_rate': 1.1299333137887613e-05, 'epoch': 32.7}
- 65%|██████▌   | 3793/5800 [10:36:18<3:51:09,  6.91s/it]score1 tensor([[0.5156],
-        [0.6094],
-        [0.5586],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6211, 0.5703, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:22:55,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:22:55,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.18 | bwd_microstep: 4632.49 | bwd_inner_microstep: 4628.08 | bwd_allreduce_microstep: 4.31 | step_microstep: 46.01
-[2025-01-25 19:22:55,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.15 | bwd: 4632.51 | bwd_inner: 4628.08 | bwd_allreduce: 4.36 | step: 46.03
- 65%|██████▌   | 3794/5800 [10:36:25<3:51:06,  6.91s/it]                                                        {'loss': 0.0137, 'grad_norm': 8.620540618896484, 'learning_rate': 1.1289278557886476e-05, 'epoch': 32.71}
- 65%|██████▌   | 3794/5800 [10:36:25<3:51:06,  6.91s/it]score1 tensor([[0.4199],
-        [0.4258],
-        [0.4316],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4551, 0.4375, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:02,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 19:23:02,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.89 | bwd_microstep: 4633.83 | bwd_inner_microstep: 4628.50 | bwd_allreduce_microstep: 5.20 | step_microstep: 45.67
-[2025-01-25 19:23:02,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.85 | bwd: 4633.85 | bwd_inner: 4628.50 | bwd_allreduce: 5.26 | step: 45.68
- 65%|██████▌   | 3795/5800 [10:36:31<3:50:58,  6.91s/it]                                                        {'loss': 0.0146, 'grad_norm': 3.8394851684570312, 'learning_rate': 1.1279226694039439e-05, 'epoch': 32.72}
- 65%|██████▌   | 3795/5800 [10:36:31<3:50:58,  6.91s/it]score1 tensor([[0.5391],
-        [0.3926],
-        [0.4551],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4043, 0.4668, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:08,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 19:23:08,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.36 | bwd_microstep: 4643.87 | bwd_inner_microstep: 4638.50 | bwd_allreduce_microstep: 5.30 | step_microstep: 42.75
-[2025-01-25 19:23:08,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.31 | bwd: 4643.89 | bwd_inner: 4638.50 | bwd_allreduce: 5.34 | step: 42.76
- 65%|██████▌   | 3796/5800 [10:36:38<3:50:58,  6.92s/it]                                                        {'loss': 0.0249, 'grad_norm': 7.649205207824707, 'learning_rate': 1.1269177549480836e-05, 'epoch': 32.72}
- 65%|██████▌   | 3796/5800 [10:36:38<3:50:58,  6.92s/it]score1 tensor([[0.4961],
-        [0.6367],
-        [0.3574],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6328, 0.3691, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:15,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 19:23:15,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4634.32 | bwd_inner_microstep: 4628.72 | bwd_allreduce_microstep: 5.49 | step_microstep: 46.59
-[2025-01-25 19:23:15,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4634.34 | bwd_inner: 4628.72 | bwd_allreduce: 5.54 | step: 46.60
- 65%|██████▌   | 3797/5800 [10:36:45<3:50:48,  6.91s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.755520224571228, 'learning_rate': 1.1259131127344158e-05, 'epoch': 32.73}
- 65%|██████▌   | 3797/5800 [10:36:45<3:50:48,  6.91s/it]score1 tensor([[0.6211],
-        [0.4160],
-        [0.4688],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4062, 0.4688, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:22,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.30 | optimizer_step: 4.37
-[2025-01-25 19:23:22,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.67 | bwd_microstep: 4590.27 | bwd_inner_microstep: 4584.56 | bwd_allreduce_microstep: 5.61 | step_microstep: 44.19
-[2025-01-25 19:23:22,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.63 | bwd: 4590.32 | bwd_inner: 4584.56 | bwd_allreduce: 5.66 | step: 44.20
- 65%|██████▌   | 3798/5800 [10:36:52<3:50:17,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.4710272550582886, 'learning_rate': 1.124908743076207e-05, 'epoch': 32.74}
- 65%|██████▌   | 3798/5800 [10:36:52<3:50:17,  6.90s/it]score1 tensor([[0.4512],
-        [0.5078],
-        [0.3926],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4941, 0.3750, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:29,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 19:23:29,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4638.63 | bwd_inner_microstep: 4633.77 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.49
-[2025-01-25 19:23:29,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.42 | bwd: 4638.66 | bwd_inner: 4633.77 | bwd_allreduce: 4.82 | step: 43.50
- 66%|██████▌   | 3799/5800 [10:36:59<3:50:22,  6.91s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.5898258090019226, 'learning_rate': 1.1239046462866354e-05, 'epoch': 32.75}
- 66%|██████▌   | 3799/5800 [10:36:59<3:50:22,  6.91s/it]score1 tensor([[0.4570],
-        [0.4141],
-        [0.4883],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.3945, 0.4980, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:36,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.06 | optimizer_step: 4.37
-[2025-01-25 19:23:36,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.50 | bwd_microstep: 4637.71 | bwd_inner_microstep: 4632.68 | bwd_allreduce_microstep: 4.93 | step_microstep: 45.77
-[2025-01-25 19:23:36,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.47 | bwd: 4637.73 | bwd_inner: 4632.68 | bwd_allreduce: 4.98 | step: 45.78
- 66%|██████▌   | 3800/5800 [10:37:06<3:50:26,  6.91s/it]                                                        {'loss': 0.0122, 'grad_norm': 0.7836993932723999, 'learning_rate': 1.1229008226787976e-05, 'epoch': 32.76}
- 66%|██████▌   | 3800/5800 [10:37:06<3:50:26,  6.91s/it]score1 tensor([[0.4277],
-        [0.5117],
-        [0.3965],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.5195, 0.3789, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:43,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 19:23:43,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.19 | bwd_microstep: 4642.59 | bwd_inner_microstep: 4637.77 | bwd_allreduce_microstep: 4.75 | step_microstep: 43.47
-[2025-01-25 19:23:43,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.14 | bwd: 4642.62 | bwd_inner: 4637.77 | bwd_allreduce: 4.79 | step: 43.48
- 66%|██████▌   | 3801/5800 [10:37:13<3:50:29,  6.92s/it]                                                        {'loss': 0.0107, 'grad_norm': 4.01157808303833, 'learning_rate': 1.121897272565702e-05, 'epoch': 32.77}
- 66%|██████▌   | 3801/5800 [10:37:13<3:50:29,  6.92s/it]score1 tensor([[0.5156],
-        [0.6445],
-        [0.5977],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6367, 0.5742, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:50,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.37
-[2025-01-25 19:23:50,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.68 | bwd_microstep: 4649.83 | bwd_inner_microstep: 4644.37 | bwd_allreduce_microstep: 5.35 | step_microstep: 45.09
-[2025-01-25 19:23:50,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.63 | bwd: 4649.86 | bwd_inner: 4644.37 | bwd_allreduce: 5.41 | step: 45.09
- 66%|██████▌   | 3802/5800 [10:37:20<3:50:33,  6.92s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.649836540222168, 'learning_rate': 1.1208939962602728e-05, 'epoch': 32.78}
- 66%|██████▌   | 3802/5800 [10:37:20<3:50:33,  6.92s/it]score1 tensor([[0.4238],
-        [0.4980],
-        [0.3262],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4844, 0.3105, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:23:57,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 19:23:57,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.31 | bwd_microstep: 4631.38 | bwd_inner_microstep: 4626.47 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.82
-[2025-01-25 19:23:57,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.27 | bwd: 4631.40 | bwd_inner: 4626.47 | bwd_allreduce: 4.86 | step: 43.83
- 66%|██████▌   | 3803/5800 [10:37:27<3:50:20,  6.92s/it]                                                        {'loss': 0.021, 'grad_norm': 7.362717628479004, 'learning_rate': 1.1198909940753499e-05, 'epoch': 32.78}
- 66%|██████▌   | 3803/5800 [10:37:27<3:50:20,  6.92s/it]score1 tensor([[0.4688],
-        [0.4082],
-        [0.6250],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.3926, 0.6172, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:04,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:24:04,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.97 | bwd_microstep: 4640.67 | bwd_inner_microstep: 4635.66 | bwd_allreduce_microstep: 4.89 | step_microstep: 50.58
-[2025-01-25 19:24:04,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.91 | bwd: 4640.70 | bwd_inner: 4635.66 | bwd_allreduce: 4.94 | step: 50.59
- 66%|██████▌   | 3804/5800 [10:37:34<3:50:19,  6.92s/it]                                                        {'loss': 0.0103, 'grad_norm': 4.296973705291748, 'learning_rate': 1.1188882663236855e-05, 'epoch': 32.79}
- 66%|██████▌   | 3804/5800 [10:37:34<3:50:19,  6.92s/it]score1 tensor([[0.4824],
-        [0.4473],
-        [0.4844],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4492, 0.4863, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:11,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 19:24:11,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.84 | bwd_microstep: 4639.95 | bwd_inner_microstep: 4635.10 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.06
-[2025-01-25 19:24:11,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.80 | bwd: 4639.97 | bwd_inner: 4635.10 | bwd_allreduce: 4.80 | step: 42.06
- 66%|██████▌   | 3805/5800 [10:37:41<3:50:09,  6.92s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.3802395761013031, 'learning_rate': 1.117885813317949e-05, 'epoch': 32.8}
- 66%|██████▌   | 3805/5800 [10:37:41<3:50:09,  6.92s/it]score1 tensor([[0.3906],
-        [0.5781],
-        [0.3535],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.6016, 0.3438, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:18,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:24:18,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4630.34 | bwd_inner_microstep: 4625.60 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.25
-[2025-01-25 19:24:18,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.70 | bwd: 4630.36 | bwd_inner: 4625.59 | bwd_allreduce: 4.70 | step: 42.26
- 66%|██████▌   | 3806/5800 [10:37:48<3:49:55,  6.92s/it]                                                        {'loss': 0.0166, 'grad_norm': 1.4349561929702759, 'learning_rate': 1.1168836353707214e-05, 'epoch': 32.81}
- 66%|██████▌   | 3806/5800 [10:37:48<3:49:55,  6.92s/it]score1 tensor([[0.5508],
-        [0.4395],
-        [0.4902],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4258, 0.4824, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:25,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 19:24:25,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.65 | bwd_microstep: 4638.56 | bwd_inner_microstep: 4633.85 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.00
-[2025-01-25 19:24:25,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4638.59 | bwd_inner: 4633.85 | bwd_allreduce: 4.66 | step: 43.01
- 66%|██████▌   | 3807/5800 [10:37:55<3:49:49,  6.92s/it]                                                        {'loss': 0.0103, 'grad_norm': 4.1084136962890625, 'learning_rate': 1.1158817327945001e-05, 'epoch': 32.82}
- 66%|██████▌   | 3807/5800 [10:37:55<3:49:49,  6.92s/it]score1 tensor([[0.6289],
-        [0.4492],
-        [0.4609],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4727, 0.4805, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:31,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 19:24:31,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.30 | bwd_microstep: 4641.85 | bwd_inner_microstep: 4637.12 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.69
-[2025-01-25 19:24:31,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.27 | bwd: 4641.87 | bwd_inner: 4637.12 | bwd_allreduce: 4.68 | step: 41.69
- 66%|██████▌   | 3808/5800 [10:38:01<3:49:46,  6.92s/it]                                                        {'loss': 0.0186, 'grad_norm': 8.07767105102539, 'learning_rate': 1.1148801059016958e-05, 'epoch': 32.83}
- 66%|██████▌   | 3808/5800 [10:38:01<3:49:46,  6.92s/it]score1 tensor([[0.5391],
-        [0.4688],
-        [0.4668],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4844, 0.4629, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:38,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 19:24:38,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.20 | bwd_microstep: 4586.09 | bwd_inner_microstep: 4580.54 | bwd_allreduce_microstep: 5.47 | step_microstep: 45.61
-[2025-01-25 19:24:38,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.17 | bwd: 4586.12 | bwd_inner: 4580.54 | bwd_allreduce: 5.51 | step: 45.61
- 66%|██████▌   | 3809/5800 [10:38:08<3:49:03,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 2.328625202178955, 'learning_rate': 1.1138787550046315e-05, 'epoch': 32.84}
- 66%|██████▌   | 3809/5800 [10:38:08<3:49:03,  6.90s/it]score1 tensor([[0.5391],
-        [0.3867],
-        [0.4590],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.3984, 0.4512, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:45,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 19:24:45,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.26 | bwd_microstep: 4638.00 | bwd_inner_microstep: 4632.30 | bwd_allreduce_microstep: 5.60 | step_microstep: 42.54
-[2025-01-25 19:24:45,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.23 | bwd: 4638.03 | bwd_inner: 4632.30 | bwd_allreduce: 5.65 | step: 42.54
- 66%|██████▌   | 3810/5800 [10:38:15<3:49:03,  6.91s/it]                                                        {'loss': 0.0249, 'grad_norm': 0.3789946138858795, 'learning_rate': 1.1128776804155479e-05, 'epoch': 32.84}
- 66%|██████▌   | 3810/5800 [10:38:15<3:49:03,  6.91s/it]score1 tensor([[0.4688],
-        [0.3730],
-        [0.6484],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3809, 0.6523, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:52,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 19:24:52,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.41 | bwd_microstep: 4636.35 | bwd_inner_microstep: 4631.33 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.78
-[2025-01-25 19:24:52,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.38 | bwd: 4636.38 | bwd_inner: 4631.33 | bwd_allreduce: 4.97 | step: 43.79
- 66%|██████▌   | 3811/5800 [10:38:22<3:49:08,  6.91s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.834791660308838, 'learning_rate': 1.1118768824465955e-05, 'epoch': 32.85}
- 66%|██████▌   | 3811/5800 [10:38:22<3:49:08,  6.91s/it]score1 tensor([[0.4004],
-        [0.4961],
-        [0.4160],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4980, 0.4336, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:24:59,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 19:24:59,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4635.73 | bwd_inner_microstep: 4630.44 | bwd_allreduce_microstep: 5.19 | step_microstep: 45.64
-[2025-01-25 19:24:59,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.98 | bwd: 4635.75 | bwd_inner: 4630.44 | bwd_allreduce: 5.24 | step: 45.65
- 66%|██████▌   | 3812/5800 [10:38:29<3:49:01,  6.91s/it]                                                        {'loss': 0.0078, 'grad_norm': 7.412498474121094, 'learning_rate': 1.1108763614098424e-05, 'epoch': 32.86}
- 66%|██████▌   | 3812/5800 [10:38:29<3:49:01,  6.91s/it]score1 tensor([[0.4512],
-        [0.6406],
-        [0.5391],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6406, 0.5391, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:06,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 19:25:06,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.08 | bwd_microstep: 4537.96 | bwd_inner_microstep: 4533.30 | bwd_allreduce_microstep: 4.59 | step_microstep: 42.75
-[2025-01-25 19:25:06,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.04 | bwd: 4537.99 | bwd_inner: 4533.30 | bwd_allreduce: 4.63 | step: 42.76
- 66%|██████▌   | 3813/5800 [10:38:36<3:48:01,  6.89s/it]                                                        {'loss': 0.0039, 'grad_norm': 3.9846351146698, 'learning_rate': 1.1098761176172666e-05, 'epoch': 32.87}
- 66%|██████▌   | 3813/5800 [10:38:36<3:48:01,  6.89s/it]score1 tensor([[0.4629],
-        [0.5664],
-        [0.4316],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5508, 0.4316, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:13,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.36 | optimizer_step: 4.36
-[2025-01-25 19:25:13,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.97 | bwd_microstep: 4596.13 | bwd_inner_microstep: 4591.11 | bwd_allreduce_microstep: 4.93 | step_microstep: 45.36
-[2025-01-25 19:25:13,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.94 | bwd: 4596.20 | bwd_inner: 4591.11 | bwd_allreduce: 4.99 | step: 45.37
- 66%|██████▌   | 3814/5800 [10:38:43<3:47:50,  6.88s/it]                                                        {'loss': 0.0132, 'grad_norm': 1.8828891515731812, 'learning_rate': 1.1088761513807622e-05, 'epoch': 32.88}
- 66%|██████▌   | 3814/5800 [10:38:43<3:47:50,  6.88s/it]score1 tensor([[0.5352],
-        [0.4941],
-        [0.5586],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5000, 0.5430, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:20,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 19:25:20,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.96 | bwd_microstep: 4645.23 | bwd_inner_microstep: 4640.26 | bwd_allreduce_microstep: 4.88 | step_microstep: 46.27
-[2025-01-25 19:25:20,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.94 | bwd: 4645.25 | bwd_inner: 4640.26 | bwd_allreduce: 4.92 | step: 46.28
- 66%|██████▌   | 3815/5800 [10:38:50<3:48:15,  6.90s/it]                                                        {'loss': 0.0117, 'grad_norm': 0.37345701456069946, 'learning_rate': 1.107876463012137e-05, 'epoch': 32.89}
- 66%|██████▌   | 3815/5800 [10:38:50<3:48:15,  6.90s/it]score1 tensor([[0.6094],
-        [0.5430],
-        [0.4395],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5391, 0.4297, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:27,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 19:25:27,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.93 | bwd_microstep: 4637.52 | bwd_inner_microstep: 4632.27 | bwd_allreduce_microstep: 5.18 | step_microstep: 52.14
-[2025-01-25 19:25:27,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.88 | bwd: 4637.54 | bwd_inner: 4632.27 | bwd_allreduce: 5.21 | step: 52.15
- 66%|██████▌   | 3816/5800 [10:38:57<3:48:27,  6.91s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.775454521179199, 'learning_rate': 1.1068770528231094e-05, 'epoch': 32.9}
- 66%|██████▌   | 3816/5800 [10:38:57<3:48:27,  6.91s/it]score1 tensor([[0.4883],
-        [0.4902],
-        [0.5352],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4688, 0.5391, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:34,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 19:25:34,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.17 | bwd_microstep: 4649.16 | bwd_inner_microstep: 4643.15 | bwd_allreduce_microstep: 5.91 | step_microstep: 47.43
-[2025-01-25 19:25:34,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.11 | bwd: 4649.18 | bwd_inner: 4643.15 | bwd_allreduce: 5.96 | step: 47.45
- 66%|██████▌   | 3817/5800 [10:39:04<3:48:45,  6.92s/it]                                                        {'loss': 0.0122, 'grad_norm': 4.155000686645508, 'learning_rate': 1.1058779211253148e-05, 'epoch': 32.91}
- 66%|██████▌   | 3817/5800 [10:39:04<3:48:45,  6.92s/it]score1 tensor([[0.5039],
-        [0.1914],
-        [0.4844],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.1787, 0.4648, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0183, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:41,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:25:41,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.27 | bwd_microstep: 4646.33 | bwd_inner_microstep: 4641.44 | bwd_allreduce_microstep: 4.78 | step_microstep: 41.58
-[2025-01-25 19:25:41,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.24 | bwd: 4646.36 | bwd_inner: 4641.44 | bwd_allreduce: 4.84 | step: 41.59
- 66%|██████▌   | 3818/5800 [10:39:10<3:48:44,  6.92s/it]                                                        {'loss': 0.0183, 'grad_norm': 7.51597785949707, 'learning_rate': 1.1048790682302978e-05, 'epoch': 32.91}
- 66%|██████▌   | 3818/5800 [10:39:10<3:48:44,  6.92s/it]score1 tensor([[0.6758],
-        [0.5977],
-        [0.6211],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5977, 0.6094, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:47,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 19:25:47,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2172.46 | bwd_microstep: 4593.46 | bwd_inner_microstep: 4588.63 | bwd_allreduce_microstep: 4.74 | step_microstep: 44.19
-[2025-01-25 19:25:47,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2172.43 | bwd: 4593.48 | bwd_inner: 4588.63 | bwd_allreduce: 4.78 | step: 44.19
- 66%|██████▌   | 3819/5800 [10:39:17<3:48:14,  6.91s/it]                                                        {'loss': 0.0117, 'grad_norm': 6.946545124053955, 'learning_rate': 1.10388049444952e-05, 'epoch': 32.92}
- 66%|██████▌   | 3819/5800 [10:39:17<3:48:14,  6.91s/it]score1 tensor([[0.4375],
-        [0.5938],
-        [0.5898],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5625, 0.5742, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:25:54,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 19:25:54,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.03 | bwd_microstep: 4585.36 | bwd_inner_microstep: 4580.43 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.09
-[2025-01-25 19:25:54,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.00 | bwd: 4585.39 | bwd_inner: 4580.44 | bwd_allreduce: 4.87 | step: 45.10
- 66%|██████▌   | 3820/5800 [10:39:24<3:47:40,  6.90s/it]                                                        {'loss': 0.0127, 'grad_norm': 6.331973552703857, 'learning_rate': 1.1028822000943529e-05, 'epoch': 32.93}
- 66%|██████▌   | 3820/5800 [10:39:24<3:47:40,  6.90s/it]score1 tensor([[0.5156],
-        [0.5352],
-        [0.4766],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5430, 0.4648, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:01,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 19:26:01,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4632.92 | bwd_inner_microstep: 4628.20 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.25
-[2025-01-25 19:26:01,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.68 | bwd: 4632.94 | bwd_inner: 4628.20 | bwd_allreduce: 4.68 | step: 42.26
- 66%|██████▌   | 3821/5800 [10:39:31<3:47:40,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.3783525228500366, 'learning_rate': 1.1018841854760814e-05, 'epoch': 32.94}
- 66%|██████▌   | 3821/5800 [10:39:31<3:47:40,  6.90s/it]score1 tensor([[0.4707],
-        [0.5703],
-        [0.5664],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5625, 0.5508, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:08,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 19:26:08,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.70 | bwd_microstep: 4633.50 | bwd_inner_microstep: 4629.05 | bwd_allreduce_microstep: 4.37 | step_microstep: 37.74
-[2025-01-25 19:26:08,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.66 | bwd: 4633.52 | bwd_inner: 4629.05 | bwd_allreduce: 4.41 | step: 37.74
- 66%|██████▌   | 3822/5800 [10:39:38<3:47:33,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.393133163452148, 'learning_rate': 1.1008864509059055e-05, 'epoch': 32.95}
- 66%|██████▌   | 3822/5800 [10:39:38<3:47:33,  6.90s/it]score1 tensor([[0.3965],
-        [0.4512],
-        [0.4766],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4609, 0.4941, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:15,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 19:26:15,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.92 | bwd_microstep: 4633.87 | bwd_inner_microstep: 4628.95 | bwd_allreduce_microstep: 4.84 | step_microstep: 46.95
-[2025-01-25 19:26:15,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.89 | bwd: 4633.89 | bwd_inner: 4628.95 | bwd_allreduce: 4.88 | step: 46.96
- 66%|██████▌   | 3823/5800 [10:39:45<3:47:39,  6.91s/it]                                                        {'loss': 0.0122, 'grad_norm': 3.713240623474121, 'learning_rate': 1.0998889966949342e-05, 'epoch': 32.96}
- 66%|██████▌   | 3823/5800 [10:39:45<3:47:39,  6.91s/it]score1 tensor([[0.3867],
-        [0.5273],
-        [0.6523],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3887, 0.5078, 0.6445, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:22,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 19:26:22,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.99 | bwd_microstep: 4592.67 | bwd_inner_microstep: 4587.83 | bwd_allreduce_microstep: 4.75 | step_microstep: 41.59
-[2025-01-25 19:26:22,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4592.69 | bwd_inner: 4587.83 | bwd_allreduce: 4.79 | step: 41.60
- 66%|██████▌   | 3824/5800 [10:39:52<3:47:06,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 2.799337863922119, 'learning_rate': 1.0988918231541928e-05, 'epoch': 32.97}
- 66%|██████▌   | 3824/5800 [10:39:52<3:47:06,  6.90s/it]score1 tensor([[0.6406],
-        [0.6367],
-        [0.5586],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6094, 0.5547, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:29,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 19:26:29,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.42 | bwd_microstep: 4634.95 | bwd_inner_microstep: 4630.33 | bwd_allreduce_microstep: 4.54 | step_microstep: 44.42
-[2025-01-25 19:26:29,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.39 | bwd: 4634.98 | bwd_inner: 4630.33 | bwd_allreduce: 4.58 | step: 44.43
- 66%|██████▌   | 3825/5800 [10:39:59<3:47:10,  6.90s/it]                                                        {'loss': 0.0122, 'grad_norm': 0.47848355770111084, 'learning_rate': 1.0978949305946157e-05, 'epoch': 32.97}
- 66%|██████▌   | 3825/5800 [10:39:59<3:47:10,  6.90s/it]score1 tensor([[0.4668],
-        [0.3652],
-        [0.4062],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3672, 0.4141, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:36,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 19:26:36,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.29 | bwd_microstep: 4597.96 | bwd_inner_microstep: 4593.01 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.46
-[2025-01-25 19:26:36,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.26 | bwd: 4597.99 | bwd_inner: 4593.01 | bwd_allreduce: 4.89 | step: 44.47
- 66%|██████▌   | 3826/5800 [10:40:06<3:46:49,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 5.408536434173584, 'learning_rate': 1.0968983193270531e-05, 'epoch': 32.98}
- 66%|██████▌   | 3826/5800 [10:40:06<3:46:49,  6.89s/it]score1 tensor([[0.4785],
-        [0.5977],
-        [0.5039],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6172, 0.5234, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:43,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 19:26:43,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4633.93 | bwd_inner_microstep: 4628.42 | bwd_allreduce_microstep: 5.43 | step_microstep: 48.59
-[2025-01-25 19:26:43,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.95 | bwd: 4633.95 | bwd_inner: 4628.42 | bwd_allreduce: 5.46 | step: 48.60
- 66%|██████▌   | 3827/5800 [10:40:13<3:46:54,  6.90s/it]                                                        {'loss': 0.0137, 'grad_norm': 4.257272720336914, 'learning_rate': 1.0959019896622647e-05, 'epoch': 32.99}
- 66%|██████▌   | 3827/5800 [10:40:13<3:46:54,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:26:47,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 19:26:47,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 572.36 | bwd_microstep: 1184.14 | bwd_inner_microstep: 1179.82 | bwd_allreduce_microstep: 4.23 | step_microstep: 42.54
-[2025-01-25 19:26:47,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 572.33 | bwd: 1184.17 | bwd_inner: 1179.82 | bwd_allreduce: 4.28 | step: 42.54
- 66%|██████▌   | 3828/5800 [10:40:17<3:26:45,  6.29s/it]                                                        {'loss': 0.0, 'grad_norm': 0.0, 'learning_rate': 1.0949059419109225e-05, 'epoch': 33.0}
- 66%|██████▌   | 3828/5800 [10:40:17<3:26:45,  6.29s/it][2025-01-25 19:26:52,508] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:27:04,007] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:27:14,156] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:27:24,322] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4004],
-        [0.6406],
-        [0.4375],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.6484, 0.4473, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:27:42,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 19:27:42,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.47 | bwd_microstep: 4602.61 | bwd_inner_microstep: 4597.64 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.11
-[2025-01-25 19:27:42,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.43 | bwd: 4602.64 | bwd_inner: 4597.64 | bwd_allreduce: 4.91 | step: 44.11
- 66%|██████▌   | 3829/5800 [10:41:12<11:19:07, 20.67s/it]                                                         {'loss': 0.0137, 'grad_norm': 7.950127124786377, 'learning_rate': 1.0939101763836129e-05, 'epoch': 33.01}
- 66%|██████▌   | 3829/5800 [10:41:12<11:19:07, 20.67s/it]score1 tensor([[0.4941],
-        [0.6055],
-        [0.4375],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6133, 0.4375, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:27:48,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 19:27:48,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.46 | bwd_microstep: 4531.85 | bwd_inner_microstep: 4526.09 | bwd_allreduce_microstep: 5.63 | step_microstep: 47.35
-[2025-01-25 19:27:48,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.42 | bwd: 4531.88 | bwd_inner: 4526.09 | bwd_allreduce: 5.70 | step: 47.36
- 66%|██████▌   | 3830/5800 [10:41:18<9:01:59, 16.51s/it]                                                         {'loss': 0.0107, 'grad_norm': 6.116821765899658, 'learning_rate': 1.0929146933908318e-05, 'epoch': 33.02}
- 66%|██████▌   | 3830/5800 [10:41:18<9:01:59, 16.51s/it]score1 tensor([[0.4141],
-        [0.4512],
-        [0.6289],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.4570, 0.6289, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:27:55,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 19:27:55,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.97 | bwd_microstep: 4542.01 | bwd_inner_microstep: 4536.72 | bwd_allreduce_microstep: 5.21 | step_microstep: 44.78
-[2025-01-25 19:27:55,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.93 | bwd: 4542.04 | bwd_inner: 4536.72 | bwd_allreduce: 5.24 | step: 44.78
- 66%|██████▌   | 3831/5800 [10:41:25<7:26:13, 13.60s/it]                                                        {'loss': 0.0054, 'grad_norm': 1.8364144563674927, 'learning_rate': 1.0919194932429895e-05, 'epoch': 33.03}
- 66%|██████▌   | 3831/5800 [10:41:25<7:26:13, 13.60s/it]score1 tensor([[0.6562],
-        [0.3633],
-        [0.5977],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.3652, 0.5938, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:02,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 19:28:02,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.77 | bwd_microstep: 4537.33 | bwd_inner_microstep: 4532.49 | bwd_allreduce_microstep: 4.76 | step_microstep: 41.71
-[2025-01-25 19:28:02,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.74 | bwd: 4537.36 | bwd_inner: 4532.49 | bwd_allreduce: 4.80 | step: 41.71
- 66%|██████▌   | 3832/5800 [10:41:32<6:19:03, 11.56s/it]                                                        {'loss': 0.0024, 'grad_norm': 1.609319806098938, 'learning_rate': 1.0909245762504055e-05, 'epoch': 33.03}
- 66%|██████▌   | 3832/5800 [10:41:32<6:19:03, 11.56s/it]score1 tensor([[0.4551],
-        [0.4707],
-        [0.4180],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4766, 0.4238, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:09,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.04 | optimizer_step: 4.36
-[2025-01-25 19:28:09,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.28 | bwd_microstep: 4601.06 | bwd_inner_microstep: 4595.64 | bwd_allreduce_microstep: 5.30 | step_microstep: 45.30
-[2025-01-25 19:28:09,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.24 | bwd: 4601.09 | bwd_inner: 4595.64 | bwd_allreduce: 5.37 | step: 45.30
- 66%|██████▌   | 3833/5800 [10:41:39<5:32:47, 10.15s/it]                                                        {'loss': 0.0073, 'grad_norm': 0.3673425614833832, 'learning_rate': 1.0899299427233118e-05, 'epoch': 33.04}
- 66%|██████▌   | 3833/5800 [10:41:39<5:32:47, 10.15s/it]score1 tensor([[0.5586],
-        [0.4688],
-        [0.5781],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4707, 0.5820, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:16,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:28:16,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.95 | bwd_microstep: 4602.87 | bwd_inner_microstep: 4597.68 | bwd_allreduce_microstep: 5.09 | step_microstep: 42.96
-[2025-01-25 19:28:16,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.92 | bwd: 4602.89 | bwd_inner: 4597.68 | bwd_allreduce: 5.13 | step: 42.97
- 66%|██████▌   | 3834/5800 [10:41:46<5:00:24,  9.17s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.4691700041294098, 'learning_rate': 1.0889355929718541e-05, 'epoch': 33.05}
- 66%|██████▌   | 3834/5800 [10:41:46<5:00:24,  9.17s/it]score1 tensor([[0.4453],
-        [0.5000],
-        [0.5273],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.5078, 0.5195, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:23,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 19:28:23,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.25 | bwd_microstep: 4612.17 | bwd_inner_microstep: 4607.36 | bwd_allreduce_microstep: 4.71 | step_microstep: 44.10
-[2025-01-25 19:28:23,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.21 | bwd: 4612.20 | bwd_inner: 4607.36 | bwd_allreduce: 4.76 | step: 44.11
- 66%|██████▌   | 3835/5800 [10:41:53<4:37:49,  8.48s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.428987056016922, 'learning_rate': 1.0879415273060858e-05, 'epoch': 33.06}
- 66%|██████▌   | 3835/5800 [10:41:53<4:37:49,  8.48s/it]score1 tensor([[0.5625],
-        [0.4336],
-        [0.4707],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4258, 0.4727, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:30,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 19:28:30,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.81 | bwd_microstep: 4547.45 | bwd_inner_microstep: 4542.10 | bwd_allreduce_microstep: 5.24 | step_microstep: 46.08
-[2025-01-25 19:28:30,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.78 | bwd: 4547.49 | bwd_inner: 4542.10 | bwd_allreduce: 5.30 | step: 46.08
- 66%|██████▌   | 3836/5800 [10:42:00<4:21:24,  7.99s/it]                                                        {'loss': 0.0054, 'grad_norm': 2.280937910079956, 'learning_rate': 1.0869477460359756e-05, 'epoch': 33.07}
- 66%|██████▌   | 3836/5800 [10:42:00<4:21:24,  7.99s/it]score1 tensor([[0.4316],
-        [0.5820],
-        [0.5742],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5781, 0.5508, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:36,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 19:28:36,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.99 | bwd_microstep: 4598.38 | bwd_inner_microstep: 4592.75 | bwd_allreduce_microstep: 5.52 | step_microstep: 42.95
-[2025-01-25 19:28:36,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.96 | bwd: 4598.41 | bwd_inner: 4592.75 | bwd_allreduce: 5.57 | step: 42.96
- 66%|██████▌   | 3837/5800 [10:42:06<4:10:17,  7.65s/it]                                                        {'loss': 0.019, 'grad_norm': 8.343259811401367, 'learning_rate': 1.0859542494714002e-05, 'epoch': 33.08}
- 66%|██████▌   | 3837/5800 [10:42:06<4:10:17,  7.65s/it]score1 tensor([[0.5547],
-        [0.5312],
-        [0.3652],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5039, 0.3340, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:43,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 19:28:43,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.83 | bwd_microstep: 4546.54 | bwd_inner_microstep: 4541.89 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.74
-[2025-01-25 19:28:43,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.79 | bwd: 4546.59 | bwd_inner: 4541.89 | bwd_allreduce: 4.61 | step: 43.75
- 66%|██████▌   | 3838/5800 [10:42:13<4:02:02,  7.40s/it]                                                        {'loss': 0.0186, 'grad_norm': 5.8495635986328125, 'learning_rate': 1.0849610379221508e-05, 'epoch': 33.09}
- 66%|██████▌   | 3838/5800 [10:42:13<4:02:02,  7.40s/it]score1 tensor([[0.5000],
-        [0.3848],
-        [0.5352],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3711, 0.5117, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:50,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 19:28:50,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.32 | bwd_microstep: 4605.95 | bwd_inner_microstep: 4601.01 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.10
-[2025-01-25 19:28:50,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.27 | bwd: 4605.97 | bwd_inner: 4601.01 | bwd_allreduce: 4.89 | step: 44.11
- 66%|██████▌   | 3839/5800 [10:42:20<3:56:45,  7.24s/it]                                                        {'loss': 0.0166, 'grad_norm': 7.928177833557129, 'learning_rate': 1.083968111697927e-05, 'epoch': 33.09}
- 66%|██████▌   | 3839/5800 [10:42:20<3:56:45,  7.24s/it]score1 tensor([[0.6445],
-        [0.5625],
-        [0.5195],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5664, 0.5156, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:28:57,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 19:28:57,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.29 | bwd_microstep: 4564.30 | bwd_inner_microstep: 4558.63 | bwd_allreduce_microstep: 5.58 | step_microstep: 46.23
-[2025-01-25 19:28:57,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.26 | bwd: 4564.33 | bwd_inner: 4558.63 | bwd_allreduce: 5.63 | step: 46.24
- 66%|██████▌   | 3840/5800 [10:42:27<3:52:43,  7.12s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.2632551193237305, 'learning_rate': 1.08297547110834e-05, 'epoch': 33.1}
- 66%|██████▌   | 3840/5800 [10:42:27<3:52:43,  7.12s/it]score1 tensor([[0.5664],
-        [0.4453],
-        [0.6094],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4238, 0.6016, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:04,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 19:29:04,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.45 | bwd_microstep: 4601.63 | bwd_inner_microstep: 4596.87 | bwd_allreduce_microstep: 4.69 | step_microstep: 47.48
-[2025-01-25 19:29:04,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.41 | bwd: 4601.66 | bwd_inner: 4596.87 | bwd_allreduce: 4.73 | step: 47.50
- 66%|██████▌   | 3841/5800 [10:42:34<3:50:13,  7.05s/it]                                                        {'loss': 0.0137, 'grad_norm': 8.033099174499512, 'learning_rate': 1.0819831164629133e-05, 'epoch': 33.11}
- 66%|██████▌   | 3841/5800 [10:42:34<3:50:13,  7.05s/it]score1 tensor([[0.6094],
-        [0.6172],
-        [0.5273],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6055, 0.5352, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:11,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 19:29:11,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.81 | bwd_microstep: 4556.87 | bwd_inner_microstep: 4551.55 | bwd_allreduce_microstep: 5.22 | step_microstep: 45.37
-[2025-01-25 19:29:11,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.78 | bwd: 4556.90 | bwd_inner: 4551.55 | bwd_allreduce: 5.27 | step: 45.38
- 66%|██████▌   | 3842/5800 [10:42:41<3:47:56,  6.98s/it]                                                        {'loss': 0.0054, 'grad_norm': 2.1439969539642334, 'learning_rate': 1.0809910480710793e-05, 'epoch': 33.12}
- 66%|██████▌   | 3842/5800 [10:42:41<3:47:56,  6.98s/it]score1 tensor([[0.4785],
-        [0.6211],
-        [0.4883],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6250, 0.4941, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:18,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 19:29:18,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.40 | bwd_microstep: 4613.52 | bwd_inner_microstep: 4608.28 | bwd_allreduce_microstep: 5.13 | step_microstep: 44.48
-[2025-01-25 19:29:18,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.35 | bwd: 4613.54 | bwd_inner: 4608.28 | bwd_allreduce: 5.19 | step: 44.49
- 66%|██████▋   | 3843/5800 [10:42:48<3:46:52,  6.96s/it]                                                        {'loss': 0.0093, 'grad_norm': 8.321661949157715, 'learning_rate': 1.0799992662421835e-05, 'epoch': 33.13}
- 66%|██████▋   | 3843/5800 [10:42:48<3:46:52,  6.96s/it]score1 tensor([[0.6836],
-        [0.3730],
-        [0.4512],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.3809, 0.4648, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:24,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 7.91
-[2025-01-25 19:29:24,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.04 | bwd_microstep: 4608.95 | bwd_inner_microstep: 4603.71 | bwd_allreduce_microstep: 5.14 | step_microstep: 48.88
-[2025-01-25 19:29:24,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.00 | bwd: 4609.00 | bwd_inner: 4603.71 | bwd_allreduce: 5.19 | step: 48.89
- 66%|██████▋   | 3844/5800 [10:42:54<3:46:06,  6.94s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.077828407287598, 'learning_rate': 1.07900777128548e-05, 'epoch': 33.14}
- 66%|██████▋   | 3844/5800 [10:42:54<3:46:06,  6.94s/it]score1 tensor([[0.4082],
-        [0.4941],
-        [0.4766],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5039, 0.5000, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:31,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:29:31,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.05 | bwd_microstep: 4607.51 | bwd_inner_microstep: 4602.14 | bwd_allreduce_microstep: 5.25 | step_microstep: 46.34
-[2025-01-25 19:29:31,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.03 | bwd: 4607.53 | bwd_inner: 4602.14 | bwd_allreduce: 5.31 | step: 46.35
- 66%|██████▋   | 3845/5800 [10:43:01<3:45:27,  6.92s/it]                                                        {'loss': 0.0161, 'grad_norm': 4.3888068199157715, 'learning_rate': 1.0780165635101328e-05, 'epoch': 33.15}
- 66%|██████▋   | 3845/5800 [10:43:01<3:45:27,  6.92s/it]score1 tensor([[0.4688],
-        [0.4160],
-        [0.5039],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4297, 0.5352, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:38,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 19:29:38,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.44 | bwd_microstep: 4607.56 | bwd_inner_microstep: 4602.53 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.84
-[2025-01-25 19:29:38,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.40 | bwd: 4607.59 | bwd_inner: 4602.53 | bwd_allreduce: 4.99 | step: 44.85
- 66%|██████▋   | 3846/5800 [10:43:08<3:44:57,  6.91s/it]                                                        {'loss': 0.0181, 'grad_norm': 7.691963195800781, 'learning_rate': 1.0770256432252193e-05, 'epoch': 33.16}
- 66%|██████▋   | 3846/5800 [10:43:08<3:44:57,  6.91s/it]score1 tensor([[0.4668],
-        [0.4160],
-        [0.5117],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4180, 0.5391, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:45,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 19:29:45,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.75 | bwd_microstep: 4611.88 | bwd_inner_microstep: 4606.56 | bwd_allreduce_microstep: 5.23 | step_microstep: 43.93
-[2025-01-25 19:29:45,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.72 | bwd: 4611.90 | bwd_inner: 4606.56 | bwd_allreduce: 5.27 | step: 43.94
- 66%|██████▋   | 3847/5800 [10:43:15<3:44:37,  6.90s/it]                                                        {'loss': 0.0127, 'grad_norm': 3.942983388900757, 'learning_rate': 1.0760350107397257e-05, 'epoch': 33.16}
- 66%|██████▋   | 3847/5800 [10:43:15<3:44:37,  6.90s/it]score1 tensor([[0.4727],
-        [0.4082],
-        [0.5234],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3984, 0.5391, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:52,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:29:52,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.83 | bwd_microstep: 4614.21 | bwd_inner_microstep: 4609.04 | bwd_allreduce_microstep: 5.05 | step_microstep: 45.01
-[2025-01-25 19:29:52,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.78 | bwd: 4614.23 | bwd_inner: 4609.04 | bwd_allreduce: 5.11 | step: 45.02
- 66%|██████▋   | 3848/5800 [10:43:22<3:44:32,  6.90s/it]                                                        {'loss': 0.0132, 'grad_norm': 4.082945823669434, 'learning_rate': 1.0750446663625476e-05, 'epoch': 33.17}
- 66%|██████▋   | 3848/5800 [10:43:22<3:44:32,  6.90s/it]score1 tensor([[0.3867],
-        [0.5508],
-        [0.4316],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.5586, 0.4512, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:29:59,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 19:29:59,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.28 | bwd_microstep: 4610.50 | bwd_inner_microstep: 4604.99 | bwd_allreduce_microstep: 5.39 | step_microstep: 46.14
-[2025-01-25 19:29:59,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.25 | bwd: 4610.52 | bwd_inner: 4604.99 | bwd_allreduce: 5.45 | step: 46.14
- 66%|██████▋   | 3849/5800 [10:43:29<3:44:17,  6.90s/it]                                                        {'loss': 0.0112, 'grad_norm': 4.45713996887207, 'learning_rate': 1.0740546104024923e-05, 'epoch': 33.18}
- 66%|██████▋   | 3849/5800 [10:43:29<3:44:17,  6.90s/it]score1 tensor([[0.5039],
-        [0.6641],
-        [0.6367],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6797, 0.6484, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:06,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 19:30:06,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.42 | bwd_microstep: 4559.78 | bwd_inner_microstep: 4554.88 | bwd_allreduce_microstep: 4.81 | step_microstep: 44.07
-[2025-01-25 19:30:06,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.38 | bwd: 4559.80 | bwd_inner: 4554.88 | bwd_allreduce: 4.85 | step: 44.07
- 66%|██████▋   | 3850/5800 [10:43:36<3:43:37,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 3.0215256214141846, 'learning_rate': 1.0730648431682754e-05, 'epoch': 33.19}
- 66%|██████▋   | 3850/5800 [10:43:36<3:43:37,  6.88s/it]score1 tensor([[0.5586],
-        [0.4316],
-        [0.5508],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4316, 0.5508, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:12,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 19:30:12,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.50 | bwd_microstep: 4500.17 | bwd_inner_microstep: 4493.24 | bwd_allreduce_microstep: 6.79 | step_microstep: 46.70
-[2025-01-25 19:30:12,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.47 | bwd: 4500.20 | bwd_inner: 4493.24 | bwd_allreduce: 6.86 | step: 46.71
- 66%|██████▋   | 3851/5800 [10:43:42<3:42:36,  6.85s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.1567885875701904, 'learning_rate': 1.0720753649685247e-05, 'epoch': 33.2}
- 66%|██████▋   | 3851/5800 [10:43:42<3:42:36,  6.85s/it]score1 tensor([[0.6172],
-        [0.5508],
-        [0.3496],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5508, 0.3418, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:19,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 19:30:19,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.97 | bwd_microstep: 4566.09 | bwd_inner_microstep: 4561.03 | bwd_allreduce_microstep: 4.95 | step_microstep: 45.70
-[2025-01-25 19:30:19,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4566.11 | bwd_inner: 4561.03 | bwd_allreduce: 5.01 | step: 45.71
- 66%|██████▋   | 3852/5800 [10:43:49<3:42:25,  6.85s/it]                                                        {'loss': 0.0142, 'grad_norm': 1.440900206565857, 'learning_rate': 1.0710861761117756e-05, 'epoch': 33.21}
- 66%|██████▋   | 3852/5800 [10:43:49<3:42:25,  6.85s/it]score1 tensor([[0.4883],
-        [0.5195],
-        [0.5977],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5352, 0.5898, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:26,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:30:26,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.28 | bwd_microstep: 4610.65 | bwd_inner_microstep: 4605.14 | bwd_allreduce_microstep: 5.42 | step_microstep: 47.61
-[2025-01-25 19:30:26,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.24 | bwd: 4610.68 | bwd_inner: 4605.14 | bwd_allreduce: 5.47 | step: 47.62
- 66%|██████▋   | 3853/5800 [10:43:56<3:42:54,  6.87s/it]                                                        {'loss': 0.0137, 'grad_norm': 4.214231967926025, 'learning_rate': 1.0700972769064755e-05, 'epoch': 33.22}
- 66%|██████▋   | 3853/5800 [10:43:56<3:42:54,  6.87s/it]score1 tensor([[0.5742],
-        [0.5039],
-        [0.4727],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4922, 0.4453, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0205, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:33,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 19:30:33,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.84 | bwd_microstep: 4614.97 | bwd_inner_microstep: 4609.58 | bwd_allreduce_microstep: 5.30 | step_microstep: 47.37
-[2025-01-25 19:30:33,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.81 | bwd: 4615.00 | bwd_inner: 4609.58 | bwd_allreduce: 5.34 | step: 47.37
- 66%|██████▋   | 3854/5800 [10:44:03<3:43:00,  6.88s/it]                                                        {'loss': 0.0205, 'grad_norm': 7.939457416534424, 'learning_rate': 1.069108667660979e-05, 'epoch': 33.22}
- 66%|██████▋   | 3854/5800 [10:44:03<3:43:00,  6.88s/it]score1 tensor([[0.5781],
-        [0.4219],
-        [0.6211],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4004, 0.6172, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:40,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 19:30:40,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.74 | bwd_microstep: 4565.86 | bwd_inner_microstep: 4561.02 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.89
-[2025-01-25 19:30:40,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.69 | bwd: 4565.88 | bwd_inner: 4561.02 | bwd_allreduce: 4.80 | step: 45.90
- 66%|██████▋   | 3855/5800 [10:44:10<3:42:38,  6.87s/it]                                                        {'loss': 0.0093, 'grad_norm': 6.323728561401367, 'learning_rate': 1.0681203486835527e-05, 'epoch': 33.23}
- 66%|██████▋   | 3855/5800 [10:44:10<3:42:38,  6.87s/it]score1 tensor([[0.5234],
-        [0.4727],
-        [0.5625],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4629, 0.5469, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:47,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 19:30:47,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.84 | bwd_microstep: 4612.39 | bwd_inner_microstep: 4607.05 | bwd_allreduce_microstep: 5.23 | step_microstep: 46.15
-[2025-01-25 19:30:47,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.79 | bwd: 4612.42 | bwd_inner: 4607.05 | bwd_allreduce: 5.29 | step: 46.16
- 66%|██████▋   | 3856/5800 [10:44:17<3:42:43,  6.87s/it]                                                        {'loss': 0.0132, 'grad_norm': 3.8464086055755615, 'learning_rate': 1.067132320282371e-05, 'epoch': 33.24}
- 66%|██████▋   | 3856/5800 [10:44:17<3:42:43,  6.87s/it]score1 tensor([[0.6094],
-        [0.4863],
-        [0.3887],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4648, 0.3262, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0229, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:30:54,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 19:30:54,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.57 | bwd_microstep: 4573.73 | bwd_inner_microstep: 4568.09 | bwd_allreduce_microstep: 5.55 | step_microstep: 47.95
-[2025-01-25 19:30:54,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.53 | bwd: 4573.75 | bwd_inner: 4568.09 | bwd_allreduce: 5.60 | step: 47.96
- 66%|██████▋   | 3857/5800 [10:44:24<3:42:25,  6.87s/it]                                                        {'loss': 0.0229, 'grad_norm': 1.5397924184799194, 'learning_rate': 1.0661445827655187e-05, 'epoch': 33.25}
- 66%|██████▋   | 3857/5800 [10:44:24<3:42:25,  6.87s/it]score1 tensor([[0.5039],
-        [0.5508],
-        [0.4883],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4941, 0.4785, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:01,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.81 | optimizer_step: 4.36
-[2025-01-25 19:31:01,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.94 | bwd_microstep: 4625.34 | bwd_inner_microstep: 4619.49 | bwd_allreduce_microstep: 5.73 | step_microstep: 48.62
-[2025-01-25 19:31:01,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.90 | bwd: 4625.37 | bwd_inner: 4619.49 | bwd_allreduce: 5.79 | step: 48.62
- 67%|██████▋   | 3858/5800 [10:44:31<3:42:40,  6.88s/it]                                                        {'loss': 0.0225, 'grad_norm': 8.080657005310059, 'learning_rate': 1.0651571364409897e-05, 'epoch': 33.26}
- 67%|██████▋   | 3858/5800 [10:44:31<3:42:40,  6.88s/it]score1 tensor([[0.5820],
-        [0.4668],
-        [0.6250],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4590, 0.6094, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:08,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 19:31:08,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.37 | bwd_microstep: 4641.89 | bwd_inner_microstep: 4636.68 | bwd_allreduce_microstep: 5.09 | step_microstep: 42.21
-[2025-01-25 19:31:08,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.32 | bwd: 4641.91 | bwd_inner: 4636.67 | bwd_allreduce: 5.16 | step: 42.22
- 67%|██████▋   | 3859/5800 [10:44:38<3:42:56,  6.89s/it]                                                        {'loss': 0.0156, 'grad_norm': 8.70702075958252, 'learning_rate': 1.0641699816166856e-05, 'epoch': 33.27}
- 67%|██████▋   | 3859/5800 [10:44:38<3:42:56,  6.89s/it]score1 tensor([[0.6797],
-        [0.5430],
-        [0.5156],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5508, 0.4961, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:14,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 19:31:14,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.76 | bwd_microstep: 4637.40 | bwd_inner_microstep: 4631.86 | bwd_allreduce_microstep: 5.40 | step_microstep: 48.29
-[2025-01-25 19:31:14,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.73 | bwd: 4637.42 | bwd_inner: 4631.86 | bwd_allreduce: 5.46 | step: 48.29
- 67%|██████▋   | 3860/5800 [10:44:44<3:43:11,  6.90s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.845266342163086, 'learning_rate': 1.06318311860042e-05, 'epoch': 33.28}
- 67%|██████▋   | 3860/5800 [10:44:44<3:43:11,  6.90s/it]score1 tensor([[0.3535],
-        [0.5273],
-        [0.5156],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5430, 0.5078, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:21,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 19:31:21,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.32 | bwd_microstep: 4594.13 | bwd_inner_microstep: 4588.95 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.27
-[2025-01-25 19:31:21,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.28 | bwd: 4594.16 | bwd_inner: 4588.95 | bwd_allreduce: 5.14 | step: 43.28
- 67%|██████▋   | 3861/5800 [10:44:51<3:42:57,  6.90s/it]                                                        {'loss': 0.0112, 'grad_norm': 1.7403639554977417, 'learning_rate': 1.0621965476999123e-05, 'epoch': 33.28}
- 67%|██████▋   | 3861/5800 [10:44:51<3:42:57,  6.90s/it]score1 tensor([[0.5352],
-        [0.4766],
-        [0.5664],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4922, 0.5625, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:28,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:31:28,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.85 | bwd_microstep: 4631.30 | bwd_inner_microstep: 4625.80 | bwd_allreduce_microstep: 5.38 | step_microstep: 46.99
-[2025-01-25 19:31:28,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.81 | bwd: 4631.32 | bwd_inner: 4625.80 | bwd_allreduce: 5.45 | step: 47.02
- 67%|██████▋   | 3862/5800 [10:44:58<3:42:57,  6.90s/it]                                                        {'loss': 0.0127, 'grad_norm': 3.5582034587860107, 'learning_rate': 1.0612102692227944e-05, 'epoch': 33.29}
- 67%|██████▋   | 3862/5800 [10:44:58<3:42:57,  6.90s/it]score1 tensor([[0.3516],
-        [0.5625],
-        [0.5547],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5781, 0.5703, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:35,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 19:31:35,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.90 | bwd_microstep: 4637.84 | bwd_inner_microstep: 4630.10 | bwd_allreduce_microstep: 7.64 | step_microstep: 45.33
-[2025-01-25 19:31:35,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.85 | bwd: 4637.87 | bwd_inner: 4630.10 | bwd_allreduce: 7.69 | step: 45.34
- 67%|██████▋   | 3863/5800 [10:45:05<3:43:02,  6.91s/it]                                                        {'loss': 0.0127, 'grad_norm': 7.929969787597656, 'learning_rate': 1.060224283476604e-05, 'epoch': 33.3}
- 67%|██████▋   | 3863/5800 [10:45:05<3:43:02,  6.91s/it]score1 tensor([[0.5156],
-        [0.5430],
-        [0.5078],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5625, 0.4941, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:42,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 19:31:42,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.40 | bwd_microstep: 4591.91 | bwd_inner_microstep: 4586.54 | bwd_allreduce_microstep: 5.26 | step_microstep: 44.98
-[2025-01-25 19:31:42,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.36 | bwd: 4591.93 | bwd_inner: 4586.54 | bwd_allreduce: 5.32 | step: 45.00
- 67%|██████▋   | 3864/5800 [10:45:12<3:42:37,  6.90s/it]                                                        {'loss': 0.0112, 'grad_norm': 2.173621654510498, 'learning_rate': 1.0592385907687875e-05, 'epoch': 33.31}
- 67%|██████▋   | 3864/5800 [10:45:12<3:42:37,  6.90s/it]score1 tensor([[0.4062],
-        [0.4902],
-        [0.6719],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4863, 0.7031, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:49,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 19:31:49,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.22 | bwd_microstep: 4587.92 | bwd_inner_microstep: 4582.91 | bwd_allreduce_microstep: 4.91 | step_microstep: 49.36
-[2025-01-25 19:31:49,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.18 | bwd: 4587.95 | bwd_inner: 4582.91 | bwd_allreduce: 4.96 | step: 49.37
- 67%|██████▋   | 3865/5800 [10:45:19<3:42:14,  6.89s/it]                                                        {'loss': 0.0142, 'grad_norm': 2.2730965614318848, 'learning_rate': 1.0582531914067032e-05, 'epoch': 33.32}
- 67%|██████▋   | 3865/5800 [10:45:19<3:42:14,  6.89s/it]score1 tensor([[0.4844],
-        [0.5430],
-        [0.5977],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5664, 0.5898, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0171, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:31:56,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 19:31:56,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.68 | bwd_microstep: 4637.77 | bwd_inner_microstep: 4632.46 | bwd_allreduce_microstep: 5.18 | step_microstep: 44.62
-[2025-01-25 19:31:56,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.63 | bwd: 4637.79 | bwd_inner: 4632.46 | bwd_allreduce: 5.25 | step: 44.63
- 67%|██████▋   | 3866/5800 [10:45:26<3:42:29,  6.90s/it]                                                        {'loss': 0.0171, 'grad_norm': 4.233579158782959, 'learning_rate': 1.0572680856976131e-05, 'epoch': 33.33}
- 67%|██████▋   | 3866/5800 [10:45:26<3:42:29,  6.90s/it]score1 tensor([[0.6172],
-        [0.4980],
-        [0.4648],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.4941, 0.4512, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:03,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.37
-[2025-01-25 19:32:03,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.99 | bwd_microstep: 4599.86 | bwd_inner_microstep: 4589.90 | bwd_allreduce_microstep: 9.84 | step_microstep: 45.63
-[2025-01-25 19:32:03,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.95 | bwd: 4599.89 | bwd_inner: 4589.91 | bwd_allreduce: 9.90 | step: 45.63
- 67%|██████▋   | 3867/5800 [10:45:33<3:42:18,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 6.233340740203857, 'learning_rate': 1.0562832739486933e-05, 'epoch': 33.34}
- 67%|██���███▋   | 3867/5800 [10:45:33<3:42:18,  6.90s/it]score1 tensor([[0.4238],
-        [0.4199],
-        [0.4863],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4160, 0.4941, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:10,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 19:32:10,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.36 | bwd_microstep: 4640.40 | bwd_inner_microstep: 4634.99 | bwd_allreduce_microstep: 5.32 | step_microstep: 46.97
-[2025-01-25 19:32:10,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.32 | bwd: 4640.43 | bwd_inner: 4634.99 | bwd_allreduce: 5.36 | step: 46.98
- 67%|██████▋   | 3868/5800 [10:45:40<3:42:31,  6.91s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.249204635620117, 'learning_rate': 1.0552987564670221e-05, 'epoch': 33.34}
- 67%|██████▋   | 3868/5800 [10:45:40<3:42:31,  6.91s/it]score1 tensor([[0.4082],
-        [0.6289],
-        [0.5781],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.6445, 0.5664, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:17,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.14 | optimizer_step: 4.36
-[2025-01-25 19:32:17,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.71 | bwd_microstep: 4636.67 | bwd_inner_microstep: 4631.31 | bwd_allreduce_microstep: 5.26 | step_microstep: 46.79
-[2025-01-25 19:32:17,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.67 | bwd: 4636.70 | bwd_inner: 4631.31 | bwd_allreduce: 5.31 | step: 46.79
- 67%|██████▋   | 3869/5800 [10:45:47<3:42:37,  6.92s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.4522343873977661, 'learning_rate': 1.054314533559592e-05, 'epoch': 33.35}
- 67%|██████▋   | 3869/5800 [10:45:47<3:42:37,  6.92s/it]score1 tensor([[0.4590],
-        [0.4531],
-        [0.6680],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4551, 0.6562, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:24,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 19:32:24,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.66 | bwd_microstep: 4639.43 | bwd_inner_microstep: 4634.00 | bwd_allreduce_microstep: 5.31 | step_microstep: 44.92
-[2025-01-25 19:32:24,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.61 | bwd: 4639.45 | bwd_inner: 4634.00 | bwd_allreduce: 5.37 | step: 44.93
- 67%|██████▋   | 3870/5800 [10:45:54<3:42:41,  6.92s/it]                                                        {'loss': 0.0083, 'grad_norm': 4.409940242767334, 'learning_rate': 1.053330605533299e-05, 'epoch': 33.36}
- 67%|██████▋   | 3870/5800 [10:45:54<3:42:41,  6.92s/it]score1 tensor([[0.4805],
-        [0.5352],
-        [0.4551],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.5391, 0.4668, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:30,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 19:32:30,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.97 | bwd_microstep: 4545.80 | bwd_inner_microstep: 4539.93 | bwd_allreduce_microstep: 5.75 | step_microstep: 47.35
-[2025-01-25 19:32:30,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.93 | bwd: 4545.82 | bwd_inner: 4539.93 | bwd_allreduce: 5.81 | step: 47.36
- 67%|██████▋   | 3871/5800 [10:46:00<3:41:42,  6.90s/it]                                                        {'loss': 0.0039, 'grad_norm': 4.011567115783691, 'learning_rate': 1.0523469726949486e-05, 'epoch': 33.37}
- 67%|██████▋   | 3871/5800 [10:46:00<3:41:42,  6.90s/it]score1 tensor([[0.5352],
-        [0.5273],
-        [0.4688],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5078, 0.4863, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:37,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 19:32:37,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.56 | bwd_microstep: 4634.94 | bwd_inner_microstep: 4629.33 | bwd_allreduce_microstep: 5.47 | step_microstep: 49.24
-[2025-01-25 19:32:37,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.53 | bwd: 4634.96 | bwd_inner: 4629.33 | bwd_allreduce: 5.55 | step: 49.25
- 67%|██████▋   | 3872/5800 [10:46:07<3:41:54,  6.91s/it]                                                        {'loss': 0.0112, 'grad_norm': 3.9833240509033203, 'learning_rate': 1.0513636353512562e-05, 'epoch': 33.38}
- 67%|██████▋   | 3872/5800 [10:46:07<3:41:54,  6.91s/it]score1 tensor([[0.2969],
-        [0.4512],
-        [0.4512],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3223, 0.4609, 0.4453, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:44,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:32:44,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.93 | bwd_microstep: 4639.06 | bwd_inner_microstep: 4633.73 | bwd_allreduce_microstep: 5.24 | step_microstep: 43.51
-[2025-01-25 19:32:44,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.90 | bwd: 4639.09 | bwd_inner: 4633.73 | bwd_allreduce: 5.28 | step: 43.52
- 67%|██████▋   | 3873/5800 [10:46:14<3:41:59,  6.91s/it]                                                        {'loss': 0.0137, 'grad_norm': 0.6381581425666809, 'learning_rate': 1.0503805938088418e-05, 'epoch': 33.39}
- 67%|██████▋   | 3873/5800 [10:46:14<3:41:59,  6.91s/it]score1 tensor([[0.4785],
-        [0.5625],
-        [0.4805],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5469, 0.4824, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:51,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:32:51,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.08 | bwd_microstep: 4636.26 | bwd_inner_microstep: 4631.14 | bwd_allreduce_microstep: 5.02 | step_microstep: 43.24
-[2025-01-25 19:32:51,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.04 | bwd: 4636.28 | bwd_inner: 4631.14 | bwd_allreduce: 5.07 | step: 43.25
- 67%|██████▋   | 3874/5800 [10:46:21<3:41:59,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 1.6575900316238403, 'learning_rate': 1.0493978483742362e-05, 'epoch': 33.4}
- 67%|██████▋   | 3874/5800 [10:46:21<3:41:59,  6.92s/it]score1 tensor([[0.5039],
-        [0.4180],
-        [0.4336],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4082, 0.4199, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:32:58,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 19:32:58,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4644.22 | bwd_inner_microstep: 4638.73 | bwd_allreduce_microstep: 5.36 | step_microstep: 44.87
-[2025-01-25 19:32:58,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.97 | bwd: 4644.26 | bwd_inner: 4638.73 | bwd_allreduce: 5.43 | step: 44.87
- 67%|██████▋   | 3875/5800 [10:46:28<3:42:03,  6.92s/it]                                                        {'loss': 0.0103, 'grad_norm': 3.959643840789795, 'learning_rate': 1.0484153993538761e-05, 'epoch': 33.41}
- 67%|██████▋   | 3875/5800 [10:46:28<3:42:03,  6.92s/it]score1 tensor([[0.3359],
-        [0.5586],
-        [0.4824],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.5547, 0.4844, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:05,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 19:33:05,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.10 | bwd_microstep: 4632.15 | bwd_inner_microstep: 4627.23 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.58
-[2025-01-25 19:33:05,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.06 | bwd: 4632.17 | bwd_inner: 4627.24 | bwd_allreduce: 4.86 | step: 43.59
- 67%|██████▋   | 3876/5800 [10:46:35<3:41:56,  6.92s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.742626428604126, 'learning_rate': 1.047433247054105e-05, 'epoch': 33.41}
- 67%|██████▋   | 3876/5800 [10:46:35<3:41:56,  6.92s/it]score1 tensor([[0.3535],
-        [0.3848],
-        [0.5039],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.3867, 0.4980, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:12,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 19:33:12,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.93 | bwd_microstep: 4632.05 | bwd_inner_microstep: 4626.69 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.61
-[2025-01-25 19:33:12,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.89 | bwd: 4632.08 | bwd_inner: 4626.69 | bwd_allreduce: 5.30 | step: 45.62
- 67%|██████▋   | 3877/5800 [10:46:42<3:41:45,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.798818826675415, 'learning_rate': 1.0464513917811767e-05, 'epoch': 33.42}
- 67%|██████▋   | 3877/5800 [10:46:42<3:41:45,  6.92s/it]score1 tensor([[0.5938],
-        [0.5078],
-        [0.6172],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4844, 0.6133, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:19,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 19:33:19,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.39 | bwd_microstep: 4642.07 | bwd_inner_microstep: 4636.84 | bwd_allreduce_microstep: 5.13 | step_microstep: 48.75
-[2025-01-25 19:33:19,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.35 | bwd: 4642.10 | bwd_inner: 4636.84 | bwd_allreduce: 5.19 | step: 48.76
- 67%|██████▋   | 3878/5800 [10:46:49<3:41:43,  6.92s/it]                                                        {'loss': 0.0166, 'grad_norm': 8.722907066345215, 'learning_rate': 1.0454698338412493e-05, 'epoch': 33.43}
- 67%|██████▋   | 3878/5800 [10:46:49<3:41:43,  6.92s/it]score1 tensor([[0.4238],
-        [0.4512],
-        [0.4531],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4473, 0.4473, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:26,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 19:33:26,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4640.98 | bwd_inner_microstep: 4634.49 | bwd_allreduce_microstep: 6.38 | step_microstep: 47.00
-[2025-01-25 19:33:26,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.45 | bwd: 4641.02 | bwd_inner: 4634.49 | bwd_allreduce: 6.44 | step: 47.00
- 67%|██████▋   | 3879/5800 [10:46:56<3:41:37,  6.92s/it]                                                        {'loss': 0.0044, 'grad_norm': 0.45276111364364624, 'learning_rate': 1.0444885735403906e-05, 'epoch': 33.44}
- 67%|██████▋   | 3879/5800 [10:46:56<3:41:37,  6.92s/it]score1 tensor([[0.5352],
-        [0.5898],
-        [0.5469],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5664, 0.5430, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:33,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.37
-[2025-01-25 19:33:33,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.17 | bwd_microstep: 4634.52 | bwd_inner_microstep: 4629.27 | bwd_allreduce_microstep: 5.15 | step_microstep: 43.34
-[2025-01-25 19:33:33,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.14 | bwd: 4634.55 | bwd_inner: 4629.27 | bwd_allreduce: 5.19 | step: 43.35
- 67%|██████▋   | 3880/5800 [10:47:03<3:41:30,  6.92s/it]                                                        {'loss': 0.0127, 'grad_norm': 4.532679080963135, 'learning_rate': 1.0435076111845741e-05, 'epoch': 33.45}
- 67%|██████▋   | 3880/5800 [10:47:03<3:41:30,  6.92s/it]score1 tensor([[0.4219],
-        [0.4414],
-        [0.5117],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4551, 0.5234, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:40,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 19:33:40,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.37 | bwd_microstep: 4636.96 | bwd_inner_microstep: 4632.01 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.68
-[2025-01-25 19:33:40,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.33 | bwd: 4636.98 | bwd_inner: 4632.01 | bwd_allreduce: 4.91 | step: 44.69
- 67%|██████▋   | 3881/5800 [10:47:10<3:41:29,  6.93s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.5423934459686279, 'learning_rate': 1.0425269470796814e-05, 'epoch': 33.46}
- 67%|██████▋   | 3881/5800 [10:47:10<3:41:29,  6.93s/it]score1 tensor([[0.4336],
-        [0.5742],
-        [0.4824],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.5664, 0.4922, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:47,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 19:33:47,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.42 | bwd_microstep: 4631.83 | bwd_inner_microstep: 4626.08 | bwd_allreduce_microstep: 5.61 | step_microstep: 48.94
-[2025-01-25 19:33:47,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.38 | bwd: 4631.86 | bwd_inner: 4626.08 | bwd_allreduce: 5.68 | step: 48.94
- 67%|██████▋   | 3882/5800 [10:47:17<3:41:23,  6.93s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.875418186187744, 'learning_rate': 1.0415465815314995e-05, 'epoch': 33.47}
- 67%|██████▋   | 3882/5800 [10:47:17<3:41:23,  6.93s/it]score1 tensor([[0.4414],
-        [0.4961],
-        [0.3945],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4844, 0.4160, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:33:54,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 19:33:54,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.36 | bwd_microstep: 4636.84 | bwd_inner_microstep: 4631.22 | bwd_allreduce_microstep: 5.52 | step_microstep: 45.09
-[2025-01-25 19:33:54,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.32 | bwd: 4636.87 | bwd_inner: 4631.22 | bwd_allreduce: 5.57 | step: 45.11
- 67%|██████▋   | 3883/5800 [10:47:24<3:41:14,  6.92s/it]                                                        {'loss': 0.0107, 'grad_norm': 3.5362300872802734, 'learning_rate': 1.0405665148457249e-05, 'epoch': 33.47}
- 67%|██████▋   | 3883/5800 [10:47:24<3:41:14,  6.92s/it]score1 tensor([[0.5586],
-        [0.4121],
-        [0.4297],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4160, 0.4375, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:00,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 19:34:00,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.47 | bwd_microstep: 4640.84 | bwd_inner_microstep: 4635.40 | bwd_allreduce_microstep: 5.35 | step_microstep: 50.47
-[2025-01-25 19:34:00,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.42 | bwd: 4640.89 | bwd_inner: 4635.40 | bwd_allreduce: 5.40 | step: 50.49
- 67%|██████▋   | 3884/5800 [10:47:30<3:41:15,  6.93s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.049204349517822, 'learning_rate': 1.0395867473279578e-05, 'epoch': 33.48}
- 67%|██████▋   | 3884/5800 [10:47:30<3:41:15,  6.93s/it]score1 tensor([[0.4863],
-        [0.4844],
-        [0.5508],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4922, 0.5391, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:07,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 19:34:07,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.95 | bwd_microstep: 4635.20 | bwd_inner_microstep: 4629.79 | bwd_allreduce_microstep: 5.32 | step_microstep: 47.61
-[2025-01-25 19:34:07,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.92 | bwd: 4635.22 | bwd_inner: 4629.79 | bwd_allreduce: 5.36 | step: 47.61
- 67%|██████▋   | 3885/5800 [10:47:37<3:41:06,  6.93s/it]                                                        {'loss': 0.0122, 'grad_norm': 3.6336448192596436, 'learning_rate': 1.0386072792837083e-05, 'epoch': 33.49}
- 67%|██████▋   | 3885/5800 [10:47:37<3:41:06,  6.93s/it]score1 tensor([[0.4648],
-        [0.4961],
-        [0.3770],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.4941, 0.3750, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:14,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 19:34:14,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.03 | bwd_microstep: 4636.12 | bwd_inner_microstep: 4631.08 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.48
-[2025-01-25 19:34:14,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.00 | bwd: 4636.14 | bwd_inner: 4631.08 | bwd_allreduce: 5.00 | step: 44.48
- 67%|██████▋   | 3886/5800 [10:47:44<3:40:56,  6.93s/it]                                                        {'loss': 0.0146, 'grad_norm': 0.49273985624313354, 'learning_rate': 1.03762811101839e-05, 'epoch': 33.5}
- 67%|██████▋   | 3886/5800 [10:47:44<3:40:56,  6.93s/it]score1 tensor([[0.5469],
-        [0.4980],
-        [0.5508],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5156, 0.6055, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0347, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:21,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 19:34:21,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.33 | bwd_microstep: 4639.31 | bwd_inner_microstep: 4633.73 | bwd_allreduce_microstep: 5.46 | step_microstep: 48.34
-[2025-01-25 19:34:21,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.30 | bwd: 4639.33 | bwd_inner: 4633.73 | bwd_allreduce: 5.52 | step: 48.34
- 67%|██████▋   | 3887/5800 [10:47:51<3:40:48,  6.93s/it]                                                        {'loss': 0.0347, 'grad_norm': 8.527975082397461, 'learning_rate': 1.036649242837326e-05, 'epoch': 33.51}
- 67%|██████▋   | 3887/5800 [10:47:51<3:40:48,  6.93s/it]score1 tensor([[0.4512],
-        [0.6055],
-        [0.3516],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6055, 0.3457, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:28,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.36
-[2025-01-25 19:34:28,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.48 | bwd_microstep: 4541.54 | bwd_inner_microstep: 4536.47 | bwd_allreduce_microstep: 4.99 | step_microstep: 41.73
-[2025-01-25 19:34:28,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.46 | bwd: 4541.56 | bwd_inner: 4536.47 | bwd_allreduce: 5.03 | step: 41.74
- 67%|██████▋   | 3888/5800 [10:47:58<3:39:45,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.9228416085243225, 'learning_rate': 1.0356706750457429e-05, 'epoch': 33.52}
- 67%|██████▋   | 3888/5800 [10:47:58<3:39:45,  6.90s/it]score1 tensor([[0.3730],
-        [0.4648],
-        [0.5352],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3691, 0.4590, 0.5352, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:35,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 19:34:35,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.90 | bwd_microstep: 4590.81 | bwd_inner_microstep: 4585.28 | bwd_allreduce_microstep: 5.45 | step_microstep: 47.44
-[2025-01-25 19:34:35,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.86 | bwd: 4590.84 | bwd_inner: 4585.28 | bwd_allreduce: 5.50 | step: 47.46
- 67%|██████▋   | 3889/5800 [10:48:05<3:39:29,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.4776856899261475, 'learning_rate': 1.0346924079487765e-05, 'epoch': 33.53}
- 67%|██████▋   | 3889/5800 [10:48:05<3:39:29,  6.89s/it]score1 tensor([[0.5547],
-        [0.5312],
-        [0.3711],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5391, 0.3750, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:42,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 19:34:42,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.62 | bwd_microstep: 4595.56 | bwd_inner_microstep: 4590.19 | bwd_allreduce_microstep: 5.28 | step_microstep: 46.98
-[2025-01-25 19:34:42,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.58 | bwd: 4595.58 | bwd_inner: 4590.19 | bwd_allreduce: 5.33 | step: 46.99
- 67%|██████▋   | 3890/5800 [10:48:12<3:39:20,  6.89s/it]                                                        {'loss': 0.0039, 'grad_norm': 1.7039833068847656, 'learning_rate': 1.0337144418514664e-05, 'epoch': 33.53}
- 67%|██████▋   | 3890/5800 [10:48:12<3:39:20,  6.89s/it]score1 tensor([[0.4570],
-        [0.5625],
-        [0.5078],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5703, 0.4961, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:49,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 19:34:49,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.67 | bwd_microstep: 4632.86 | bwd_inner_microstep: 4627.87 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.90
-[2025-01-25 19:34:49,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.64 | bwd: 4632.88 | bwd_inner: 4627.87 | bwd_allreduce: 4.96 | step: 44.91
- 67%|██████▋   | 3891/5800 [10:48:19<3:39:24,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.6761417388916016, 'learning_rate': 1.0327367770587604e-05, 'epoch': 33.54}
- 67%|██████▋   | 3891/5800 [10:48:19<3:39:24,  6.90s/it]score1 tensor([[0.6172],
-        [0.6367],
-        [0.4785],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6719, 0.4805, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:34:56,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 19:34:56,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.53 | bwd_microstep: 4586.31 | bwd_inner_microstep: 4580.94 | bwd_allreduce_microstep: 5.26 | step_microstep: 44.62
-[2025-01-25 19:34:56,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.49 | bwd: 4586.36 | bwd_inner: 4580.94 | bwd_allreduce: 5.32 | step: 44.62
- 67%|██████▋   | 3892/5800 [10:48:26<3:38:59,  6.89s/it]                                                        {'loss': 0.0117, 'grad_norm': 2.3438057899475098, 'learning_rate': 1.0317594138755101e-05, 'epoch': 33.55}
- 67%|██████▋   | 3892/5800 [10:48:26<3:38:59,  6.89s/it]score1 tensor([[0.3984],
-        [0.4453],
-        [0.4707],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4258, 0.4609, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:03,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 19:35:03,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.00 | bwd_microstep: 4637.93 | bwd_inner_microstep: 4632.92 | bwd_allreduce_microstep: 4.91 | step_microstep: 46.98
-[2025-01-25 19:35:03,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4637.96 | bwd_inner: 4632.92 | bwd_allreduce: 4.95 | step: 46.99
- 67%|██████▋   | 3893/5800 [10:48:33<3:39:14,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 0.34766316413879395, 'learning_rate': 1.0307823526064755e-05, 'epoch': 33.56}
- 67%|██████▋   | 3893/5800 [10:48:33<3:39:14,  6.90s/it]score1 tensor([[0.5859],
-        [0.5156],
-        [0.4688],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5312, 0.4531, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:09,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:35:09,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.07 | bwd_microstep: 4634.19 | bwd_inner_microstep: 4628.38 | bwd_allreduce_microstep: 5.69 | step_microstep: 48.40
-[2025-01-25 19:35:09,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.02 | bwd: 4634.21 | bwd_inner: 4628.38 | bwd_allreduce: 5.76 | step: 48.40
- 67%|██████▋   | 3894/5800 [10:48:39<3:39:27,  6.91s/it]                                                        {'loss': 0.0112, 'grad_norm': 3.926734209060669, 'learning_rate': 1.0298055935563212e-05, 'epoch': 33.57}
- 67%|██████▋   | 3894/5800 [10:48:39<3:39:27,  6.91s/it]score1 tensor([[0.5508],
-        [0.3770],
-        [0.6367],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.3672, 0.6289, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:16,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 19:35:16,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.27 | bwd_microstep: 4639.13 | bwd_inner_microstep: 4633.73 | bwd_allreduce_microstep: 5.28 | step_microstep: 46.22
-[2025-01-25 19:35:16,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.24 | bwd: 4639.16 | bwd_inner: 4633.73 | bwd_allreduce: 5.34 | step: 46.23
- 67%|██████▋   | 3895/5800 [10:48:46<3:39:36,  6.92s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.5303981900215149, 'learning_rate': 1.0288291370296162e-05, 'epoch': 33.58}
- 67%|██████▋   | 3895/5800 [10:48:46<3:39:36,  6.92s/it]score1 tensor([[0.6094],
-        [0.5625],
-        [0.5586],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5508, 0.5586, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:23,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 19:35:23,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.97 | bwd_microstep: 4548.59 | bwd_inner_microstep: 4543.54 | bwd_allreduce_microstep: 4.98 | step_microstep: 46.08
-[2025-01-25 19:35:23,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.93 | bwd: 4548.61 | bwd_inner: 4543.54 | bwd_allreduce: 5.01 | step: 46.09
- 67%|██████▋   | 3896/5800 [10:48:53<3:38:42,  6.89s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.024184226989746, 'learning_rate': 1.0278529833308382e-05, 'epoch': 33.59}
- 67%|██████▋   | 3896/5800 [10:48:53<3:38:42,  6.89s/it]score1 tensor([[0.6445],
-        [0.4199],
-        [0.5664],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4219, 0.5664, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:30,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 19:35:30,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.04 | bwd_microstep: 4545.38 | bwd_inner_microstep: 4540.07 | bwd_allreduce_microstep: 5.18 | step_microstep: 47.77
-[2025-01-25 19:35:30,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.00 | bwd: 4545.40 | bwd_inner: 4540.07 | bwd_allreduce: 5.25 | step: 47.78
- 67%|██████▋   | 3897/5800 [10:49:00<3:38:00,  6.87s/it]                                                        {'loss': 0.0024, 'grad_norm': 3.6929657459259033, 'learning_rate': 1.0268771327643674e-05, 'epoch': 33.59}
- 67%|██████▋   | 3897/5800 [10:49:00<3:38:00,  6.87s/it]score1 tensor([[0.4180],
-        [0.4336],
-        [0.4121],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4453, 0.4121, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:37,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 19:35:37,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.28 | bwd_microstep: 4544.72 | bwd_inner_microstep: 4539.50 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.59
-[2025-01-25 19:35:37,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.23 | bwd: 4544.75 | bwd_inner: 4539.50 | bwd_allreduce: 5.16 | step: 43.60
- 67%|██████▋   | 3898/5800 [10:49:07<3:37:31,  6.86s/it]                                                        {'loss': 0.0034, 'grad_norm': 3.7575197219848633, 'learning_rate': 1.0259015856344927e-05, 'epoch': 33.6}
- 67%|██████▋   | 3898/5800 [10:49:07<3:37:31,  6.86s/it]score1 tensor([[0.3926],
-        [0.6289],
-        [0.4395],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.6211, 0.4492, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:44,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.36
-[2025-01-25 19:35:44,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.61 | bwd_microstep: 4589.28 | bwd_inner_microstep: 4583.99 | bwd_allreduce_microstep: 5.18 | step_microstep: 44.49
-[2025-01-25 19:35:44,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.57 | bwd: 4589.30 | bwd_inner: 4583.99 | bwd_allreduce: 5.24 | step: 44.50
- 67%|██████▋   | 3899/5800 [10:49:14<3:37:34,  6.87s/it]                                                        {'loss': 0.0073, 'grad_norm': 1.4502842426300049, 'learning_rate': 1.024926342245405e-05, 'epoch': 33.61}
- 67%|██████▋   | 3899/5800 [10:49:14<3:37:34,  6.87s/it]score1 tensor([[0.4766],
-        [0.4629],
-        [0.5430],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4727, 0.5469, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:51,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 19:35:51,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.14 | bwd_microstep: 4589.32 | bwd_inner_microstep: 4584.41 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.23
-[2025-01-25 19:35:51,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.11 | bwd: 4589.34 | bwd_inner: 4584.41 | bwd_allreduce: 4.86 | step: 44.25
- 67%|██████▋   | 3900/5800 [10:49:21<3:37:32,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 5.98363733291626, 'learning_rate': 1.0239514029012035e-05, 'epoch': 33.62}
- 67%|██████▋   | 3900/5800 [10:49:21<3:37:32,  6.87s/it]score1 tensor([[0.4824],
-        [0.5312],
-        [0.6172],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5117, 0.6172, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:35:58,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 19:35:58,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.89 | bwd_microstep: 4590.17 | bwd_inner_microstep: 4584.59 | bwd_allreduce_microstep: 5.52 | step_microstep: 49.42
-[2025-01-25 19:35:58,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.85 | bwd: 4590.20 | bwd_inner: 4584.59 | bwd_allreduce: 5.55 | step: 49.44
- 67%|██████▋   | 3901/5800 [10:49:28<3:37:38,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 6.018710136413574, 'learning_rate': 1.0229767679058904e-05, 'epoch': 33.63}
- 67%|██████▋   | 3901/5800 [10:49:28<3:37:38,  6.88s/it]score1 tensor([[0.4004],
-        [0.4492],
-        [0.5625],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.4434, 0.5352, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:04,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 19:36:04,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.69 | bwd_microstep: 4644.52 | bwd_inner_microstep: 4638.23 | bwd_allreduce_microstep: 6.17 | step_microstep: 50.02
-[2025-01-25 19:36:04,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.66 | bwd: 4644.54 | bwd_inner: 4638.23 | bwd_allreduce: 6.23 | step: 50.03
- 67%|██████▋   | 3902/5800 [10:49:34<3:38:08,  6.90s/it]                                                        {'loss': 0.0127, 'grad_norm': 7.732409477233887, 'learning_rate': 1.0220024375633733e-05, 'epoch': 33.64}
- 67%|██████▋   | 3902/5800 [10:49:34<3:38:08,  6.90s/it]score1 tensor([[0.4922],
-        [0.5391],
-        [0.5039],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5156, 0.4980, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:11,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 19:36:11,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.39 | bwd_microstep: 4594.64 | bwd_inner_microstep: 4589.93 | bwd_allreduce_microstep: 4.62 | step_microstep: 44.13
-[2025-01-25 19:36:11,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.33 | bwd: 4594.67 | bwd_inner: 4589.93 | bwd_allreduce: 4.66 | step: 44.14
- 67%|██████▋   | 3903/5800 [10:49:41<3:37:51,  6.89s/it]                                                        {'loss': 0.0107, 'grad_norm': 6.062441349029541, 'learning_rate': 1.0210284121774663e-05, 'epoch': 33.65}
- 67%|██████▋   | 3903/5800 [10:49:41<3:37:51,  6.89s/it]score1 tensor([[0.3828],
-        [0.3047],
-        [0.4512],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.3086, 0.4551, 0.3926], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:18,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 19:36:18,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.95 | bwd_microstep: 4642.83 | bwd_inner_microstep: 4637.74 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.45
-[2025-01-25 19:36:18,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.92 | bwd: 4642.86 | bwd_inner: 4637.74 | bwd_allreduce: 5.04 | step: 45.45
- 67%|██████▋   | 3904/5800 [10:49:48<3:38:02,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.3677290380001068, 'learning_rate': 1.0200546920518865e-05, 'epoch': 33.66}
- 67%|██████▋   | 3904/5800 [10:49:48<3:38:02,  6.90s/it]score1 tensor([[0.4824],
-        [0.7227],
-        [0.4258],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.7070, 0.4277, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:25,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:36:25,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.77 | bwd_microstep: 4641.27 | bwd_inner_microstep: 4635.59 | bwd_allreduce_microstep: 5.58 | step_microstep: 45.45
-[2025-01-25 19:36:25,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.73 | bwd: 4641.30 | bwd_inner: 4635.59 | bwd_allreduce: 5.63 | step: 45.45
- 67%|██████▋   | 3905/5800 [10:49:55<3:38:07,  6.91s/it]                                                        {'loss': 0.0117, 'grad_norm': 4.704395294189453, 'learning_rate': 1.019081277490258e-05, 'epoch': 33.66}
- 67%|██████▋   | 3905/5800 [10:49:55<3:38:07,  6.91s/it]score1 tensor([[0.5273],
-        [0.5469],
-        [0.5312],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5312, 0.5156, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:32,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 19:36:32,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.87 | bwd_microstep: 4635.85 | bwd_inner_microstep: 4630.70 | bwd_allreduce_microstep: 5.07 | step_microstep: 48.32
-[2025-01-25 19:36:32,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.84 | bwd: 4635.87 | bwd_inner: 4630.70 | bwd_allreduce: 5.11 | step: 48.33
- 67%|██████▋   | 3906/5800 [10:50:02<3:38:11,  6.91s/it]                                                        {'loss': 0.0122, 'grad_norm': 8.073549270629883, 'learning_rate': 1.0181081687961067e-05, 'epoch': 33.67}
- 67%|██████▋   | 3906/5800 [10:50:02<3:38:11,  6.91s/it]score1 tensor([[0.5547],
-        [0.5625],
-        [0.3594],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5625, 0.3613, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:39,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 19:36:39,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.53 | bwd_microstep: 4589.05 | bwd_inner_microstep: 4583.98 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.79
-[2025-01-25 19:36:39,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.49 | bwd: 4589.08 | bwd_inner: 4583.98 | bwd_allreduce: 5.02 | step: 43.82
- 67%|██████▋   | 3907/5800 [10:50:09<3:37:46,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.4784271717071533, 'learning_rate': 1.0171353662728666e-05, 'epoch': 33.68}
- 67%|██████▋   | 3907/5800 [10:50:09<3:37:46,  6.90s/it]score1 tensor([[0.4492],
-        [0.5898],
-        [0.5430],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5625, 0.5391, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:46,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 19:36:46,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.38 | bwd_microstep: 4633.99 | bwd_inner_microstep: 4628.52 | bwd_allreduce_microstep: 5.37 | step_microstep: 48.82
-[2025-01-25 19:36:46,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.32 | bwd: 4634.02 | bwd_inner: 4628.52 | bwd_allreduce: 5.43 | step: 48.83
- 67%|██████▋   | 3908/5800 [10:50:16<3:37:48,  6.91s/it]                                                        {'loss': 0.0112, 'grad_norm': 8.182345390319824, 'learning_rate': 1.0161628702238736e-05, 'epoch': 33.69}
- 67%|██████▋   | 3908/5800 [10:50:16<3:37:48,  6.91s/it]score1 tensor([[0.4375],
-        [0.4062],
-        [0.6875],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4121, 0.6875, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:36:53,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 19:36:53,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.49 | bwd_microstep: 4588.20 | bwd_inner_microstep: 4582.88 | bwd_allreduce_microstep: 5.21 | step_microstep: 45.02
-[2025-01-25 19:36:53,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.44 | bwd: 4588.22 | bwd_inner: 4582.88 | bwd_allreduce: 5.26 | step: 45.02
- 67%|██████▋   | 3909/5800 [10:50:23<3:37:26,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 1.5259889364242554, 'learning_rate': 1.0151906809523677e-05, 'epoch': 33.7}
- 67%|██████▋   | 3909/5800 [10:50:23<3:37:26,  6.90s/it]score1 tensor([[0.6016],
-        [0.3379],
-        [0.4961],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3438, 0.4980, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:37:00,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 19:37:00,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.48 | bwd_microstep: 4634.50 | bwd_inner_microstep: 4629.60 | bwd_allreduce_microstep: 4.82 | step_microstep: 42.40
-[2025-01-25 19:37:00,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.43 | bwd: 4634.52 | bwd_inner: 4629.60 | bwd_allreduce: 4.85 | step: 42.40
- 67%|██████▋   | 3910/5800 [10:50:30<3:37:36,  6.91s/it]                                                        {'loss': 0.0034, 'grad_norm': 7.693965435028076, 'learning_rate': 1.0142187987614962e-05, 'epoch': 33.71}
- 67%|██████▋   | 3910/5800 [10:50:30<3:37:36,  6.91s/it]evaluate!
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6484]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1602, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4004]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1953, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6328]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1152, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1387, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4219]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1797, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0645, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1426, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4199]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6618657806649916
-PLCC_score: 0.665993456150777
-KRCC_score: 0.4855876606581913
-SRCC_level: 0.6618657806649916
-PLCC_level: 0.665993456150777
-KRCC_level: 0.4855876606581913
-score1 tensor([[0.6016],
-        [0.5078],
-        [0.5625],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5195, 0.5742, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:47:18,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 19:47:18,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.11 | bwd_microstep: 4592.07 | bwd_inner_microstep: 4587.08 | bwd_allreduce_microstep: 4.90 | step_microstep: 50.36
-[2025-01-25 19:47:18,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.06 | bwd: 4592.09 | bwd_inner: 4587.08 | bwd_allreduce: 4.95 | step: 50.37
- 67%|██████▋   | 3911/5800 [11:00:48<99:54:56, 190.42s/it]                                                          {'loss': 0.0088, 'grad_norm': 4.780642509460449, 'learning_rate': 1.0132472239543074e-05, 'epoch': 33.72}
- 67%|██████▋   | 3911/5800 [11:00:48<99:54:56, 190.42s/it]score1 tensor([[0.5000],
-        [0.4043],
-        [0.4043],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4141, 0.4180, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:47:25,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 19:47:25,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.39 | bwd_microstep: 4580.20 | bwd_inner_microstep: 4575.61 | bwd_allreduce_microstep: 4.51 | step_microstep: 43.05
-[2025-01-25 19:47:25,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.36 | bwd: 4580.22 | bwd_inner: 4575.61 | bwd_allreduce: 4.55 | step: 43.05
- 67%|██████▋   | 3912/5800 [11:00:55<70:58:46, 135.34s/it]                                                          {'loss': 0.0088, 'grad_norm': 3.3239152431488037, 'learning_rate': 1.0122759568337566e-05, 'epoch': 33.72}
- 67%|██████▋   | 3912/5800 [11:00:55<70:58:46, 135.34s/it]score1 tensor([[0.5312],
-        [0.4863],
-        [0.4277],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4941, 0.4336, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:47:32,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 19:47:32,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.22 | bwd_microstep: 4581.15 | bwd_inner_microstep: 4576.42 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.12
-[2025-01-25 19:47:32,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.18 | bwd: 4581.18 | bwd_inner: 4576.42 | bwd_allreduce: 4.69 | step: 43.12
- 67%|██████▋   | 3913/5800 [11:01:02<50:43:57, 96.79s/it]                                                          {'loss': 0.0093, 'grad_norm': 7.9881911277771, 'learning_rate': 1.011304997702701e-05, 'epoch': 33.73}
- 67%|██████▋   | 3913/5800 [11:01:02<50:43:57, 96.79s/it]score1 tensor([[0.4609],
-        [0.5859],
-        [0.4688],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.6094, 0.4844, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:47:39,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.37
-[2025-01-25 19:47:39,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.47 | bwd_microstep: 4596.78 | bwd_inner_microstep: 4591.98 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.24
-[2025-01-25 19:47:39,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.44 | bwd: 4596.81 | bwd_inner: 4591.98 | bwd_allreduce: 4.75 | step: 42.25
- 67%|██████▋   | 3914/5800 [11:01:09<36:34:14, 69.81s/it]                                                         {'loss': 0.0132, 'grad_norm': 4.269044399261475, 'learning_rate': 1.0103343468639016e-05, 'epoch': 33.74}
- 67%|██████▋   | 3914/5800 [11:01:09<36:34:14, 69.81s/it]score1 tensor([[0.5078],
-        [0.5430],
-        [0.4648],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5508, 0.4648, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:47:46,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:47:46,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.43 | bwd_microstep: 4540.76 | bwd_inner_microstep: 4535.71 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.12
-[2025-01-25 19:47:46,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.39 | bwd: 4540.78 | bwd_inner: 4535.71 | bwd_allreduce: 5.01 | step: 43.13
- 68%|██████▊   | 3915/5800 [11:01:16<26:39:13, 50.90s/it]                                                         {'loss': 0.0073, 'grad_norm': 6.135354518890381, 'learning_rate': 1.0093640046200257e-05, 'epoch': 33.75}
- 68%|██████▊   | 3915/5800 [11:01:16<26:39:13, 50.90s/it]score1 tensor([[0.4746],
-        [0.4824],
-        [0.4922],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4629, 0.4883, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:47:53,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:47:53,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.46 | bwd_microstep: 4604.88 | bwd_inner_microstep: 4600.15 | bwd_allreduce_microstep: 4.63 | step_microstep: 42.84
-[2025-01-25 19:47:53,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.43 | bwd: 4604.91 | bwd_inner: 4600.15 | bwd_allreduce: 4.68 | step: 42.85
- 68%|██████▊   | 3916/5800 [11:01:23<19:43:36, 37.69s/it]                                                         {'loss': 0.0083, 'grad_norm': 3.872889518737793, 'learning_rate': 1.0083939712736414e-05, 'epoch': 33.76}
- 68%|██████▊   | 3916/5800 [11:01:23<19:43:36, 37.69s/it]score1 tensor([[0.5195],
-        [0.4219],
-        [0.5586],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4141, 0.5664, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:47:59,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:47:59,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.11 | bwd_microstep: 4610.07 | bwd_inner_microstep: 4605.39 | bwd_allreduce_microstep: 4.58 | step_microstep: 44.48
-[2025-01-25 19:47:59,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.07 | bwd: 4610.09 | bwd_inner: 4605.39 | bwd_allreduce: 4.63 | step: 44.49
- 68%|██████▊   | 3917/5800 [11:01:29<14:52:50, 28.45s/it]                                                         {'loss': 0.0063, 'grad_norm': 0.535308301448822, 'learning_rate': 1.0074242471272236e-05, 'epoch': 33.77}
- 68%|██████▊   | 3917/5800 [11:01:29<14:52:50, 28.45s/it]score1 tensor([[0.5547],
-        [0.4043],
-        [0.4844],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.3945, 0.4766, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:06,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 19:48:06,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.69 | bwd_microstep: 4558.63 | bwd_inner_microstep: 4553.98 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.54
-[2025-01-25 19:48:06,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.65 | bwd: 4558.65 | bwd_inner: 4553.98 | bwd_allreduce: 4.61 | step: 42.55
- 68%|██████▊   | 3918/5800 [11:01:36<11:28:55, 21.96s/it]                                                         {'loss': 0.0073, 'grad_norm': 5.4960432052612305, 'learning_rate': 1.0064548324831475e-05, 'epoch': 33.78}
- 68%|██████▊   | 3918/5800 [11:01:36<11:28:55, 21.96s/it]score1 tensor([[0.4727],
-        [0.4492],
-        [0.5234],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4258, 0.5039, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:13,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 19:48:13,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.45 | bwd_microstep: 4610.83 | bwd_inner_microstep: 4606.39 | bwd_allreduce_microstep: 4.34 | step_microstep: 41.73
-[2025-01-25 19:48:13,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.42 | bwd: 4610.85 | bwd_inner: 4606.39 | bwd_allreduce: 4.38 | step: 41.74
- 68%|██████▊   | 3919/5800 [11:01:43<9:06:41, 17.44s/it]                                                         {'loss': 0.0166, 'grad_norm': 7.990834712982178, 'learning_rate': 1.005485727643695e-05, 'epoch': 33.78}
- 68%|██████▊   | 3919/5800 [11:01:43<9:06:41, 17.44s/it]score1 tensor([[0.4629],
-        [0.5391],
-        [0.6484],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.5234, 0.6484, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:20,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 19:48:20,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.91 | bwd_microstep: 4568.27 | bwd_inner_microstep: 4563.84 | bwd_allreduce_microstep: 4.34 | step_microstep: 45.38
-[2025-01-25 19:48:20,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.88 | bwd: 4568.29 | bwd_inner: 4563.84 | bwd_allreduce: 4.38 | step: 45.38
- 68%|██████▊   | 3920/5800 [11:01:50<7:26:42, 14.26s/it]                                                        {'loss': 0.0132, 'grad_norm': 6.0225911140441895, 'learning_rate': 1.004516932911048e-05, 'epoch': 33.79}
- 68%|██████▊   | 3920/5800 [11:01:50<7:26:42, 14.26s/it]score1 tensor([[0.5234],
-        [0.6055],
-        [0.4961],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.6211, 0.5039, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0361, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:27,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.48 | optimizer_step: 4.37
-[2025-01-25 19:48:27,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.98 | bwd_microstep: 4614.84 | bwd_inner_microstep: 4610.33 | bwd_allreduce_microstep: 4.46 | step_microstep: 38.28
-[2025-01-25 19:48:27,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.94 | bwd: 4614.86 | bwd_inner: 4610.33 | bwd_allreduce: 4.49 | step: 38.28
- 68%|██████▊   | 3921/5800 [11:01:57<6:17:05, 12.04s/it]                                                        {'loss': 0.0361, 'grad_norm': 0.7596989274024963, 'learning_rate': 1.0035484485872944e-05, 'epoch': 33.8}
- 68%|██████▊   | 3921/5800 [11:01:57<6:17:05, 12.04s/it]score1 tensor([[0.3691],
-        [0.4961],
-        [0.5117],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.4844, 0.5156, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:34,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.32 | optimizer_step: 4.37
-[2025-01-25 19:48:34,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.42 | bwd_microstep: 4612.55 | bwd_inner_microstep: 4608.15 | bwd_allreduce_microstep: 4.32 | step_microstep: 40.13
-[2025-01-25 19:48:34,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.39 | bwd: 4612.57 | bwd_inner: 4608.15 | bwd_allreduce: 4.36 | step: 40.12
- 68%|██████▊   | 3922/5800 [11:02:04<5:28:15, 10.49s/it]                                                        {'loss': 0.0103, 'grad_norm': 3.8317031860351562, 'learning_rate': 1.0025802749744256e-05, 'epoch': 33.81}
- 68%|██████▊   | 3922/5800 [11:02:04<5:28:15, 10.49s/it]score1 tensor([[0.5547],
-        [0.6719],
-        [0.4883],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6562, 0.4883, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:40,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 19:48:40,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.89 | bwd_microstep: 4529.03 | bwd_inner_microstep: 4524.30 | bwd_allreduce_microstep: 4.65 | step_microstep: 45.56
-[2025-01-25 19:48:40,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.87 | bwd: 4529.05 | bwd_inner: 4524.30 | bwd_allreduce: 4.69 | step: 45.56
- 68%|██████▊   | 3923/5800 [11:02:10<4:53:28,  9.38s/it]                                                        {'loss': 0.0044, 'grad_norm': 0.9408938884735107, 'learning_rate': 1.0016124123743327e-05, 'epoch': 33.82}
- 68%|██████▊   | 3923/5800 [11:02:10<4:53:28,  9.38s/it]score1 tensor([[0.5742],
-        [0.5312],
-        [0.6016],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5195, 0.6055, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:47,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 19:48:47,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.84 | bwd_microstep: 4612.61 | bwd_inner_microstep: 4607.81 | bwd_allreduce_microstep: 4.69 | step_microstep: 39.59
-[2025-01-25 19:48:47,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.80 | bwd: 4612.64 | bwd_inner: 4607.81 | bwd_allreduce: 4.75 | step: 39.59
- 68%|██████▊   | 3924/5800 [11:02:17<4:29:47,  8.63s/it]                                                        {'loss': 0.0117, 'grad_norm': 3.9827983379364014, 'learning_rate': 1.000644861088814e-05, 'epoch': 33.83}
- 68%|██████▊   | 3924/5800 [11:02:17<4:29:47,  8.63s/it]score1 tensor([[0.3672],
-        [0.5312],
-        [0.6367],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5273, 0.6211, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:48:54,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:48:54,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.68 | bwd_microstep: 4575.20 | bwd_inner_microstep: 4570.01 | bwd_allreduce_microstep: 5.09 | step_microstep: 46.92
-[2025-01-25 19:48:54,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.65 | bwd: 4575.23 | bwd_inner: 4570.01 | bwd_allreduce: 5.14 | step: 46.92
- 68%|██████▊   | 3925/5800 [11:02:24<4:12:55,  8.09s/it]                                                        {'loss': 0.0068, 'grad_norm': 2.799497604370117, 'learning_rate': 9.996776214195677e-06, 'epoch': 33.84}
- 68%|██████▊   | 3925/5800 [11:02:24<4:12:55,  8.09s/it]score1 tensor([[0.5508],
-        [0.5898],
-        [0.4727],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6016, 0.4805, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:01,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:49:01,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.99 | bwd_microstep: 4624.12 | bwd_inner_microstep: 4619.10 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.07
-[2025-01-25 19:49:01,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.95 | bwd: 4624.15 | bwd_inner: 4619.10 | bwd_allreduce: 4.98 | step: 42.08
- 68%|██████▊   | 3926/5800 [11:02:31<4:01:35,  7.74s/it]                                                        {'loss': 0.0098, 'grad_norm': 4.038133144378662, 'learning_rate': 9.987106936681958e-06, 'epoch': 33.84}
- 68%|██████▊   | 3926/5800 [11:02:31<4:01:35,  7.74s/it]score1 tensor([[0.4766],
-        [0.5352],
-        [0.5156],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5430, 0.5195, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:08,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 19:49:08,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.62 | bwd_microstep: 4615.31 | bwd_inner_microstep: 4610.66 | bwd_allreduce_microstep: 4.57 | step_microstep: 39.59
-[2025-01-25 19:49:08,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.58 | bwd: 4615.33 | bwd_inner: 4610.66 | bwd_allreduce: 4.61 | step: 39.61
- 68%|██████▊   | 3927/5800 [11:02:38<3:53:27,  7.48s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.946134090423584, 'learning_rate': 9.97744078136204e-06, 'epoch': 33.85}
- 68%|██████▊   | 3927/5800 [11:02:38<3:53:27,  7.48s/it]score1 tensor([[0.4238],
-        [0.4414],
-        [0.6172],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4395, 0.6211, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:15,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 19:49:15,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.62 | bwd_microstep: 4575.81 | bwd_inner_microstep: 4570.94 | bwd_allreduce_microstep: 4.79 | step_microstep: 41.43
-[2025-01-25 19:49:15,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4575.83 | bwd_inner: 4570.94 | bwd_allreduce: 4.82 | step: 41.44
- 68%|██████▊   | 3928/5800 [11:02:45<3:47:27,  7.29s/it]                                                        {'loss': 0.0093, 'grad_norm': 2.367159843444824, 'learning_rate': 9.967777751249987e-06, 'epoch': 33.86}
- 68%|██████▊   | 3928/5800 [11:02:45<3:47:27,  7.29s/it]score1 tensor([[0.5781],
-        [0.3555],
-        [0.6484],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.3516, 0.6445, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:22,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 19:49:22,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.00 | bwd_microstep: 4568.66 | bwd_inner_microstep: 4564.10 | bwd_allreduce_microstep: 4.47 | step_microstep: 42.01
-[2025-01-25 19:49:22,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.97 | bwd: 4568.68 | bwd_inner: 4564.10 | bwd_allreduce: 4.52 | step: 42.03
- 68%|██████▊   | 3929/5800 [11:02:52<3:43:11,  7.16s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.0046801567077637, 'learning_rate': 9.958117849358912e-06, 'epoch': 33.87}
- 68%|██████▊   | 3929/5800 [11:02:52<3:43:11,  7.16s/it]score1 tensor([[0.6406],
-        [0.6641],
-        [0.4492],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6797, 0.4668, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:29,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 19:49:29,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.68 | bwd_microstep: 4625.92 | bwd_inner_microstep: 4621.20 | bwd_allreduce_microstep: 4.64 | step_microstep: 45.99
-[2025-01-25 19:49:29,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.65 | bwd: 4625.94 | bwd_inner: 4621.20 | bwd_allreduce: 4.68 | step: 45.99
- 68%|██████▊   | 3930/5800 [11:02:59<3:40:42,  7.08s/it]                                                        {'loss': 0.0142, 'grad_norm': 8.389087677001953, 'learning_rate': 9.948461078700926e-06, 'epoch': 33.88}
- 68%|██████▊   | 3930/5800 [11:02:59<3:40:42,  7.08s/it]score1 tensor([[0.4531],
-        [0.5234],
-        [0.5703],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5352, 0.5977, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:35,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 19:49:35,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.05 | bwd_microstep: 4622.94 | bwd_inner_microstep: 4618.14 | bwd_allreduce_microstep: 4.70 | step_microstep: 41.79
-[2025-01-25 19:49:35,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.01 | bwd: 4622.96 | bwd_inner: 4618.14 | bwd_allreduce: 4.75 | step: 41.80
- 68%|██████▊   | 3931/5800 [11:03:05<3:38:51,  7.03s/it]                                                        {'loss': 0.0176, 'grad_norm': 4.679344177246094, 'learning_rate': 9.938807442287193e-06, 'epoch': 33.89}
- 68%|██████▊   | 3931/5800 [11:03:05<3:38:51,  7.03s/it]score1 tensor([[0.6719],
-        [0.6367],
-        [0.4785],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.6445, 0.4824, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:42,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 19:49:42,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.20 | bwd_microstep: 4584.14 | bwd_inner_microstep: 4577.90 | bwd_allreduce_microstep: 6.08 | step_microstep: 51.38
-[2025-01-25 19:49:42,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.16 | bwd: 4584.17 | bwd_inner: 4577.90 | bwd_allreduce: 6.16 | step: 51.39
- 68%|██████▊   | 3932/5800 [11:03:12<3:37:18,  6.98s/it]                                                        {'loss': 0.0068, 'grad_norm': 6.70176362991333, 'learning_rate': 9.929156943127876e-06, 'epoch': 33.9}
- 68%|██████▊   | 3932/5800 [11:03:12<3:37:18,  6.98s/it]score1 tensor([[0.6172],
-        [0.4590],
-        [0.5508],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4688, 0.5508, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:49,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:49:49,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.08 | bwd_microstep: 4581.59 | bwd_inner_microstep: 4577.11 | bwd_allreduce_microstep: 4.40 | step_microstep: 41.97
-[2025-01-25 19:49:49,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.03 | bwd: 4581.61 | bwd_inner: 4577.11 | bwd_allreduce: 4.44 | step: 41.98
- 68%|██████▊   | 3933/5800 [11:03:19<3:36:07,  6.95s/it]                                                        {'loss': 0.0098, 'grad_norm': 2.454800844192505, 'learning_rate': 9.919509584232161e-06, 'epoch': 33.91}
- 68%|██████▊   | 3933/5800 [11:03:19<3:36:07,  6.95s/it]score1 tensor([[0.5742],
-        [0.6250],
-        [0.4707],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6484, 0.4727, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:49:56,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 19:49:56,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.81 | bwd_microstep: 4619.43 | bwd_inner_microstep: 4614.53 | bwd_allreduce_microstep: 4.80 | step_microstep: 41.84
-[2025-01-25 19:49:56,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.78 | bwd: 4619.45 | bwd_inner: 4614.53 | bwd_allreduce: 4.85 | step: 41.85
- 68%|██████▊   | 3934/5800 [11:03:26<3:35:29,  6.93s/it]                                                        {'loss': 0.0122, 'grad_norm': 0.5130058526992798, 'learning_rate': 9.909865368608275e-06, 'epoch': 33.91}
- 68%|██████▊   | 3934/5800 [11:03:26<3:35:29,  6.93s/it]score1 tensor([[0.4688],
-        [0.5469],
-        [0.4043],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5430, 0.3887, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:03,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 19:50:03,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.39 | bwd_microstep: 4626.88 | bwd_inner_microstep: 4621.93 | bwd_allreduce_microstep: 4.83 | step_microstep: 44.53
-[2025-01-25 19:50:03,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.35 | bwd: 4626.90 | bwd_inner: 4621.93 | bwd_allreduce: 4.90 | step: 44.54
- 68%|██████▊   | 3935/5800 [11:03:33<3:35:07,  6.92s/it]                                                        {'loss': 0.0103, 'grad_norm': 3.9420199394226074, 'learning_rate': 9.900224299263441e-06, 'epoch': 33.92}
- 68%|██████▊   | 3935/5800 [11:03:33<3:35:07,  6.92s/it]score1 tensor([[0.5469],
-        [0.6211],
-        [0.4512],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6328, 0.4609, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:10,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:50:10,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.51 | bwd_microstep: 4613.31 | bwd_inner_microstep: 4608.61 | bwd_allreduce_microstep: 4.62 | step_microstep: 41.67
-[2025-01-25 19:50:10,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.47 | bwd: 4613.33 | bwd_inner: 4608.61 | bwd_allreduce: 4.66 | step: 41.67
- 68%|██████▊   | 3936/5800 [11:03:40<3:34:38,  6.91s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.39997270703315735, 'learning_rate': 9.890586379203923e-06, 'epoch': 33.93}
- 68%|██████▊   | 3936/5800 [11:03:40<3:34:38,  6.91s/it]score1 tensor([[0.3125],
-        [0.6484],
-        [0.5547],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.6602, 0.5469, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:17,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 19:50:17,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.36 | bwd_microstep: 4618.15 | bwd_inner_microstep: 4612.65 | bwd_allreduce_microstep: 5.42 | step_microstep: 46.27
-[2025-01-25 19:50:17,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.32 | bwd: 4618.18 | bwd_inner: 4612.65 | bwd_allreduce: 5.46 | step: 46.28
- 68%|██████▊   | 3937/5800 [11:03:47<3:34:22,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.4237351417541504, 'learning_rate': 9.880951611434978e-06, 'epoch': 33.94}
- 68%|██████▊   | 3937/5800 [11:03:47<3:34:22,  6.90s/it]score1 tensor([[0.6211],
-        [0.4688],
-        [0.3809],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.4688, 0.3750, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:24,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 19:50:24,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.15 | bwd_microstep: 4538.81 | bwd_inner_microstep: 4533.85 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.68
-[2025-01-25 19:50:24,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.11 | bwd: 4538.83 | bwd_inner: 4533.85 | bwd_allreduce: 4.92 | step: 45.69
- 68%|██████▊   | 3938/5800 [11:03:54<3:33:32,  6.88s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.021468639373779, 'learning_rate': 9.87131999896091e-06, 'epoch': 33.95}
- 68%|██████▊   | 3938/5800 [11:03:54<3:33:32,  6.88s/it]score1 tensor([[0.3457],
-        [0.3672],
-        [0.6367],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.3789, 0.6445, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:31,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 19:50:31,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.41 | bwd_microstep: 4622.99 | bwd_inner_microstep: 4618.40 | bwd_allreduce_microstep: 4.50 | step_microstep: 43.30
-[2025-01-25 19:50:31,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.38 | bwd: 4623.01 | bwd_inner: 4618.39 | bwd_allreduce: 4.54 | step: 43.30
- 68%|██████▊   | 3939/5800 [11:04:00<3:33:40,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 7.55233097076416, 'learning_rate': 9.861691544785015e-06, 'epoch': 33.96}
- 68%|██████▊   | 3939/5800 [11:04:01<3:33:40,  6.89s/it]score1 tensor([[0.3984],
-        [0.4805],
-        [0.5469],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.4844, 0.5508, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:37,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 19:50:37,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.43 | bwd_microstep: 4627.18 | bwd_inner_microstep: 4622.20 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.09
-[2025-01-25 19:50:37,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.39 | bwd: 4627.21 | bwd_inner: 4622.20 | bwd_allreduce: 4.93 | step: 44.10
- 68%|██████▊   | 3940/5800 [11:04:07<3:33:39,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 7.945971488952637, 'learning_rate': 9.852066251909603e-06, 'epoch': 33.97}
- 68%|██████▊   | 3940/5800 [11:04:07<3:33:39,  6.89s/it]score1 tensor([[0.4512],
-        [0.5234],
-        [0.4512],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5273, 0.4414, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:44,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:50:44,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.82 | bwd_microstep: 4570.10 | bwd_inner_microstep: 4565.37 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.58
-[2025-01-25 19:50:44,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.79 | bwd: 4570.12 | bwd_inner: 4565.37 | bwd_allreduce: 4.68 | step: 43.59
- 68%|██████▊   | 3941/5800 [11:04:14<3:33:06,  6.88s/it]                                                        {'loss': 0.0044, 'grad_norm': 2.142951488494873, 'learning_rate': 9.842444123336021e-06, 'epoch': 33.97}
- 68%|██████▊   | 3941/5800 [11:04:14<3:33:06,  6.88s/it]score1 tensor([[0.4570],
-        [0.4199],
-        [0.6797],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4219, 0.6836, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:51,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:50:51,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.03 | bwd_microstep: 4622.10 | bwd_inner_microstep: 4617.00 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.83
-[2025-01-25 19:50:51,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.99 | bwd: 4622.12 | bwd_inner: 4617.01 | bwd_allreduce: 5.03 | step: 43.80
- 68%|██████▊   | 3942/5800 [11:04:21<3:33:15,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 8.320869445800781, 'learning_rate': 9.832825162064605e-06, 'epoch': 33.98}
- 68%|██████▊   | 3942/5800 [11:04:21<3:33:15,  6.89s/it]score1 tensor([[0.5000],
-        [0.6172],
-        [0.5742],
-        [0.1865]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.6094, 0.5664, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:50:58,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:50:58,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4625.56 | bwd_inner_microstep: 4620.48 | bwd_allreduce_microstep: 4.99 | step_microstep: 44.32
-[2025-01-25 19:50:58,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.68 | bwd: 4625.59 | bwd_inner: 4620.48 | bwd_allreduce: 5.03 | step: 44.32
- 68%|██████▊   | 3943/5800 [11:04:28<3:33:23,  6.89s/it]                                                        {'loss': 0.0107, 'grad_norm': 7.487306594848633, 'learning_rate': 9.823209371094727e-06, 'epoch': 33.99}
- 68%|██████▊   | 3943/5800 [11:04:28<3:33:23,  6.89s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:51:03,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 19:51:03,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 571.68 | bwd_microstep: 1219.91 | bwd_inner_microstep: 1214.97 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.04
-[2025-01-25 19:51:03,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 571.65 | bwd: 1219.93 | bwd_inner: 1214.97 | bwd_allreduce: 4.89 | step: 44.05
- 68%|██████▊   | 3944/5800 [11:04:33<3:14:45,  6.30s/it]                                                        {'loss': 0.0098, 'grad_norm': 7.920163631439209, 'learning_rate': 9.813596753424747e-06, 'epoch': 34.0}
- 68%|██████▊   | 3944/5800 [11:04:33<3:14:45,  6.30s/it][2025-01-25 19:51:07,916] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:51:18,043] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:51:28,108] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 19:51:37,702] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4883],
-        [0.4668],
-        [0.4727],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4727, 0.4570, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:51:53,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.43 | optimizer_step: 4.36
-[2025-01-25 19:51:53,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.27 | bwd_microstep: 4601.43 | bwd_inner_microstep: 4596.80 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.74
-[2025-01-25 19:51:53,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.23 | bwd: 4601.45 | bwd_inner: 4596.80 | bwd_allreduce: 4.59 | step: 41.75
- 68%|██████▊   | 3945/5800 [11:05:23<9:59:28, 19.39s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.38037052750587463, 'learning_rate': 9.803987312052043e-06, 'epoch': 34.01}
- 68%|██████▊   | 3945/5800 [11:05:23<9:59:28, 19.39s/it]score1 tensor([[0.4609],
-        [0.4453],
-        [0.4727],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4355, 0.4688, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:00,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 19:52:00,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.52 | bwd_microstep: 4587.68 | bwd_inner_microstep: 4582.68 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.76
-[2025-01-25 19:52:00,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.50 | bwd: 4587.71 | bwd_inner: 4582.68 | bwd_allreduce: 4.94 | step: 44.77
- 68%|██████▊   | 3946/5800 [11:05:30<8:02:50, 15.63s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.4678475558757782, 'learning_rate': 9.794381049973018e-06, 'epoch': 34.02}
- 68%|██████▊   | 3946/5800 [11:05:30<8:02:50, 15.63s/it]score1 tensor([[0.6094],
-        [0.4980],
-        [0.5625],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4922, 0.5625, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:07,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 19:52:07,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.79 | bwd_microstep: 4515.60 | bwd_inner_microstep: 4510.67 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.03
-[2025-01-25 19:52:07,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.74 | bwd: 4515.62 | bwd_inner: 4510.67 | bwd_allreduce: 4.87 | step: 44.03
- 68%|██████▊   | 3947/5800 [11:05:37<6:40:31, 12.97s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.4553282558917999, 'learning_rate': 9.78477797018306e-06, 'epoch': 34.03}
- 68%|██████▊   | 3947/5800 [11:05:37<6:40:31, 12.97s/it]score1 tensor([[0.4707],
-        [0.5586],
-        [0.6094],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5781, 0.6055, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:13,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 19:52:13,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.55 | bwd_microstep: 4595.48 | bwd_inner_microstep: 4590.08 | bwd_allreduce_microstep: 5.30 | step_microstep: 45.17
-[2025-01-25 19:52:13,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.51 | bwd: 4595.50 | bwd_inner: 4590.08 | bwd_allreduce: 5.35 | step: 45.18
- 68%|██████▊   | 3948/5800 [11:05:43<5:43:45, 11.14s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.5703232884407043, 'learning_rate': 9.775178075676586e-06, 'epoch': 34.03}
- 68%|██████▊   | 3948/5800 [11:05:43<5:43:45, 11.14s/it]score1 tensor([[0.4922],
-        [0.4844],
-        [0.3223],
-        [0.3770]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5000, 0.3223, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:20,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 19:52:20,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.36 | bwd_microstep: 4552.62 | bwd_inner_microstep: 4547.79 | bwd_allreduce_microstep: 4.73 | step_microstep: 48.43
-[2025-01-25 19:52:20,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.31 | bwd: 4552.64 | bwd_inner: 4547.79 | bwd_allreduce: 4.78 | step: 48.44
- 68%|██████▊   | 3949/5800 [11:05:50<5:03:38,  9.84s/it]                                                        {'loss': 0.0049, 'grad_norm': 5.692172527313232, 'learning_rate': 9.765581369446998e-06, 'epoch': 34.04}
- 68%|██████▊   | 3949/5800 [11:05:50<5:03:38,  9.84s/it]score1 tensor([[0.3730],
-        [0.5195],
-        [0.4980],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5312, 0.4980, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:27,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 19:52:27,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.96 | bwd_microstep: 4560.70 | bwd_inner_microstep: 4556.00 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.15
-[2025-01-25 19:52:27,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.92 | bwd: 4560.72 | bwd_inner: 4556.00 | bwd_allreduce: 4.65 | step: 43.16
- 68%|██████▊   | 3950/5800 [11:05:57<4:35:37,  8.94s/it]                                                        {'loss': 0.0103, 'grad_norm': 5.777928829193115, 'learning_rate': 9.755987854486726e-06, 'epoch': 34.05}
- 68%|██████▊   | 3950/5800 [11:05:57<4:35:37,  8.94s/it]score1 tensor([[0.3770],
-        [0.6094],
-        [0.3750],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.6055, 0.3711, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:34,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.37
-[2025-01-25 19:52:34,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.01 | bwd_microstep: 4614.33 | bwd_inner_microstep: 4607.10 | bwd_allreduce_microstep: 7.16 | step_microstep: 43.97
-[2025-01-25 19:52:34,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.98 | bwd: 4614.35 | bwd_inner: 4607.10 | bwd_allreduce: 7.20 | step: 43.98
- 68%|██████▊   | 3951/5800 [11:06:04<4:16:29,  8.32s/it]                                                        {'loss': 0.0156, 'grad_norm': 7.794710636138916, 'learning_rate': 9.74639753378719e-06, 'epoch': 34.06}
- 68%|██████▊   | 3951/5800 [11:06:04<4:16:29,  8.32s/it]score1 tensor([[0.3730],
-        [0.6289],
-        [0.4141],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.6719, 0.4121, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:41,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 19:52:41,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.55 | bwd_microstep: 4523.47 | bwd_inner_microstep: 4518.55 | bwd_allreduce_microstep: 4.83 | step_microstep: 42.02
-[2025-01-25 19:52:41,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.50 | bwd: 4523.50 | bwd_inner: 4518.55 | bwd_allreduce: 4.88 | step: 42.03
- 68%|██████▊   | 3952/5800 [11:06:11<4:02:12,  7.86s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.7026869654655457, 'learning_rate': 9.736810410338815e-06, 'epoch': 34.07}
- 68%|██████▊   | 3952/5800 [11:06:11<4:02:12,  7.86s/it]score1 tensor([[0.3633],
-        [0.4609],
-        [0.5430],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4727, 0.5508, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:48,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 19:52:48,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.87 | bwd_microstep: 4617.73 | bwd_inner_microstep: 4612.89 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.10
-[2025-01-25 19:52:48,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.81 | bwd: 4617.76 | bwd_inner: 4612.89 | bwd_allreduce: 4.78 | step: 43.11
- 68%|██████▊   | 3953/5800 [11:06:18<3:53:02,  7.57s/it]                                                        {'loss': 0.0112, 'grad_norm': 7.6375508308410645, 'learning_rate': 9.727226487131032e-06, 'epoch': 34.08}
- 68%|██████▊   | 3953/5800 [11:06:18<3:53:02,  7.57s/it]score1 tensor([[0.5703],
-        [0.5039],
-        [0.5312],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5039, 0.5391, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:52:54,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 19:52:54,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.62 | bwd_microstep: 4562.46 | bwd_inner_microstep: 4557.17 | bwd_allreduce_microstep: 5.17 | step_microstep: 44.94
-[2025-01-25 19:52:54,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.58 | bwd: 4562.48 | bwd_inner: 4557.17 | bwd_allreduce: 5.22 | step: 44.95
- 68%|██████▊   | 3954/5800 [11:06:24<3:46:11,  7.35s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.7249114513397217, 'learning_rate': 9.717645767152289e-06, 'epoch': 34.09}
- 68%|██████▊   | 3954/5800 [11:06:24<3:46:11,  7.35s/it]score1 tensor([[0.5508],
-        [0.5469],
-        [0.6367],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5430, 0.6250, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:01,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 8.38 | optimizer_step: 4.36
-[2025-01-25 19:53:01,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.50 | bwd_microstep: 4613.71 | bwd_inner_microstep: 4607.80 | bwd_allreduce_microstep: 5.82 | step_microstep: 49.36
-[2025-01-25 19:53:01,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.46 | bwd: 4613.74 | bwd_inner: 4607.80 | bwd_allreduce: 5.87 | step: 49.37
- 68%|██████▊   | 3955/5800 [11:06:31<3:41:51,  7.21s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.433935165405273, 'learning_rate': 9.70806825339e-06, 'epoch': 34.09}
- 68%|██████▊   | 3955/5800 [11:06:31<3:41:51,  7.21s/it]score1 tensor([[0.5586],
-        [0.4824],
-        [0.5000],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4941, 0.4863, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:08,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 19:53:08,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.32 | bwd_microstep: 4612.57 | bwd_inner_microstep: 4607.26 | bwd_allreduce_microstep: 5.22 | step_microstep: 47.91
-[2025-01-25 19:53:08,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.29 | bwd: 4612.60 | bwd_inner: 4607.26 | bwd_allreduce: 5.27 | step: 47.92
- 68%|██████▊   | 3956/5800 [11:06:38<3:38:44,  7.12s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.5015223026275635, 'learning_rate': 9.698493948830618e-06, 'epoch': 34.1}
- 68%|██████▊   | 3956/5800 [11:06:38<3:38:44,  7.12s/it]score1 tensor([[0.4766],
-        [0.4316],
-        [0.4160],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4180, 0.4180, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:15,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:53:15,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.70 | bwd_microstep: 4628.68 | bwd_inner_microstep: 4623.48 | bwd_allreduce_microstep: 5.10 | step_microstep: 43.40
-[2025-01-25 19:53:15,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.67 | bwd: 4628.70 | bwd_inner: 4623.48 | bwd_allreduce: 5.15 | step: 43.41
- 68%|██████▊   | 3957/5800 [11:06:45<3:36:39,  7.05s/it]                                                        {'loss': 0.0127, 'grad_norm': 0.6439946293830872, 'learning_rate': 9.688922856459563e-06, 'epoch': 34.11}
- 68%|██████▊   | 3957/5800 [11:06:45<3:36:39,  7.05s/it]score1 tensor([[0.3711],
-        [0.4199],
-        [0.4453],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.4180, 0.4375, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:22,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 19:53:22,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.38 | bwd_microstep: 4615.85 | bwd_inner_microstep: 4610.73 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.83
-[2025-01-25 19:53:22,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.34 | bwd: 4615.90 | bwd_inner: 4610.73 | bwd_allreduce: 5.05 | step: 44.83
- 68%|██████▊   | 3958/5800 [11:06:52<3:35:05,  7.01s/it]                                                        {'loss': 0.0049, 'grad_norm': 7.714130878448486, 'learning_rate': 9.679354979261282e-06, 'epoch': 34.12}
- 68%|██████▊   | 3958/5800 [11:06:52<3:35:05,  7.01s/it]score1 tensor([[0.6953],
-        [0.5430],
-        [0.5078],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5547, 0.5000, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:29,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 19:53:29,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.38 | bwd_microstep: 4624.72 | bwd_inner_microstep: 4619.32 | bwd_allreduce_microstep: 5.29 | step_microstep: 45.50
-[2025-01-25 19:53:29,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.33 | bwd: 4624.74 | bwd_inner: 4619.32 | bwd_allreduce: 5.35 | step: 45.50
- 68%|██████▊   | 3959/5800 [11:06:59<3:34:01,  6.98s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.48094367980957, 'learning_rate': 9.669790320219187e-06, 'epoch': 34.13}
- 68%|██████▊   | 3959/5800 [11:06:59<3:34:01,  6.98s/it]score1 tensor([[0.5742],
-        [0.5898],
-        [0.5234],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5898, 0.5117, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:36,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.92 | optimizer_step: 4.36
-[2025-01-25 19:53:36,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.28 | bwd_microstep: 4565.65 | bwd_inner_microstep: 4560.75 | bwd_allreduce_microstep: 4.81 | step_microstep: 47.81
-[2025-01-25 19:53:36,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.25 | bwd: 4565.68 | bwd_inner: 4560.75 | bwd_allreduce: 4.86 | step: 47.81
- 68%|██████▊   | 3960/5800 [11:07:06<3:32:43,  6.94s/it]                                                        {'loss': 0.0068, 'grad_norm': 2.176151752471924, 'learning_rate': 9.660228882315725e-06, 'epoch': 34.14}
- 68%|██████▊   | 3960/5800 [11:07:06<3:32:43,  6.94s/it]score1 tensor([[0.4863],
-        [0.3906],
-        [0.4414],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4023, 0.4414, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:43,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 19:53:43,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.69 | bwd_microstep: 4574.17 | bwd_inner_microstep: 4569.57 | bwd_allreduce_microstep: 4.50 | step_microstep: 42.72
-[2025-01-25 19:53:43,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.64 | bwd: 4574.20 | bwd_inner: 4569.57 | bwd_allreduce: 4.55 | step: 42.73
- 68%|██████▊   | 3961/5800 [11:07:13<3:31:48,  6.91s/it]                                                        {'loss': 0.0054, 'grad_norm': 1.6172091960906982, 'learning_rate': 9.6506706685323e-06, 'epoch': 34.15}
- 68%|██████▊   | 3961/5800 [11:07:13<3:31:48,  6.91s/it]score1 tensor([[0.4121],
-        [0.5742],
-        [0.5664],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5664, 0.5664, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:50,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 19:53:50,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.30 | bwd_microstep: 4573.92 | bwd_inner_microstep: 4569.43 | bwd_allreduce_microstep: 4.41 | step_microstep: 43.64
-[2025-01-25 19:53:50,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.25 | bwd: 4573.94 | bwd_inner: 4569.43 | bwd_allreduce: 4.44 | step: 43.65
- 68%|██████▊   | 3962/5800 [11:07:19<3:31:08,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 1.926046371459961, 'learning_rate': 9.641115681849347e-06, 'epoch': 34.16}
- 68%|██████▊   | 3962/5800 [11:07:19<3:31:08,  6.89s/it]score1 tensor([[0.6367],
-        [0.5742],
-        [0.4707],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6133, 0.4609, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:53:56,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 19:53:56,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.03 | bwd_microstep: 4619.30 | bwd_inner_microstep: 4614.08 | bwd_allreduce_microstep: 5.14 | step_microstep: 44.95
-[2025-01-25 19:53:56,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.00 | bwd: 4619.34 | bwd_inner: 4614.07 | bwd_allreduce: 5.19 | step: 44.96
- 68%|██████▊   | 3963/5800 [11:07:26<3:31:05,  6.89s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.8182997703552246, 'learning_rate': 9.631563925246264e-06, 'epoch': 34.16}
- 68%|██████▊   | 3963/5800 [11:07:26<3:31:05,  6.89s/it]score1 tensor([[0.3770],
-        [0.4707],
-        [0.5273],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.4727, 0.5391, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:03,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 19:54:03,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.62 | bwd_microstep: 4623.49 | bwd_inner_microstep: 4618.62 | bwd_allreduce_microstep: 4.76 | step_microstep: 43.17
-[2025-01-25 19:54:03,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.57 | bwd: 4623.51 | bwd_inner: 4618.62 | bwd_allreduce: 4.81 | step: 43.18
- 68%|██████▊   | 3964/5800 [11:07:33<3:31:02,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 7.593437671661377, 'learning_rate': 9.62201540170145e-06, 'epoch': 34.17}
- 68%|██████▊   | 3964/5800 [11:07:33<3:31:02,  6.90s/it]score1 tensor([[0.4707],
-        [0.5117],
-        [0.5352],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5039, 0.5508, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:10,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.34 | optimizer_step: 4.36
-[2025-01-25 19:54:10,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.71 | bwd_microstep: 4623.47 | bwd_inner_microstep: 4618.70 | bwd_allreduce_microstep: 4.69 | step_microstep: 42.79
-[2025-01-25 19:54:10,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.67 | bwd: 4623.50 | bwd_inner: 4618.70 | bwd_allreduce: 4.73 | step: 42.80
- 68%|██████▊   | 3965/5800 [11:07:40<3:30:56,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.3822228014469147, 'learning_rate': 9.612470114192314e-06, 'epoch': 34.18}
- 68%|██████▊   | 3965/5800 [11:07:40<3:30:56,  6.90s/it]score1 tensor([[0.4219],
-        [0.5430],
-        [0.4980],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.5312, 0.4707, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:17,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 19:54:17,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.80 | bwd_microstep: 4618.11 | bwd_inner_microstep: 4613.27 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.33
-[2025-01-25 19:54:17,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.75 | bwd: 4618.14 | bwd_inner: 4613.27 | bwd_allreduce: 4.79 | step: 43.34
- 68%|██████▊   | 3966/5800 [11:07:47<3:30:49,  6.90s/it]                                                        {'loss': 0.0137, 'grad_norm': 3.999858856201172, 'learning_rate': 9.602928065695229e-06, 'epoch': 34.19}
- 68%|██████▊   | 3966/5800 [11:07:47<3:30:49,  6.90s/it]score1 tensor([[0.5039],
-        [0.4355],
-        [0.5703],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4297, 0.5586, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:24,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:54:24,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.46 | bwd_microstep: 4623.20 | bwd_inner_microstep: 4617.97 | bwd_allreduce_microstep: 5.11 | step_microstep: 43.83
-[2025-01-25 19:54:24,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.42 | bwd: 4623.23 | bwd_inner: 4617.97 | bwd_allreduce: 5.17 | step: 43.83
- 68%|██████▊   | 3967/5800 [11:07:54<3:30:42,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 4.1573991775512695, 'learning_rate': 9.593389259185585e-06, 'epoch': 34.2}
- 68%|██████▊   | 3967/5800 [11:07:54<3:30:42,  6.90s/it]score1 tensor([[0.6094],
-        [0.5625],
-        [0.4766],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5625, 0.4805, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:31,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 19:54:31,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.02 | bwd_microstep: 4541.51 | bwd_inner_microstep: 4536.89 | bwd_allreduce_microstep: 4.54 | step_microstep: 42.75
-[2025-01-25 19:54:31,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.98 | bwd: 4541.54 | bwd_inner: 4536.89 | bwd_allreduce: 4.58 | step: 42.75
- 68%|██████▊   | 3968/5800 [11:08:01<3:29:55,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.3303298354148865, 'learning_rate': 9.583853697637734e-06, 'epoch': 34.21}
- 68%|██████▊   | 3968/5800 [11:08:01<3:29:55,  6.88s/it]score1 tensor([[0.5781],
-        [0.4551],
-        [0.4473],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4609, 0.4473, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:38,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 19:54:38,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.54 | bwd_microstep: 4576.19 | bwd_inner_microstep: 4571.57 | bwd_allreduce_microstep: 4.53 | step_microstep: 45.48
-[2025-01-25 19:54:38,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.49 | bwd: 4576.22 | bwd_inner: 4571.57 | bwd_allreduce: 4.58 | step: 45.49
- 68%|██████▊   | 3969/5800 [11:08:08<3:29:35,  6.87s/it]                                                        {'loss': 0.0112, 'grad_norm': 2.15407133102417, 'learning_rate': 9.574321384025043e-06, 'epoch': 34.22}
- 68%|██████▊   | 3969/5800 [11:08:08<3:29:35,  6.87s/it]score1 tensor([[0.5820],
-        [0.5273],
-        [0.4004],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5273, 0.4062, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:45,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:54:45,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.37 | bwd_microstep: 4566.71 | bwd_inner_microstep: 4561.88 | bwd_allreduce_microstep: 4.74 | step_microstep: 42.73
-[2025-01-25 19:54:45,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.34 | bwd: 4566.73 | bwd_inner: 4561.88 | bwd_allreduce: 4.79 | step: 42.73
- 68%|██████▊   | 3970/5800 [11:08:14<3:29:13,  6.86s/it]                                                        {'loss': 0.0093, 'grad_norm': 6.315732002258301, 'learning_rate': 9.564792321319846e-06, 'epoch': 34.22}
- 68%|██████▊   | 3970/5800 [11:08:14<3:29:13,  6.86s/it]score1 tensor([[0.4355],
-        [0.4863],
-        [0.6250],
-        [0.6641]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4805, 0.6250, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:51,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 19:54:51,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.17 | bwd_microstep: 4577.60 | bwd_inner_microstep: 4572.70 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.40
-[2025-01-25 19:54:51,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4577.62 | bwd_inner: 4572.70 | bwd_allreduce: 4.85 | step: 42.41
- 68%|██████▊   | 3971/5800 [11:08:21<3:29:08,  6.86s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.567655324935913, 'learning_rate': 9.555266512493461e-06, 'epoch': 34.23}
- 68%|██████▊   | 3971/5800 [11:08:21<3:29:08,  6.86s/it]score1 tensor([[0.5078],
-        [0.4941],
-        [0.3828],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4980, 0.3809, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:54:58,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 19:54:58,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.55 | bwd_microstep: 4625.46 | bwd_inner_microstep: 4620.65 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.44
-[2025-01-25 19:54:58,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.52 | bwd: 4625.48 | bwd_inner: 4620.65 | bwd_allreduce: 4.76 | step: 42.45
- 68%|██████▊   | 3972/5800 [11:08:28<3:29:23,  6.87s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.7917661666870117, 'learning_rate': 9.545743960516218e-06, 'epoch': 34.24}
- 68%|██████▊   | 3972/5800 [11:08:28<3:29:23,  6.87s/it]score1 tensor([[0.5508],
-        [0.3887],
-        [0.5156],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3750, 0.4961, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:05,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 19:55:05,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.18 | bwd_microstep: 4617.57 | bwd_inner_microstep: 4612.07 | bwd_allreduce_microstep: 5.39 | step_microstep: 45.14
-[2025-01-25 19:55:05,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.15 | bwd: 4617.60 | bwd_inner: 4612.07 | bwd_allreduce: 5.44 | step: 45.14
- 68%|██████▊   | 3973/5800 [11:08:35<3:29:29,  6.88s/it]                                                        {'loss': 0.0156, 'grad_norm': 3.680342674255371, 'learning_rate': 9.5362246683574e-06, 'epoch': 34.25}
- 68%|██████▊   | 3973/5800 [11:08:35<3:29:29,  6.88s/it]score1 tensor([[0.4961],
-        [0.4492],
-        [0.4453],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4590, 0.4473, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:12,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:55:12,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4623.65 | bwd_inner_microstep: 4618.73 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.03
-[2025-01-25 19:55:12,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.06 | bwd: 4623.67 | bwd_inner: 4618.73 | bwd_allreduce: 4.87 | step: 44.03
- 69%|██████▊   | 3974/5800 [11:08:42<3:29:33,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.647814989089966, 'learning_rate': 9.526708638985296e-06, 'epoch': 34.26}
- 69%|██████▊   | 3974/5800 [11:08:42<3:29:33,  6.89s/it]score1 tensor([[0.3984],
-        [0.4844],
-        [0.4941],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4785, 0.5195, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:19,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 19:55:19,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.12 | bwd_microstep: 4613.72 | bwd_inner_microstep: 4608.99 | bwd_allreduce_microstep: 4.65 | step_microstep: 45.72
-[2025-01-25 19:55:19,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.08 | bwd: 4613.75 | bwd_inner: 4608.99 | bwd_allreduce: 4.69 | step: 45.73
- 69%|██████▊   | 3975/5800 [11:08:49<3:29:29,  6.89s/it]                                                        {'loss': 0.0132, 'grad_norm': 4.114506721496582, 'learning_rate': 9.517195875367167e-06, 'epoch': 34.27}
- 69%|██████▊   | 3975/5800 [11:08:49<3:29:29,  6.89s/it]score1 tensor([[0.6602],
-        [0.5352],
-        [0.4570],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5391, 0.4648, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:26,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 19:55:26,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.49 | bwd_microstep: 4617.85 | bwd_inner_microstep: 4613.10 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.87
-[2025-01-25 19:55:26,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.45 | bwd: 4617.88 | bwd_inner: 4613.10 | bwd_allreduce: 4.71 | step: 42.88
- 69%|██████▊   | 3976/5800 [11:08:56<3:29:28,  6.89s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.4129061698913574, 'learning_rate': 9.50768638046925e-06, 'epoch': 34.28}
- 69%|██████▊   | 3976/5800 [11:08:56<3:29:28,  6.89s/it]score1 tensor([[0.4863],
-        [0.4180],
-        [0.4414],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4004, 0.4492, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:33,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 19:55:33,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.30 | bwd_microstep: 4620.55 | bwd_inner_microstep: 4614.77 | bwd_allreduce_microstep: 5.69 | step_microstep: 50.74
-[2025-01-25 19:55:33,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.26 | bwd: 4620.58 | bwd_inner: 4614.77 | bwd_allreduce: 5.74 | step: 50.74
- 69%|██████▊   | 3977/5800 [11:09:03<3:29:30,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.6992385387420654, 'learning_rate': 9.498180157256785e-06, 'epoch': 34.28}
- 69%|██████▊   | 3977/5800 [11:09:03<3:29:30,  6.90s/it]score1 tensor([[0.5586],
-        [0.4531],
-        [0.4531],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4668, 0.4531, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:40,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 19:55:40,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.64 | bwd_microstep: 4543.07 | bwd_inner_microstep: 4537.88 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.29
-[2025-01-25 19:55:40,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.60 | bwd: 4543.10 | bwd_inner: 4537.88 | bwd_allreduce: 5.13 | step: 43.30
- 69%|██████▊   | 3978/5800 [11:09:10<3:28:45,  6.87s/it]                                                        {'loss': 0.0044, 'grad_norm': 4.261242389678955, 'learning_rate': 9.48867720869396e-06, 'epoch': 34.29}
- 69%|██████▊   | 3978/5800 [11:09:10<3:28:45,  6.87s/it]score1 tensor([[0.5312],
-        [0.4629],
-        [0.4180],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4570, 0.4160, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:47,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:55:47,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.71 | bwd_microstep: 4624.25 | bwd_inner_microstep: 4619.71 | bwd_allreduce_microstep: 4.44 | step_microstep: 42.49
-[2025-01-25 19:55:47,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.65 | bwd: 4624.28 | bwd_inner: 4619.71 | bwd_allreduce: 4.49 | step: 42.49
- 69%|██████▊   | 3979/5800 [11:09:16<3:28:56,  6.88s/it]                                                        {'loss': 0.0107, 'grad_norm': 7.654604911804199, 'learning_rate': 9.479177537743977e-06, 'epoch': 34.3}
- 69%|██████▊   | 3979/5800 [11:09:16<3:28:56,  6.88s/it]score1 tensor([[0.4453],
-        [0.5195],
-        [0.4434],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.5156, 0.4492, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:55:53,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 19:55:53,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.72 | bwd_microstep: 4567.74 | bwd_inner_microstep: 4562.93 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.45
-[2025-01-25 19:55:53,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.69 | bwd: 4567.77 | bwd_inner: 4562.93 | bwd_allreduce: 4.76 | step: 42.46
- 69%|██████▊   | 3980/5800 [11:09:23<3:28:25,  6.87s/it]                                                        {'loss': 0.0151, 'grad_norm': 1.9906270503997803, 'learning_rate': 9.469681147368982e-06, 'epoch': 34.31}
- 69%|██████▊   | 3980/5800 [11:09:23<3:28:25,  6.87s/it]score1 tensor([[0.4453],
-        [0.5547],
-        [0.3477],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4414, 0.5430, 0.3438, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:00,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 19:56:00,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.65 | bwd_microstep: 4622.41 | bwd_inner_microstep: 4616.73 | bwd_allreduce_microstep: 5.57 | step_microstep: 45.38
-[2025-01-25 19:56:00,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.62 | bwd: 4622.43 | bwd_inner: 4616.73 | bwd_allreduce: 5.62 | step: 45.38
- 69%|██████▊   | 3981/5800 [11:09:30<3:28:33,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 7.711849212646484, 'learning_rate': 9.460188040530132e-06, 'epoch': 34.32}
- 69%|██████▊   | 3981/5800 [11:09:30<3:28:33,  6.88s/it]score1 tensor([[0.4004],
-        [0.5664],
-        [0.5547],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5664, 0.5625, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:07,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 19:56:07,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.86 | bwd_microstep: 4538.68 | bwd_inner_microstep: 4533.79 | bwd_allreduce_microstep: 4.77 | step_microstep: 54.75
-[2025-01-25 19:56:07,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.82 | bwd: 4538.70 | bwd_inner: 4533.79 | bwd_allreduce: 4.83 | step: 54.75
- 69%|██████▊   | 3982/5800 [11:09:37<3:28:01,  6.87s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.6587123274803162, 'learning_rate': 9.450698220187528e-06, 'epoch': 34.33}
- 69%|██████▊   | 3982/5800 [11:09:37<3:28:01,  6.87s/it]score1 tensor([[0.4609],
-        [0.4805],
-        [0.3711],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4629, 0.3672, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:14,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 19:56:14,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.28 | bwd_microstep: 4616.22 | bwd_inner_microstep: 4611.53 | bwd_allreduce_microstep: 4.59 | step_microstep: 42.46
-[2025-01-25 19:56:14,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.24 | bwd: 4616.24 | bwd_inner: 4611.53 | bwd_allreduce: 4.64 | step: 42.47
- 69%|██████▊   | 3983/5800 [11:09:44<3:28:08,  6.87s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.49182671308517456, 'learning_rate': 9.441211689300263e-06, 'epoch': 34.34}
- 69%|██████▊   | 3983/5800 [11:09:44<3:28:08,  6.87s/it]score1 tensor([[0.6328],
-        [0.3984],
-        [0.3965],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4141, 0.3730, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:21,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 19:56:21,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.80 | bwd_microstep: 4582.90 | bwd_inner_microstep: 4577.88 | bwd_allreduce_microstep: 4.93 | step_microstep: 45.43
-[2025-01-25 19:56:21,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.76 | bwd: 4582.93 | bwd_inner: 4577.88 | bwd_allreduce: 4.97 | step: 45.44
- 69%|██████▊   | 3984/5800 [11:09:51<3:27:57,  6.87s/it]                                                        {'loss': 0.0127, 'grad_norm': 2.3450379371643066, 'learning_rate': 9.431728450826408e-06, 'epoch': 34.34}
- 69%|██████▊   | 3984/5800 [11:09:51<3:27:57,  6.87s/it]score1 tensor([[0.4258],
-        [0.3848],
-        [0.4570],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.3789, 0.4512, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:28,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 19:56:28,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.45 | bwd_microstep: 4611.43 | bwd_inner_microstep: 4606.73 | bwd_allreduce_microstep: 4.62 | step_microstep: 43.34
-[2025-01-25 19:56:28,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.41 | bwd: 4611.46 | bwd_inner: 4606.73 | bwd_allreduce: 4.66 | step: 43.35
- 69%|██████▊   | 3985/5800 [11:09:58<3:28:00,  6.88s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.845548629760742, 'learning_rate': 9.422248507722991e-06, 'epoch': 34.35}
- 69%|██████▊   | 3985/5800 [11:09:58<3:28:00,  6.88s/it]score1 tensor([[0.5469],
-        [0.5117],
-        [0.4238],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5352, 0.4199, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:35,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:56:35,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.90 | bwd_microstep: 4617.83 | bwd_inner_microstep: 4613.03 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.68
-[2025-01-25 19:56:35,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.86 | bwd: 4617.85 | bwd_inner: 4613.02 | bwd_allreduce: 4.76 | step: 43.69
- 69%|██████▊   | 3986/5800 [11:10:05<3:28:04,  6.88s/it]                                                        {'loss': 0.0132, 'grad_norm': 4.344283103942871, 'learning_rate': 9.412771862946038e-06, 'epoch': 34.36}
- 69%|██████▊   | 3986/5800 [11:10:05<3:28:04,  6.88s/it]score1 tensor([[0.4570],
-        [0.6016],
-        [0.5195],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.6016, 0.5117, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:41,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 19:56:41,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.02 | bwd_microstep: 4565.65 | bwd_inner_microstep: 4560.05 | bwd_allreduce_microstep: 5.45 | step_microstep: 44.26
-[2025-01-25 19:56:41,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.99 | bwd: 4565.68 | bwd_inner: 4560.05 | bwd_allreduce: 5.55 | step: 44.26
- 69%|██████▊   | 3987/5800 [11:10:11<3:27:37,  6.87s/it]                                                        {'loss': 0.0044, 'grad_norm': 1.9050604104995728, 'learning_rate': 9.403298519450514e-06, 'epoch': 34.37}
- 69%|██████▊   | 3987/5800 [11:10:11<3:27:37,  6.87s/it]score1 tensor([[0.5469],
-        [0.5430],
-        [0.5664],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5469, 0.5703, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:48,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 19:56:48,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.35 | bwd_microstep: 4562.95 | bwd_inner_microstep: 4558.23 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.42
-[2025-01-25 19:56:48,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.32 | bwd: 4562.97 | bwd_inner: 4558.23 | bwd_allreduce: 4.64 | step: 42.44
- 69%|██████▉   | 3988/5800 [11:10:18<3:27:11,  6.86s/it]                                                        {'loss': 0.0068, 'grad_norm': 1.9902667999267578, 'learning_rate': 9.393828480190387e-06, 'epoch': 34.38}
- 69%|██████▉   | 3988/5800 [11:10:18<3:27:11,  6.86s/it]score1 tensor([[0.4297],
-        [0.4121],
-        [0.6250],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.4258, 0.6094, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:56:55,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 19:56:55,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.27 | bwd_microstep: 4620.56 | bwd_inner_microstep: 4615.87 | bwd_allreduce_microstep: 4.58 | step_microstep: 43.07
-[2025-01-25 19:56:55,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.23 | bwd: 4620.58 | bwd_inner: 4615.87 | bwd_allreduce: 4.63 | step: 43.07
- 69%|██████▉   | 3989/5800 [11:10:25<3:27:22,  6.87s/it]                                                        {'loss': 0.0127, 'grad_norm': 3.4986484050750732, 'learning_rate': 9.384361748118567e-06, 'epoch': 34.39}
- 69%|██████▉   | 3989/5800 [11:10:25<3:27:22,  6.87s/it]score1 tensor([[0.4668],
-        [0.4609],
-        [0.5352],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4844, 0.5391, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:02,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 19:57:02,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.59 | bwd_microstep: 4612.56 | bwd_inner_microstep: 4607.73 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.26
-[2025-01-25 19:57:02,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.56 | bwd: 4612.59 | bwd_inner: 4607.73 | bwd_allreduce: 4.79 | step: 42.26
- 69%|██████▉   | 3990/5800 [11:10:32<3:27:26,  6.88s/it]                                                        {'loss': 0.0098, 'grad_norm': 4.137485504150391, 'learning_rate': 9.374898326186959e-06, 'epoch': 34.4}
- 69%|██████▉   | 3990/5800 [11:10:32<3:27:26,  6.88s/it]score1 tensor([[0.4766],
-        [0.4805],
-        [0.4238],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4785, 0.4277, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:09,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 19:57:09,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.89 | bwd_microstep: 4628.36 | bwd_inner_microstep: 4622.88 | bwd_allreduce_microstep: 5.34 | step_microstep: 43.54
-[2025-01-25 19:57:09,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.85 | bwd: 4628.39 | bwd_inner: 4622.88 | bwd_allreduce: 5.43 | step: 43.55
- 69%|██████▉   | 3991/5800 [11:10:39<3:27:36,  6.89s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.7272798418998718, 'learning_rate': 9.365438217346405e-06, 'epoch': 34.41}
- 69%|██████▉   | 3991/5800 [11:10:39<3:27:36,  6.89s/it]score1 tensor([[0.6602],
-        [0.3652],
-        [0.5195],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3613, 0.5195, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:16,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 19:57:16,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.24 | bwd_microstep: 4580.83 | bwd_inner_microstep: 4575.96 | bwd_allreduce_microstep: 4.79 | step_microstep: 44.34
-[2025-01-25 19:57:16,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.22 | bwd: 4580.86 | bwd_inner: 4575.96 | bwd_allreduce: 4.83 | step: 44.34
- 69%|██████▉   | 3992/5800 [11:10:46<3:27:14,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.255170822143555, 'learning_rate': 9.35598142454675e-06, 'epoch': 34.41}
- 69%|██████▉   | 3992/5800 [11:10:46<3:27:14,  6.88s/it]score1 tensor([[0.4824],
-        [0.5742],
-        [0.4805],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5781, 0.4863, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:23,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 19:57:23,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.32 | bwd_microstep: 4613.88 | bwd_inner_microstep: 4609.31 | bwd_allreduce_microstep: 4.49 | step_microstep: 43.35
-[2025-01-25 19:57:23,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.27 | bwd: 4613.91 | bwd_inner: 4609.31 | bwd_allreduce: 4.53 | step: 43.36
- 69%|██████▉   | 3993/5800 [11:10:53<3:27:15,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 4.236607074737549, 'learning_rate': 9.346527950736768e-06, 'epoch': 34.42}
- 69%|██████▉   | 3993/5800 [11:10:53<3:27:15,  6.88s/it]score1 tensor([[0.4434],
-        [0.4453],
-        [0.4570],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4395, 0.4512, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:30,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 19:57:30,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.69 | bwd_microstep: 4624.74 | bwd_inner_microstep: 4619.33 | bwd_allreduce_microstep: 5.33 | step_microstep: 49.22
-[2025-01-25 19:57:30,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4624.76 | bwd_inner: 4619.33 | bwd_allreduce: 5.37 | step: 49.23
- 69%|██████▉   | 3994/5800 [11:11:00<3:27:27,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.9461560249328613, 'learning_rate': 9.337077798864231e-06, 'epoch': 34.43}
- 69%|██████▉   | 3994/5800 [11:11:00<3:27:27,  6.89s/it]score1 tensor([[0.5977],
-        [0.6875],
-        [0.5469],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6875, 0.5430, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:37,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 19:57:37,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.91 | bwd_microstep: 4568.91 | bwd_inner_microstep: 4563.15 | bwd_allreduce_microstep: 5.65 | step_microstep: 46.15
-[2025-01-25 19:57:37,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.87 | bwd: 4568.94 | bwd_inner: 4563.15 | bwd_allreduce: 5.71 | step: 46.16
- 69%|██████▉   | 3995/5800 [11:11:06<3:27:01,  6.88s/it]                                                        {'loss': 0.0127, 'grad_norm': 2.2820892333984375, 'learning_rate': 9.32763097187585e-06, 'epoch': 34.44}
- 69%|██████▉   | 3995/5800 [11:11:06<3:27:01,  6.88s/it]score1 tensor([[0.4668],
-        [0.5820],
-        [0.4746],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5703, 0.4863, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:43,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 19:57:43,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4610.47 | bwd_inner_microstep: 4605.41 | bwd_allreduce_microstep: 4.97 | step_microstep: 46.82
-[2025-01-25 19:57:43,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.54 | bwd: 4610.49 | bwd_inner: 4605.41 | bwd_allreduce: 5.02 | step: 46.83
- 69%|██████▉   | 3996/5800 [11:11:13<3:27:02,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 0.4850969910621643, 'learning_rate': 9.318187472717319e-06, 'epoch': 34.45}
- 69%|██████▉   | 3996/5800 [11:11:13<3:27:02,  6.89s/it]score1 tensor([[0.3281],
-        [0.4258],
-        [0.3926],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.4238, 0.4043, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:50,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 19:57:50,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.97 | bwd_microstep: 4613.41 | bwd_inner_microstep: 4608.04 | bwd_allreduce_microstep: 5.25 | step_microstep: 43.07
-[2025-01-25 19:57:50,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.94 | bwd: 4613.44 | bwd_inner: 4608.04 | bwd_allreduce: 5.32 | step: 43.10
- 69%|██████▉   | 3997/5800 [11:11:20<3:27:00,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.49966147541999817, 'learning_rate': 9.308747304333273e-06, 'epoch': 34.46}
- 69%|██████▉   | 3997/5800 [11:11:20<3:27:00,  6.89s/it]score1 tensor([[0.5430],
-        [0.6523],
-        [0.5625],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6367, 0.5508, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:57:57,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 19:57:57,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.45 | bwd_microstep: 4571.32 | bwd_inner_microstep: 4566.38 | bwd_allreduce_microstep: 4.87 | step_microstep: 47.00
-[2025-01-25 19:57:57,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.41 | bwd: 4571.34 | bwd_inner: 4566.38 | bwd_allreduce: 4.90 | step: 47.01
- 69%|██████▉   | 3998/5800 [11:11:27<3:26:37,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 6.68340539932251, 'learning_rate': 9.299310469667333e-06, 'epoch': 34.47}
- 69%|██████▉   | 3998/5800 [11:11:27<3:26:37,  6.88s/it]score1 tensor([[0.5273],
-        [0.4844],
-        [0.4512],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5000, 0.4336, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:04,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 19:58:04,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4621.29 | bwd_inner_microstep: 4616.38 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.09
-[2025-01-25 19:58:04,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.13 | bwd: 4621.32 | bwd_inner: 4616.38 | bwd_allreduce: 4.87 | step: 43.10
- 69%|██████▉   | 3999/5800 [11:11:34<3:26:51,  6.89s/it]                                                        {'loss': 0.0122, 'grad_norm': 4.143454551696777, 'learning_rate': 9.289876971662055e-06, 'epoch': 34.47}
- 69%|██████▉   | 3999/5800 [11:11:34<3:26:51,  6.89s/it]score1 tensor([[0.4883],
-        [0.6367],
-        [0.4727],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6445, 0.4883, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0225, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:11,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 19:58:11,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.30 | bwd_microstep: 4614.59 | bwd_inner_microstep: 4609.28 | bwd_allreduce_microstep: 5.20 | step_microstep: 43.75
-[2025-01-25 19:58:11,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.24 | bwd: 4614.62 | bwd_inner: 4609.28 | bwd_allreduce: 5.26 | step: 43.75
- 69%|██████▉   | 4000/5800 [11:11:41<3:26:46,  6.89s/it]                                                        {'loss': 0.0225, 'grad_norm': 8.199280738830566, 'learning_rate': 9.280446813258982e-06, 'epoch': 34.48}
- 69%|██████▉   | 4000/5800 [11:11:41<3:26:46,  6.89s/it]score1 tensor([[0.4785],
-        [0.6445],
-        [0.4746],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6367, 0.4785, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:18,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 19:58:18,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.45 | bwd_microstep: 4623.42 | bwd_inner_microstep: 4618.10 | bwd_allreduce_microstep: 5.22 | step_microstep: 43.91
-[2025-01-25 19:58:18,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.40 | bwd: 4623.45 | bwd_inner: 4618.10 | bwd_allreduce: 5.27 | step: 43.92
- 69%|██████▉   | 4001/5800 [11:11:48<3:26:44,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.0096423625946045, 'learning_rate': 9.271019997398592e-06, 'epoch': 34.49}
- 69%|██████▉   | 4001/5800 [11:11:48<3:26:44,  6.90s/it]score1 tensor([[0.4941],
-        [0.5117],
-        [0.4570],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5273, 0.4707, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:25,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 19:58:25,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.21 | bwd_microstep: 4582.85 | bwd_inner_microstep: 4576.54 | bwd_allreduce_microstep: 6.18 | step_microstep: 47.62
-[2025-01-25 19:58:25,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.18 | bwd: 4582.89 | bwd_inner: 4576.54 | bwd_allreduce: 6.24 | step: 47.64
- 69%|██████▉   | 4002/5800 [11:11:55<3:26:23,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 5.925253391265869, 'learning_rate': 9.261596527020324e-06, 'epoch': 34.5}
- 69%|██████▉   | 4002/5800 [11:11:55<3:26:23,  6.89s/it]score1 tensor([[0.5430],
-        [0.4727],
-        [0.4512],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4922, 0.4492, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:32,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 19:58:32,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.60 | bwd_microstep: 4565.50 | bwd_inner_microstep: 4560.44 | bwd_allreduce_microstep: 4.90 | step_microstep: 47.99
-[2025-01-25 19:58:32,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.56 | bwd: 4565.53 | bwd_inner: 4560.44 | bwd_allreduce: 4.97 | step: 47.99
- 69%|██████▉   | 4003/5800 [11:12:02<3:26:04,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 1.902819275856018, 'learning_rate': 9.252176405062594e-06, 'epoch': 34.51}
- 69%|██████▉   | 4003/5800 [11:12:02<3:26:04,  6.88s/it]score1 tensor([[0.5312],
-        [0.5195],
-        [0.4355],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5273, 0.4355, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:38,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.34 | optimizer_step: 4.36
-[2025-01-25 19:58:38,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.19 | bwd_microstep: 4574.31 | bwd_inner_microstep: 4569.21 | bwd_allreduce_microstep: 4.96 | step_microstep: 47.90
-[2025-01-25 19:58:38,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.15 | bwd: 4574.33 | bwd_inner: 4569.21 | bwd_allreduce: 5.04 | step: 47.91
- 69%|██████▉   | 4004/5800 [11:12:08<3:25:47,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.364138126373291, 'learning_rate': 9.242759634462743e-06, 'epoch': 34.52}
- 69%|██████▉   | 4004/5800 [11:12:08<3:25:47,  6.88s/it]score1 tensor([[0.4141],
-        [0.6406],
-        [0.5508],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.6328, 0.5508, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:45,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 19:58:45,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.49 | bwd_microstep: 4578.72 | bwd_inner_microstep: 4573.58 | bwd_allreduce_microstep: 5.05 | step_microstep: 44.14
-[2025-01-25 19:58:45,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.44 | bwd: 4578.74 | bwd_inner: 4573.58 | bwd_allreduce: 5.09 | step: 44.15
- 69%|██████▉   | 4005/5800 [11:12:15<3:25:34,  6.87s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.1324212551116943, 'learning_rate': 9.233346218157102e-06, 'epoch': 34.53}
- 69%|██████▉   | 4005/5800 [11:12:15<3:25:34,  6.87s/it]score1 tensor([[0.6055],
-        [0.3418],
-        [0.5195],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.3457, 0.5117, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:52,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.11 | optimizer_step: 4.36
-[2025-01-25 19:58:52,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.85 | bwd_microstep: 4612.92 | bwd_inner_microstep: 4607.49 | bwd_allreduce_microstep: 5.32 | step_microstep: 45.23
-[2025-01-25 19:58:52,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.81 | bwd: 4612.95 | bwd_inner: 4607.49 | bwd_allreduce: 5.38 | step: 45.24
- 69%|██████▉   | 4006/5800 [11:12:22<3:25:44,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.3834070861339569, 'learning_rate': 9.223936159080926e-06, 'epoch': 34.53}
- 69%|██████▉   | 4006/5800 [11:12:22<3:25:44,  6.88s/it]score1 tensor([[0.4844],
-        [0.5625],
-        [0.4824],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5664, 0.4941, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:58:59,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 19:58:59,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.22 | bwd_microstep: 4623.17 | bwd_inner_microstep: 4617.60 | bwd_allreduce_microstep: 5.47 | step_microstep: 45.14
-[2025-01-25 19:58:59,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.19 | bwd: 4623.19 | bwd_inner: 4617.60 | bwd_allreduce: 5.51 | step: 45.14
- 69%|██████▉   | 4007/5800 [11:12:29<3:25:54,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 8.374587059020996, 'learning_rate': 9.21452946016843e-06, 'epoch': 34.54}
- 69%|██████▉   | 4007/5800 [11:12:29<3:25:54,  6.89s/it]score1 tensor([[0.4648],
-        [0.5234],
-        [0.6016],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.5156, 0.6094, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:06,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 19:59:06,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.33 | bwd_microstep: 4618.86 | bwd_inner_microstep: 4613.77 | bwd_allreduce_microstep: 5.01 | step_microstep: 46.76
-[2025-01-25 19:59:06,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.30 | bwd: 4618.89 | bwd_inner: 4613.77 | bwd_allreduce: 5.05 | step: 46.77
- 69%|██████▉   | 4008/5800 [11:12:36<3:25:55,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.036675453186035, 'learning_rate': 9.2051261243528e-06, 'epoch': 34.55}
- 69%|██████▉   | 4008/5800 [11:12:36<3:25:55,  6.89s/it]score1 tensor([[0.4824],
-        [0.4473],
-        [0.4688],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4473, 0.4609, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:13,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 7.91
-[2025-01-25 19:59:13,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.98 | bwd_microstep: 4570.17 | bwd_inner_microstep: 4565.11 | bwd_allreduce_microstep: 4.99 | step_microstep: 52.31
-[2025-01-25 19:59:13,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.95 | bwd: 4570.20 | bwd_inner: 4565.11 | bwd_allreduce: 5.02 | step: 52.33
- 69%|██████▉   | 4009/5800 [11:12:43<3:25:27,  6.88s/it]                                                        {'loss': 0.0068, 'grad_norm': 2.042844295501709, 'learning_rate': 9.195726154566146e-06, 'epoch': 34.56}
- 69%|██████▉   | 4009/5800 [11:12:43<3:25:27,  6.88s/it]score1 tensor([[0.3633],
-        [0.5117],
-        [0.6680],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.5195, 0.6562, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:20,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 19:59:20,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.99 | bwd_microstep: 4619.95 | bwd_inner_microstep: 4614.94 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.18
-[2025-01-25 19:59:20,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4619.98 | bwd_inner: 4614.94 | bwd_allreduce: 4.92 | step: 44.18
- 69%|██████▉   | 4010/5800 [11:12:50<3:25:31,  6.89s/it]                                                        {'loss': 0.0137, 'grad_norm': 0.557671308517456, 'learning_rate': 9.186329553739552e-06, 'epoch': 34.57}
- 69%|██████▉   | 4010/5800 [11:12:50<3:25:31,  6.89s/it]score1 tensor([[0.4785],
-        [0.4238],
-        [0.3672],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4199, 0.3340, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:27,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 19:59:27,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.59 | bwd_microstep: 4621.86 | bwd_inner_microstep: 4617.04 | bwd_allreduce_microstep: 4.74 | step_microstep: 47.52
-[2025-01-25 19:59:27,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.55 | bwd: 4621.89 | bwd_inner: 4617.04 | bwd_allreduce: 4.79 | step: 47.53
- 69%|██████▉   | 4011/5800 [11:12:57<3:25:32,  6.89s/it]                                                        {'loss': 0.0142, 'grad_norm': 7.4057722091674805, 'learning_rate': 9.176936324803029e-06, 'epoch': 34.58}
- 69%|██████▉   | 4011/5800 [11:12:57<3:25:32,  6.89s/it]score1 tensor([[0.4316],
-        [0.4492],
-        [0.5117],
-        [0.6914]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4473, 0.4844, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:34,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.43 | optimizer_step: 4.36
-[2025-01-25 19:59:34,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4622.04 | bwd_inner_microstep: 4617.17 | bwd_allreduce_microstep: 4.78 | step_microstep: 40.84
-[2025-01-25 19:59:34,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.95 | bwd: 4622.07 | bwd_inner: 4617.17 | bwd_allreduce: 4.83 | step: 40.84
- 69%|██████▉   | 4012/5800 [11:13:04<3:25:35,  6.90s/it]                                                        {'loss': 0.0107, 'grad_norm': 3.3541581630706787, 'learning_rate': 9.167546470685567e-06, 'epoch': 34.59}
- 69%|██████▉   | 4012/5800 [11:13:04<3:25:35,  6.90s/it]score1 tensor([[0.5664],
-        [0.5820],
-        [0.4766],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.5977, 0.4766, 0.5234], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:40,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 19:59:40,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.40 | bwd_microstep: 4580.74 | bwd_inner_microstep: 4576.72 | bwd_allreduce_microstep: 3.96 | step_microstep: 38.90
-[2025-01-25 19:59:40,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.34 | bwd: 4580.77 | bwd_inner: 4576.72 | bwd_allreduce: 3.99 | step: 38.90
- 69%|██████▉   | 4013/5800 [11:13:10<3:24:58,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.4182233810424805, 'learning_rate': 9.158159994315073e-06, 'epoch': 34.59}
- 69%|██████▉   | 4013/5800 [11:13:10<3:24:58,  6.88s/it]score1 tensor([[0.5586],
-        [0.5039],
-        [0.5664],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4961, 0.5781, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:47,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 19:59:47,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.86 | bwd_microstep: 4616.18 | bwd_inner_microstep: 4610.86 | bwd_allreduce_microstep: 5.22 | step_microstep: 49.32
-[2025-01-25 19:59:47,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.83 | bwd: 4616.21 | bwd_inner: 4610.86 | bwd_allreduce: 5.27 | step: 49.31
- 69%|██████▉   | 4014/5800 [11:13:17<3:25:06,  6.89s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.6157739758491516, 'learning_rate': 9.148776898618408e-06, 'epoch': 34.6}
- 69%|██████▉   | 4014/5800 [11:13:17<3:25:06,  6.89s/it]score1 tensor([[0.5742],
-        [0.1709],
-        [0.3223],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.1787, 0.3086, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 19:59:54,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 19:59:54,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.98 | bwd_microstep: 4567.09 | bwd_inner_microstep: 4561.77 | bwd_allreduce_microstep: 5.23 | step_microstep: 50.88
-[2025-01-25 19:59:54,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.94 | bwd: 4567.11 | bwd_inner: 4561.77 | bwd_allreduce: 5.28 | step: 50.91
- 69%|██████▉   | 4015/5800 [11:13:24<3:24:42,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.4224270582199097, 'learning_rate': 9.139397186521398e-06, 'epoch': 34.61}
- 69%|██████▉   | 4015/5800 [11:13:24<3:24:42,  6.88s/it]score1 tensor([[0.6875],
-        [0.6523],
-        [0.4512],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.6445, 0.4453, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:01,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 20:00:01,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.01 | bwd_microstep: 4627.64 | bwd_inner_microstep: 4622.62 | bwd_allreduce_microstep: 4.92 | step_microstep: 49.00
-[2025-01-25 20:00:01,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.97 | bwd: 4627.67 | bwd_inner: 4622.63 | bwd_allreduce: 4.96 | step: 49.01
- 69%|██████▉   | 4016/5800 [11:13:31<3:24:59,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 4.944509506225586, 'learning_rate': 9.130020860948786e-06, 'epoch': 34.62}
- 69%|██████▉   | 4016/5800 [11:13:31<3:24:59,  6.89s/it]score1 tensor([[0.4395],
-        [0.4922],
-        [0.6055],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4824, 0.6172, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:08,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 20:00:08,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4576.80 | bwd_inner_microstep: 4571.45 | bwd_allreduce_microstep: 5.27 | step_microstep: 49.22
-[2025-01-25 20:00:08,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.68 | bwd: 4576.82 | bwd_inner: 4571.45 | bwd_allreduce: 5.31 | step: 49.25
- 69%|██████▉   | 4017/5800 [11:13:38<3:24:40,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 2.1718952655792236, 'learning_rate': 9.120647924824286e-06, 'epoch': 34.63}
- 69%|██████▉   | 4017/5800 [11:13:38<3:24:40,  6.89s/it]score1 tensor([[0.5156],
-        [0.4531],
-        [0.5312],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4512, 0.5195, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:15,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 20:00:15,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4629.47 | bwd_inner_microstep: 4623.95 | bwd_allreduce_microstep: 5.40 | step_microstep: 44.79
-[2025-01-25 20:00:15,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4629.49 | bwd_inner: 4623.95 | bwd_allreduce: 5.46 | step: 44.80
- 69%|██████▉   | 4018/5800 [11:13:45<3:24:47,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 7.755885124206543, 'learning_rate': 9.111278381070538e-06, 'epoch': 34.64}
- 69%|██████▉   | 4018/5800 [11:13:45<3:24:47,  6.90s/it]score1 tensor([[0.5547],
-        [0.5156],
-        [0.5312],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5039, 0.5625, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:22,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 20:00:22,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.13 | bwd_microstep: 4629.43 | bwd_inner_microstep: 4624.13 | bwd_allreduce_microstep: 5.20 | step_microstep: 45.69
-[2025-01-25 20:00:22,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.09 | bwd: 4629.46 | bwd_inner: 4624.13 | bwd_allreduce: 5.25 | step: 45.69
- 69%|██████▉   | 4019/5800 [11:13:52<3:24:58,  6.91s/it]                                                        {'loss': 0.0215, 'grad_norm': 4.276665687561035, 'learning_rate': 9.101912232609118e-06, 'epoch': 34.65}
- 69%|██████▉   | 4019/5800 [11:13:52<3:24:58,  6.91s/it]score1 tensor([[0.6445],
-        [0.6602],
-        [0.4160],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.6484, 0.3984, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:29,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 20:00:29,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.07 | bwd_microstep: 4621.34 | bwd_inner_microstep: 4615.81 | bwd_allreduce_microstep: 5.42 | step_microstep: 46.63
-[2025-01-25 20:00:29,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.00 | bwd: 4621.37 | bwd_inner: 4615.82 | bwd_allreduce: 5.47 | step: 46.64
- 69%|██████▉   | 4020/5800 [11:13:59<3:24:55,  6.91s/it]                                                        {'loss': 0.0161, 'grad_norm': 4.044004440307617, 'learning_rate': 9.092549482360571e-06, 'epoch': 34.66}
- 69%|██████▉   | 4020/5800 [11:13:59<3:24:55,  6.91s/it]score1 tensor([[0.4609],
-        [0.5508],
-        [0.4102],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5391, 0.4023, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:36,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 20:00:36,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.89 | bwd_microstep: 4576.10 | bwd_inner_microstep: 4570.61 | bwd_allreduce_microstep: 5.40 | step_microstep: 47.35
-[2025-01-25 20:00:36,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.86 | bwd: 4576.13 | bwd_inner: 4570.61 | bwd_allreduce: 5.45 | step: 47.35
- 69%|██████▉   | 4021/5800 [11:14:06<3:24:24,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 1.7679539918899536, 'learning_rate': 9.083190133244348e-06, 'epoch': 34.66}
- 69%|██████▉   | 4021/5800 [11:14:06<3:24:24,  6.89s/it]score1 tensor([[0.3535],
-        [0.6094],
-        [0.4922],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3398, 0.6172, 0.5117, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:43,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 20:00:43,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.99 | bwd_microstep: 4628.62 | bwd_inner_microstep: 4623.50 | bwd_allreduce_microstep: 5.00 | step_microstep: 48.21
-[2025-01-25 20:00:43,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.96 | bwd: 4628.64 | bwd_inner: 4623.50 | bwd_allreduce: 5.06 | step: 48.21
- 69%|██████▉   | 4022/5800 [11:14:13<3:24:30,  6.90s/it]                                                        {'loss': 0.0132, 'grad_norm': 4.686152935028076, 'learning_rate': 9.073834188178872e-06, 'epoch': 34.67}
- 69%|██████▉   | 4022/5800 [11:14:13<3:24:30,  6.90s/it]score1 tensor([[0.5469],
-        [0.4512],
-        [0.4648],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4434, 0.4648, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:49,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:00:49,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.92 | bwd_microstep: 4586.01 | bwd_inner_microstep: 4580.70 | bwd_allreduce_microstep: 5.20 | step_microstep: 44.91
-[2025-01-25 20:00:49,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.88 | bwd: 4586.03 | bwd_inner: 4580.70 | bwd_allreduce: 5.26 | step: 44.92
- 69%|██████▉   | 4023/5800 [11:14:19<3:24:09,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 2.389495849609375, 'learning_rate': 9.064481650081476e-06, 'epoch': 34.68}
- 69%|██████▉   | 4023/5800 [11:14:19<3:24:09,  6.89s/it]score1 tensor([[0.5938],
-        [0.3633],
-        [0.6094],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.3652, 0.6172, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:00:56,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 20:00:56,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.37 | bwd_microstep: 4623.98 | bwd_inner_microstep: 4618.24 | bwd_allreduce_microstep: 5.66 | step_microstep: 51.60
-[2025-01-25 20:00:56,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.32 | bwd: 4624.00 | bwd_inner: 4618.24 | bwd_allreduce: 5.70 | step: 51.61
- 69%|██████▉   | 4024/5800 [11:14:26<3:24:16,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.5742475986480713, 'learning_rate': 9.055132521868452e-06, 'epoch': 34.69}
- 69%|██████▉   | 4024/5800 [11:14:26<3:24:16,  6.90s/it]score1 tensor([[0.5977],
-        [0.5273],
-        [0.3633],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5273, 0.3750, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:03,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:01:03,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.57 | bwd_microstep: 4572.54 | bwd_inner_microstep: 4567.26 | bwd_allreduce_microstep: 5.14 | step_microstep: 44.04
-[2025-01-25 20:01:03,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.53 | bwd: 4572.56 | bwd_inner: 4567.26 | bwd_allreduce: 5.21 | step: 44.05
- 69%|██████▉   | 4025/5800 [11:14:33<3:23:50,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.206804275512695, 'learning_rate': 9.045786806455015e-06, 'epoch': 34.7}
- 69%|██████▉   | 4025/5800 [11:14:33<3:23:50,  6.89s/it]score1 tensor([[0.4355],
-        [0.4355],
-        [0.5391],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4453, 0.5352, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:10,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 20:01:10,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2168.19 | bwd_microstep: 4622.35 | bwd_inner_microstep: 4617.32 | bwd_allreduce_microstep: 4.94 | step_microstep: 45.09
-[2025-01-25 20:01:10,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2168.15 | bwd: 4622.38 | bwd_inner: 4617.32 | bwd_allreduce: 4.99 | step: 45.10
- 69%|██████▉   | 4026/5800 [11:14:40<3:23:56,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.658669948577881, 'learning_rate': 9.036444506755322e-06, 'epoch': 34.71}
- 69%|██████▉   | 4026/5800 [11:14:40<3:23:56,  6.90s/it]score1 tensor([[0.4980],
-        [0.6367],
-        [0.4180],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6445, 0.4141, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:17,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 20:01:17,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.70 | bwd_microstep: 4568.76 | bwd_inner_microstep: 4563.85 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.63
-[2025-01-25 20:01:17,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.66 | bwd: 4568.79 | bwd_inner: 4563.85 | bwd_allreduce: 4.87 | step: 42.64
- 69%|██████▉   | 4027/5800 [11:14:47<3:23:24,  6.88s/it]                                                        {'loss': 0.0068, 'grad_norm': 1.5311707258224487, 'learning_rate': 9.027105625682473e-06, 'epoch': 34.72}
- 69%|██████▉   | 4027/5800 [11:14:47<3:23:24,  6.88s/it]score1 tensor([[0.5156],
-        [0.4863],
-        [0.4824],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4941, 0.4824, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:24,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 20:01:24,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.33 | bwd_microstep: 4577.12 | bwd_inner_microstep: 4571.91 | bwd_allreduce_microstep: 5.08 | step_microstep: 42.42
-[2025-01-25 20:01:24,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.29 | bwd: 4577.15 | bwd_inner: 4571.91 | bwd_allreduce: 5.15 | step: 42.43
- 69%|██████▉   | 4028/5800 [11:14:54<3:23:08,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.0545380115509033, 'learning_rate': 9.01777016614848e-06, 'epoch': 34.72}
- 69%|██████▉   | 4028/5800 [11:14:54<3:23:08,  6.88s/it]score1 tensor([[0.4707],
-        [0.4258],
-        [0.4531],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4336, 0.4453, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:31,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 20:01:31,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.18 | bwd_microstep: 4638.00 | bwd_inner_microstep: 4633.13 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.08
-[2025-01-25 20:01:31,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.10 | bwd: 4638.02 | bwd_inner: 4633.12 | bwd_allreduce: 4.84 | step: 43.08
- 69%|██████▉   | 4029/5800 [11:15:01<3:23:28,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.7893779277801514, 'learning_rate': 9.008438131064314e-06, 'epoch': 34.73}
- 69%|██████▉   | 4029/5800 [11:15:01<3:23:28,  6.89s/it]score1 tensor([[0.3574],
-        [0.4922],
-        [0.6172],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.4766, 0.6094, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:38,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 20:01:38,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.76 | bwd_microstep: 4621.46 | bwd_inner_microstep: 4615.29 | bwd_allreduce_microstep: 6.05 | step_microstep: 47.94
-[2025-01-25 20:01:38,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.72 | bwd: 4621.48 | bwd_inner: 4615.29 | bwd_allreduce: 6.11 | step: 47.96
- 69%|██████▉   | 4030/5800 [11:15:08<3:23:33,  6.90s/it]                                                        {'loss': 0.0161, 'grad_norm': 7.908998012542725, 'learning_rate': 8.99910952333985e-06, 'epoch': 34.74}
- 69%|██████▉   | 4030/5800 [11:15:08<3:23:33,  6.90s/it]score1 tensor([[0.4863],
-        [0.5625],
-        [0.6133],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5508, 0.6133, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:45,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 20:01:45,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.22 | bwd_microstep: 4584.93 | bwd_inner_microstep: 4580.25 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.78
-[2025-01-25 20:01:45,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.18 | bwd: 4584.96 | bwd_inner: 4580.25 | bwd_allreduce: 4.64 | step: 42.78
- 70%|██████▉   | 4031/5800 [11:15:15<3:23:06,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 1.7822624444961548, 'learning_rate': 8.98978434588393e-06, 'epoch': 34.75}
- 70%|██████▉   | 4031/5800 [11:15:15<3:23:06,  6.89s/it]score1 tensor([[0.4590],
-        [0.5156],
-        [0.5703],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5156, 0.5547, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:51,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 20:01:51,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.52 | bwd_microstep: 4574.46 | bwd_inner_microstep: 4569.23 | bwd_allreduce_microstep: 5.13 | step_microstep: 45.51
-[2025-01-25 20:01:51,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.48 | bwd: 4574.48 | bwd_inner: 4569.23 | bwd_allreduce: 5.18 | step: 45.51
- 70%|██████▉   | 4032/5800 [11:15:21<3:22:41,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.5730628967285156, 'learning_rate': 8.980462601604291e-06, 'epoch': 34.76}
- 70%|██████▉   | 4032/5800 [11:15:21<3:22:41,  6.88s/it]score1 tensor([[0.5586],
-        [0.5078],
-        [0.4473],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4941, 0.4473, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:01:58,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 20:01:58,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.42 | bwd_microstep: 4574.91 | bwd_inner_microstep: 4569.81 | bwd_allreduce_microstep: 5.02 | step_microstep: 44.95
-[2025-01-25 20:01:58,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.36 | bwd: 4574.94 | bwd_inner: 4569.81 | bwd_allreduce: 5.06 | step: 44.97
- 70%|██████▉   | 4033/5800 [11:15:28<3:22:24,  6.87s/it]                                                        {'loss': 0.0117, 'grad_norm': 6.057324409484863, 'learning_rate': 8.971144293407613e-06, 'epoch': 34.77}
- 70%|██████▉   | 4033/5800 [11:15:28<3:22:24,  6.87s/it]score1 tensor([[0.5273],
-        [0.6836],
-        [0.4238],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6641, 0.4316, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:05,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 20:02:05,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.69 | bwd_microstep: 4620.97 | bwd_inner_microstep: 4615.52 | bwd_allreduce_microstep: 5.36 | step_microstep: 44.96
-[2025-01-25 20:02:05,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4621.01 | bwd_inner: 4615.52 | bwd_allreduce: 5.40 | step: 44.99
- 70%|██████▉   | 4034/5800 [11:15:35<3:22:35,  6.88s/it]                                                        {'loss': 0.0117, 'grad_norm': 5.247721195220947, 'learning_rate': 8.96182942419952e-06, 'epoch': 34.78}
- 70%|██████▉   | 4034/5800 [11:15:35<3:22:35,  6.88s/it]score1 tensor([[0.4492],
-        [0.5781],
-        [0.6484],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.5742, 0.6406, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:12,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 20:02:12,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.99 | bwd_microstep: 4626.65 | bwd_inner_microstep: 4621.29 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.05
-[2025-01-25 20:02:12,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.95 | bwd: 4626.67 | bwd_inner: 4621.29 | bwd_allreduce: 5.29 | step: 45.07
- 70%|██████▉   | 4035/5800 [11:15:42<3:22:50,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.8399896025657654, 'learning_rate': 8.952517996884532e-06, 'epoch': 34.78}
- 70%|██████▉   | 4035/5800 [11:15:42<3:22:50,  6.90s/it]score1 tensor([[0.4727],
-        [0.4863],
-        [0.5508],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4941, 0.5508, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:19,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 20:02:19,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.55 | bwd_microstep: 4585.90 | bwd_inner_microstep: 4580.48 | bwd_allreduce_microstep: 5.30 | step_microstep: 44.46
-[2025-01-25 20:02:19,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4585.93 | bwd_inner: 4580.48 | bwd_allreduce: 5.36 | step: 44.47
- 70%|██████▉   | 4036/5800 [11:15:49<3:22:29,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 5.998133182525635, 'learning_rate': 8.943210014366131e-06, 'epoch': 34.79}
- 70%|██████▉   | 4036/5800 [11:15:49<3:22:29,  6.89s/it]score1 tensor([[0.4629],
-        [0.4199],
-        [0.4824],
-        [0.3652]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4277, 0.4844, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:26,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 20:02:26,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.80 | bwd_microstep: 4584.52 | bwd_inner_microstep: 4579.66 | bwd_allreduce_microstep: 4.77 | step_microstep: 47.30
-[2025-01-25 20:02:26,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.76 | bwd: 4584.55 | bwd_inner: 4579.66 | bwd_allreduce: 4.82 | step: 47.33
- 70%|██████▉   | 4037/5800 [11:15:56<3:22:16,  6.88s/it]                                                        {'loss': 0.0039, 'grad_norm': 5.698099136352539, 'learning_rate': 8.933905479546698e-06, 'epoch': 34.8}
- 70%|██████▉   | 4037/5800 [11:15:56<3:22:16,  6.88s/it]score1 tensor([[0.4746],
-        [0.4551],
-        [0.4941],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4277, 0.4922, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:33,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 20:02:33,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.78 | bwd_microstep: 4626.77 | bwd_inner_microstep: 4621.79 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.36
-[2025-01-25 20:02:33,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.75 | bwd: 4626.79 | bwd_inner: 4621.79 | bwd_allreduce: 4.94 | step: 43.37
- 70%|██████▉   | 4038/5800 [11:16:03<3:22:26,  6.89s/it]                                                        {'loss': 0.0122, 'grad_norm': 4.00144624710083, 'learning_rate': 8.924604395327546e-06, 'epoch': 34.81}
- 70%|██████▉   | 4038/5800 [11:16:03<3:22:26,  6.89s/it]score1 tensor([[0.5234],
-        [0.4238],
-        [0.5625],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4297, 0.5664, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:40,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 20:02:40,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.29 | bwd_microstep: 4624.82 | bwd_inner_microstep: 4619.76 | bwd_allreduce_microstep: 4.96 | step_microstep: 42.90
-[2025-01-25 20:02:40,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4624.84 | bwd_inner: 4619.76 | bwd_allreduce: 5.01 | step: 42.91
- 70%|██████▉   | 4039/5800 [11:16:10<3:22:30,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.155384540557861, 'learning_rate': 8.915306764608924e-06, 'epoch': 34.82}
- 70%|██████▉   | 4039/5800 [11:16:10<3:22:30,  6.90s/it]score1 tensor([[0.6016],
-        [0.5430],
-        [0.3496],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5586, 0.3691, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:47,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 20:02:47,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.23 | bwd_microstep: 4630.94 | bwd_inner_microstep: 4624.91 | bwd_allreduce_microstep: 5.91 | step_microstep: 46.19
-[2025-01-25 20:02:47,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.19 | bwd: 4630.97 | bwd_inner: 4624.91 | bwd_allreduce: 5.98 | step: 46.21
- 70%|██████▉   | 4040/5800 [11:16:17<3:22:32,  6.91s/it]                                                        {'loss': 0.0127, 'grad_norm': 4.006852626800537, 'learning_rate': 8.90601259028998e-06, 'epoch': 34.83}
- 70%|██████▉   | 4040/5800 [11:16:17<3:22:32,  6.91s/it]score1 tensor([[0.4590],
-        [0.4629],
-        [0.5586],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4805, 0.5625, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:02:54,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 20:02:54,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.01 | bwd_microstep: 4631.66 | bwd_inner_microstep: 4626.84 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.85
-[2025-01-25 20:02:54,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.96 | bwd: 4631.68 | bwd_inner: 4626.84 | bwd_allreduce: 4.77 | step: 43.86
- 70%|██████▉   | 4041/5800 [11:16:24<3:22:31,  6.91s/it]                                                        {'loss': 0.0181, 'grad_norm': 8.209949493408203, 'learning_rate': 8.896721875268817e-06, 'epoch': 34.84}
- 70%|██████▉   | 4041/5800 [11:16:24<3:22:31,  6.91s/it]score1 tensor([[0.4043],
-        [0.3594],
-        [0.6289],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3906, 0.2812, 0.6211, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0303, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:00,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 20:03:00,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4623.68 | bwd_inner_microstep: 4618.94 | bwd_allreduce_microstep: 4.64 | step_microstep: 45.00
-[2025-01-25 20:03:00,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.47 | bwd: 4623.70 | bwd_inner: 4618.94 | bwd_allreduce: 4.69 | step: 45.00
- 70%|██████▉   | 4042/5800 [11:16:30<3:22:29,  6.91s/it]                                                        {'loss': 0.0303, 'grad_norm': 3.817770481109619, 'learning_rate': 8.887434622442426e-06, 'epoch': 34.84}
- 70%|██████▉   | 4042/5800 [11:16:30<3:22:29,  6.91s/it]score1 tensor([[0.3281],
-        [0.5625],
-        [0.5664],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.5469, 0.6055, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:07,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 20:03:07,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.51 | bwd_microstep: 4632.02 | bwd_inner_microstep: 4626.55 | bwd_allreduce_microstep: 5.38 | step_microstep: 47.60
-[2025-01-25 20:03:07,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.47 | bwd: 4632.04 | bwd_inner: 4626.54 | bwd_allreduce: 5.43 | step: 47.61
- 70%|██████▉   | 4043/5800 [11:16:37<3:22:27,  6.91s/it]                                                        {'loss': 0.0239, 'grad_norm': 3.466731548309326, 'learning_rate': 8.878150834706745e-06, 'epoch': 34.85}
- 70%|██████▉   | 4043/5800 [11:16:37<3:22:27,  6.91s/it]score1 tensor([[0.4414],
-        [0.3984],
-        [0.4922],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4043, 0.4941, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:14,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 20:03:14,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.57 | bwd_microstep: 4627.88 | bwd_inner_microstep: 4622.94 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.66
-[2025-01-25 20:03:14,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.51 | bwd: 4627.93 | bwd_inner: 4622.94 | bwd_allreduce: 4.89 | step: 44.66
- 70%|██████▉   | 4044/5800 [11:16:44<3:22:19,  6.91s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.8688278198242188, 'learning_rate': 8.868870514956613e-06, 'epoch': 34.86}
- 70%|██████▉   | 4044/5800 [11:16:44<3:22:19,  6.91s/it]score1 tensor([[0.5000],
-        [0.4180],
-        [0.3672],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4043, 0.3555, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:21,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 20:03:21,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.10 | bwd_microstep: 4630.24 | bwd_inner_microstep: 4625.16 | bwd_allreduce_microstep: 5.00 | step_microstep: 47.32
-[2025-01-25 20:03:21,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4630.27 | bwd_inner: 4625.16 | bwd_allreduce: 5.04 | step: 47.33
- 70%|██████▉   | 4045/5800 [11:16:51<3:22:14,  6.91s/it]                                                        {'loss': 0.0093, 'grad_norm': 1.0480271577835083, 'learning_rate': 8.859593666085791e-06, 'epoch': 34.87}
- 70%|██████▉   | 4045/5800 [11:16:51<3:22:14,  6.91s/it]score1 tensor([[0.4746],
-        [0.5078],
-        [0.7070],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4844, 0.7070, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:28,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 20:03:28,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.23 | bwd_microstep: 4578.06 | bwd_inner_microstep: 4572.49 | bwd_allreduce_microstep: 5.46 | step_microstep: 42.90
-[2025-01-25 20:03:28,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.20 | bwd: 4578.09 | bwd_inner: 4572.49 | bwd_allreduce: 5.51 | step: 42.90
- 70%|██████▉   | 4046/5800 [11:16:58<3:21:34,  6.90s/it]                                                        {'loss': 0.0117, 'grad_norm': 6.291990756988525, 'learning_rate': 8.850320290986973e-06, 'epoch': 34.88}
- 70%|██████▉   | 4046/5800 [11:16:58<3:21:34,  6.90s/it]score1 tensor([[0.4980],
-        [0.4570],
-        [0.5938],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4512, 0.6016, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:35,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 20:03:35,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.59 | bwd_microstep: 4628.92 | bwd_inner_microstep: 4623.16 | bwd_allreduce_microstep: 5.65 | step_microstep: 46.83
-[2025-01-25 20:03:35,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.56 | bwd: 4628.94 | bwd_inner: 4623.16 | bwd_allreduce: 5.71 | step: 46.83
- 70%|██████▉   | 4047/5800 [11:17:05<3:21:40,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.46187081933021545, 'learning_rate': 8.841050392551747e-06, 'epoch': 34.89}
- 70%|██████▉   | 4047/5800 [11:17:05<3:21:40,  6.90s/it]score1 tensor([[0.4531],
-        [0.6953],
-        [0.6016],
-        [0.3496]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.6797, 0.5820, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0166, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:42,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 20:03:42,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.89 | bwd_microstep: 4633.95 | bwd_inner_microstep: 4628.58 | bwd_allreduce_microstep: 5.30 | step_microstep: 46.62
-[2025-01-25 20:03:42,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.83 | bwd: 4633.98 | bwd_inner: 4628.58 | bwd_allreduce: 5.34 | step: 46.63
- 70%|██████▉   | 4048/5800 [11:17:12<3:21:43,  6.91s/it]                                                        {'loss': 0.0166, 'grad_norm': 8.270792007446289, 'learning_rate': 8.831783973670638e-06, 'epoch': 34.9}
- 70%|██████▉   | 4048/5800 [11:17:12<3:21:43,  6.91s/it]score1 tensor([[0.5430],
-        [0.5508],
-        [0.4609],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5508, 0.4492, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:49,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 20:03:49,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.33 | bwd_microstep: 4580.06 | bwd_inner_microstep: 4574.77 | bwd_allreduce_microstep: 5.17 | step_microstep: 42.82
-[2025-01-25 20:03:49,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.31 | bwd: 4580.08 | bwd_inner: 4574.77 | bwd_allreduce: 5.22 | step: 42.83
- 70%|██████▉   | 4049/5800 [11:17:19<3:21:17,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 6.055781364440918, 'learning_rate': 8.822521037233074e-06, 'epoch': 34.91}
- 70%|██████▉   | 4049/5800 [11:17:19<3:21:17,  6.90s/it]score1 tensor([[0.4453],
-        [0.5898],
-        [0.3809],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5664, 0.3789, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:03:56,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 20:03:56,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4623.35 | bwd_inner_microstep: 4618.72 | bwd_allreduce_microstep: 4.54 | step_microstep: 44.18
-[2025-01-25 20:03:56,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.43 | bwd: 4623.37 | bwd_inner: 4618.72 | bwd_allreduce: 4.59 | step: 44.19
- 70%|██████▉   | 4050/5800 [11:17:26<3:21:12,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 4.038583278656006, 'learning_rate': 8.813261586127386e-06, 'epoch': 34.91}
- 70%|██████▉   | 4050/5800 [11:17:26<3:21:12,  6.90s/it]score1 tensor([[0.5000],
-        [0.6133],
-        [0.5703],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6133, 0.5664, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:03,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 20:04:03,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.17 | bwd_microstep: 4570.17 | bwd_inner_microstep: 4565.34 | bwd_allreduce_microstep: 4.75 | step_microstep: 44.97
-[2025-01-25 20:04:03,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.13 | bwd: 4570.19 | bwd_inner: 4565.34 | bwd_allreduce: 4.79 | step: 45.04
- 70%|██████▉   | 4051/5800 [11:17:32<3:20:44,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 6.314655303955078, 'learning_rate': 8.804005623240852e-06, 'epoch': 34.92}
- 70%|██████▉   | 4051/5800 [11:17:32<3:20:44,  6.89s/it]score1 tensor([[0.5898],
-        [0.4082],
-        [0.6562],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4062, 0.6445, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:09,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 20:04:09,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4627.88 | bwd_inner_microstep: 4622.98 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.00
-[2025-01-25 20:04:09,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4627.91 | bwd_inner: 4622.98 | bwd_allreduce: 4.84 | step: 43.01
- 70%|██████▉   | 4052/5800 [11:17:39<3:20:48,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 4.630712985992432, 'learning_rate': 8.794753151459626e-06, 'epoch': 34.93}
- 70%|██████▉   | 4052/5800 [11:17:39<3:20:48,  6.89s/it]score1 tensor([[0.5781],
-        [0.5938],
-        [0.5352],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.5938, 0.5156, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:16,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 20:04:16,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.16 | bwd_microstep: 4576.78 | bwd_inner_microstep: 4572.36 | bwd_allreduce_microstep: 4.33 | step_microstep: 42.88
-[2025-01-25 20:04:16,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.13 | bwd: 4576.80 | bwd_inner: 4572.36 | bwd_allreduce: 4.38 | step: 42.89
- 70%|██████▉   | 4053/5800 [11:17:46<3:20:22,  6.88s/it]                                                        {'loss': 0.0083, 'grad_norm': 6.208820819854736, 'learning_rate': 8.7855041736688e-06, 'epoch': 34.94}
- 70%|██████▉   | 4053/5800 [11:17:46<3:20:22,  6.88s/it]score1 tensor([[0.5352],
-        [0.6602],
-        [0.4062],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6602, 0.4160, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:23,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:04:23,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4577.04 | bwd_inner_microstep: 4572.25 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.07
-[2025-01-25 20:04:23,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.37 | bwd: 4577.07 | bwd_inner: 4572.25 | bwd_allreduce: 4.74 | step: 43.08
- 70%|██████▉   | 4054/5800 [11:17:53<3:20:03,  6.87s/it]                                                        {'loss': 0.0068, 'grad_norm': 5.627439975738525, 'learning_rate': 8.776258692752355e-06, 'epoch': 34.95}
- 70%|██████▉   | 4054/5800 [11:17:53<3:20:03,  6.87s/it]score1 tensor([[0.4062],
-        [0.6250],
-        [0.4336],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.6211, 0.4570, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:30,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 20:04:30,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4619.77 | bwd_inner_microstep: 4614.97 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.35
-[2025-01-25 20:04:30,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4619.80 | bwd_inner: 4614.97 | bwd_allreduce: 4.75 | step: 42.36
- 70%|██████▉   | 4055/5800 [11:18:00<3:20:12,  6.88s/it]                                                        {'loss': 0.0103, 'grad_norm': 3.584326982498169, 'learning_rate': 8.767016711593204e-06, 'epoch': 34.96}
- 70%|██████▉   | 4055/5800 [11:18:00<3:20:12,  6.88s/it]score1 tensor([[0.6211],
-        [0.4941],
-        [0.4727],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.4902, 0.4551, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:37,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 20:04:37,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4628.01 | bwd_inner_microstep: 4623.15 | bwd_allreduce_microstep: 4.79 | step_microstep: 47.13
-[2025-01-25 20:04:37,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4628.03 | bwd_inner: 4623.15 | bwd_allreduce: 4.82 | step: 47.15
- 70%|██████▉   | 4056/5800 [11:18:07<3:20:22,  6.89s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.829006552696228, 'learning_rate': 8.757778233073153e-06, 'epoch': 34.97}
- 70%|██████▉   | 4056/5800 [11:18:07<3:20:22,  6.89s/it]score1 tensor([[0.4551],
-        [0.6055],
-        [0.4102],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.5938, 0.4082, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:44,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 20:04:44,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.57 | bwd_microstep: 4638.73 | bwd_inner_microstep: 4634.01 | bwd_allreduce_microstep: 4.62 | step_microstep: 42.79
-[2025-01-25 20:04:44,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4638.76 | bwd_inner: 4634.01 | bwd_allreduce: 4.67 | step: 42.79
- 70%|██████▉   | 4057/5800 [11:18:14<3:20:25,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 4.253370761871338, 'learning_rate': 8.748543260072912e-06, 'epoch': 34.97}
- 70%|██████▉   | 4057/5800 [11:18:14<3:20:25,  6.90s/it]score1 tensor([[0.4805],
-        [0.3965],
-        [0.3516],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4004, 0.3672, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:51,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.34 | optimizer_step: 4.36
-[2025-01-25 20:04:51,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.00 | bwd_microstep: 4640.50 | bwd_inner_microstep: 4635.37 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.16
-[2025-01-25 20:04:51,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.98 | bwd: 4640.53 | bwd_inner: 4635.37 | bwd_allreduce: 5.09 | step: 45.17
- 70%|██████▉   | 4058/5800 [11:18:21<3:20:32,  6.91s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.4562630653381348, 'learning_rate': 8.739311795472112e-06, 'epoch': 34.98}
- 70%|██████▉   | 4058/5800 [11:18:21<3:20:32,  6.91s/it]score1 tensor([[0.6836],
-        [0.6328],
-        [0.6602],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.6484, 0.6641, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:04:58,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 20:04:58,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.46 | bwd_microstep: 4584.56 | bwd_inner_microstep: 4579.71 | bwd_allreduce_microstep: 4.75 | step_microstep: 46.99
-[2025-01-25 20:04:58,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.40 | bwd: 4584.58 | bwd_inner: 4579.71 | bwd_allreduce: 4.80 | step: 47.00
- 70%|██████▉   | 4059/5800 [11:18:28<3:20:08,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.794659614562988, 'learning_rate': 8.73008384214929e-06, 'epoch': 34.99}
- 70%|██████▉   | 4059/5800 [11:18:28<3:20:08,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:05:02,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 20:05:02,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 576.05 | bwd_microstep: 1221.69 | bwd_inner_microstep: 1216.89 | bwd_allreduce_microstep: 4.70 | step_microstep: 43.19
-[2025-01-25 20:05:02,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 576.01 | bwd: 1221.71 | bwd_inner: 1216.89 | bwd_allreduce: 4.74 | step: 43.19
- 70%|███████   | 4060/5800 [11:18:32<2:59:08,  6.18s/it]                                                        {'loss': 0.0039, 'grad_norm': 8.605040550231934, 'learning_rate': 8.72085940298187e-06, 'epoch': 35.0}
- 70%|███████   | 4060/5800 [11:18:32<2:59:08,  6.18s/it][2025-01-25 20:05:07,111] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:05:17,503] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:05:27,842] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:05:38,089] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4473],
-        [0.3633],
-        [0.6484],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.3711, 0.6445, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:05:57,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.37
-[2025-01-25 20:05:57,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2139.19 | bwd_microstep: 4598.72 | bwd_inner_microstep: 4593.71 | bwd_allreduce_microstep: 4.93 | step_microstep: 44.55
-[2025-01-25 20:05:57,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2139.16 | bwd: 4598.74 | bwd_inner: 4593.71 | bwd_allreduce: 4.96 | step: 44.56
- 70%|███████   | 4061/5800 [11:19:27<10:06:22, 20.92s/it]                                                         {'loss': 0.0088, 'grad_norm': 0.6350886821746826, 'learning_rate': 8.711638480846205e-06, 'epoch': 35.01}
- 70%|███████   | 4061/5800 [11:19:27<10:06:22, 20.92s/it]score1 tensor([[0.5195],
-        [0.3223],
-        [0.4512],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.3438, 0.4629, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:04,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 20:06:04,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.72 | bwd_microstep: 4577.46 | bwd_inner_microstep: 4572.47 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.03
-[2025-01-25 20:06:04,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.68 | bwd: 4577.48 | bwd_inner: 4572.47 | bwd_allreduce: 4.94 | step: 45.04
- 70%|███████   | 4062/5800 [11:19:34<8:03:32, 16.69s/it]                                                         {'loss': 0.0127, 'grad_norm': 7.4221296310424805, 'learning_rate': 8.702421078617525e-06, 'epoch': 35.02}
- 70%|███████   | 4062/5800 [11:19:34<8:03:32, 16.69s/it]score1 tensor([[0.5000],
-        [0.3535],
-        [0.4375],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.3691, 0.4473, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:11,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.36
-[2025-01-25 20:06:11,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.74 | bwd_microstep: 4584.74 | bwd_inner_microstep: 4579.84 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.02
-[2025-01-25 20:06:11,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.71 | bwd: 4584.77 | bwd_inner: 4579.84 | bwd_allreduce: 4.86 | step: 43.03
- 70%|███████   | 4063/5800 [11:19:41<6:37:38, 13.74s/it]                                                        {'loss': 0.0083, 'grad_norm': 3.618438959121704, 'learning_rate': 8.693207199169988e-06, 'epoch': 35.03}
- 70%|███████   | 4063/5800 [11:19:41<6:37:38, 13.74s/it]score1 tensor([[0.4688],
-        [0.4082],
-        [0.4980],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.4180, 0.5000, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:18,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.31 | optimizer_step: 4.37
-[2025-01-25 20:06:18,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.23 | bwd_microstep: 4600.82 | bwd_inner_microstep: 4595.86 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.78
-[2025-01-25 20:06:18,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.20 | bwd: 4600.85 | bwd_inner: 4595.86 | bwd_allreduce: 4.90 | step: 44.78
- 70%|███████   | 4064/5800 [11:19:48<5:37:48, 11.68s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.6359241008758545, 'learning_rate': 8.68399684537663e-06, 'epoch': 35.03}
- 70%|███████   | 4064/5800 [11:19:48<5:37:48, 11.68s/it]score1 tensor([[0.4531],
-        [0.4453],
-        [0.5430],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4492, 0.5586, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:25,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:06:25,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.94 | bwd_microstep: 4600.50 | bwd_inner_microstep: 4595.70 | bwd_allreduce_microstep: 4.71 | step_microstep: 49.40
-[2025-01-25 20:06:25,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.91 | bwd: 4600.52 | bwd_inner: 4595.70 | bwd_allreduce: 4.74 | step: 49.41
- 70%|███████   | 4065/5800 [11:19:55<4:55:56, 10.23s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.100090503692627, 'learning_rate': 8.674790020109414e-06, 'epoch': 35.04}
- 70%|███████   | 4065/5800 [11:19:55<4:55:56, 10.23s/it]score1 tensor([[0.5547],
-        [0.6562],
-        [0.4785],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6445, 0.4648, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:32,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 20:06:32,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.49 | bwd_microstep: 4558.32 | bwd_inner_microstep: 4553.07 | bwd_allreduce_microstep: 5.16 | step_microstep: 44.73
-[2025-01-25 20:06:32,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.44 | bwd: 4558.35 | bwd_inner: 4553.07 | bwd_allreduce: 5.21 | step: 44.74
- 70%|███████   | 4066/5800 [11:20:02<4:26:14,  9.21s/it]                                                        {'loss': 0.0073, 'grad_norm': 6.509937763214111, 'learning_rate': 8.66558672623917e-06, 'epoch': 35.05}
- 70%|███████   | 4066/5800 [11:20:02<4:26:14,  9.21s/it]score1 tensor([[0.6133],
-        [0.5117],
-        [0.5039],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4980, 0.4863, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:39,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 20:06:39,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.49 | bwd_microstep: 4550.66 | bwd_inner_microstep: 4545.44 | bwd_allreduce_microstep: 5.11 | step_microstep: 45.07
-[2025-01-25 20:06:39,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.45 | bwd: 4550.68 | bwd_inner: 4545.44 | bwd_allreduce: 5.16 | step: 45.08
- 70%|███████   | 4067/5800 [11:20:09<4:05:25,  8.50s/it]                                                        {'loss': 0.0107, 'grad_norm': 6.291415691375732, 'learning_rate': 8.656386966635664e-06, 'epoch': 35.06}
- 70%|███████   | 4067/5800 [11:20:09<4:05:25,  8.50s/it]score1 tensor([[0.4805],
-        [0.5312],
-        [0.5469],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.5391, 0.5664, 0.4434], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:45,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 20:06:45,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.02 | bwd_microstep: 4598.65 | bwd_inner_microstep: 4593.69 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.53
-[2025-01-25 20:06:45,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.98 | bwd: 4598.68 | bwd_inner: 4593.69 | bwd_allreduce: 4.91 | step: 44.53
- 70%|███████   | 4068/5800 [11:20:15<3:51:13,  8.01s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.286492347717285, 'learning_rate': 8.647190744167528e-06, 'epoch': 35.07}
- 70%|███████   | 4068/5800 [11:20:15<3:51:13,  8.01s/it]score1 tensor([[0.4902],
-        [0.5742],
-        [0.5742],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5664, 0.5625, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:52,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 20:06:52,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.05 | bwd_microstep: 4607.00 | bwd_inner_microstep: 4602.09 | bwd_allreduce_microstep: 4.82 | step_microstep: 44.14
-[2025-01-25 20:06:52,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.02 | bwd: 4607.02 | bwd_inner: 4602.09 | bwd_allreduce: 4.86 | step: 44.15
- 70%|███████   | 4069/5800 [11:20:22<3:41:18,  7.67s/it]                                                        {'loss': 0.0088, 'grad_norm': 8.359514236450195, 'learning_rate': 8.6379980617023e-06, 'epoch': 35.08}
- 70%|███████   | 4069/5800 [11:20:22<3:41:18,  7.67s/it]score1 tensor([[0.6016],
-        [0.5508],
-        [0.5078],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5352, 0.4844, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:06:59,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 20:06:59,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.51 | bwd_microstep: 4550.84 | bwd_inner_microstep: 4545.50 | bwd_allreduce_microstep: 5.26 | step_microstep: 45.16
-[2025-01-25 20:06:59,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.48 | bwd: 4550.87 | bwd_inner: 4545.50 | bwd_allreduce: 5.30 | step: 45.16
- 70%|███████   | 4070/5800 [11:20:29<3:33:54,  7.42s/it]                                                        {'loss': 0.0132, 'grad_norm': 2.4031269550323486, 'learning_rate': 8.628808922106432e-06, 'epoch': 35.09}
- 70%|███████   | 4070/5800 [11:20:29<3:33:54,  7.42s/it]score1 tensor([[0.6484],
-        [0.5547],
-        [0.4492],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5586, 0.4355, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:06,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.05 | optimizer_step: 4.37
-[2025-01-25 20:07:06,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.40 | bwd_microstep: 4612.43 | bwd_inner_microstep: 4606.92 | bwd_allreduce_microstep: 5.33 | step_microstep: 46.97
-[2025-01-25 20:07:06,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.37 | bwd: 4612.46 | bwd_inner: 4606.92 | bwd_allreduce: 5.43 | step: 46.98
- 70%|███████   | 4071/5800 [11:20:36<3:29:14,  7.26s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.884791910648346, 'learning_rate': 8.619623328245241e-06, 'epoch': 35.09}
- 70%|███████   | 4071/5800 [11:20:36<3:29:14,  7.26s/it]score1 tensor([[0.5742],
-        [0.4336],
-        [0.6836],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4258, 0.6875, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:13,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.37
-[2025-01-25 20:07:13,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.82 | bwd_microstep: 4610.20 | bwd_inner_microstep: 4605.36 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.38
-[2025-01-25 20:07:13,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.79 | bwd: 4610.23 | bwd_inner: 4605.36 | bwd_allreduce: 4.79 | step: 42.39
- 70%|███████   | 4072/5800 [11:20:43<3:25:50,  7.15s/it]                                                        {'loss': 0.0068, 'grad_norm': 4.796988487243652, 'learning_rate': 8.610441282982971e-06, 'epoch': 35.1}
- 70%|███████   | 4072/5800 [11:20:43<3:25:50,  7.15s/it]score1 tensor([[0.6641],
-        [0.4980],
-        [0.5000],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4785, 0.4941, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:20,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 20:07:20,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.33 | bwd_microstep: 4612.35 | bwd_inner_microstep: 4606.90 | bwd_allreduce_microstep: 5.37 | step_microstep: 41.82
-[2025-01-25 20:07:20,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.30 | bwd: 4612.38 | bwd_inner: 4606.90 | bwd_allreduce: 5.41 | step: 41.82
- 70%|███████   | 4073/5800 [11:20:50<3:23:31,  7.07s/it]                                                        {'loss': 0.0146, 'grad_norm': 8.409481048583984, 'learning_rate': 8.60126278918273e-06, 'epoch': 35.11}
- 70%|███████   | 4073/5800 [11:20:50<3:23:31,  7.07s/it]score1 tensor([[0.5352],
-        [0.5430],
-        [0.5195],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5430, 0.5234, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:27,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 20:07:27,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.26 | bwd_microstep: 4567.40 | bwd_inner_microstep: 4562.60 | bwd_allreduce_microstep: 4.72 | step_microstep: 41.87
-[2025-01-25 20:07:27,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.24 | bwd: 4567.43 | bwd_inner: 4562.60 | bwd_allreduce: 4.76 | step: 41.88
- 70%|███████   | 4074/5800 [11:20:57<3:21:21,  7.00s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.9580446481704712, 'learning_rate': 8.59208784970654e-06, 'epoch': 35.12}
- 70%|███████   | 4074/5800 [11:20:57<3:21:21,  7.00s/it]score1 tensor([[0.7305],
-        [0.5391],
-        [0.5156],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7031, 0.5273, 0.5195, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:34,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 20:07:34,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.60 | bwd_microstep: 4617.81 | bwd_inner_microstep: 4613.02 | bwd_allreduce_microstep: 4.72 | step_microstep: 49.42
-[2025-01-25 20:07:34,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.57 | bwd: 4617.84 | bwd_inner: 4613.02 | bwd_allreduce: 4.76 | step: 49.43
- 70%|███████   | 4075/5800 [11:21:04<3:20:28,  6.97s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.706672668457031, 'learning_rate': 8.582916467415305e-06, 'epoch': 35.13}
- 70%|███████   | 4075/5800 [11:21:04<3:20:28,  6.97s/it]score1 tensor([[0.5234],
-        [0.5664],
-        [0.3828],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5625, 0.3750, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:40,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 20:07:40,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.25 | bwd_microstep: 4613.34 | bwd_inner_microstep: 4608.52 | bwd_allreduce_microstep: 4.74 | step_microstep: 45.85
-[2025-01-25 20:07:40,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.21 | bwd: 4613.37 | bwd_inner: 4608.52 | bwd_allreduce: 4.78 | step: 45.85
- 70%|███████   | 4076/5800 [11:21:10<3:19:38,  6.95s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.868483066558838, 'learning_rate': 8.573748645168811e-06, 'epoch': 35.14}
- 70%|███████   | 4076/5800 [11:21:10<3:19:38,  6.95s/it]score1 tensor([[0.4062],
-        [0.3398],
-        [0.5195],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.3340, 0.5078, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:47,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 20:07:47,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.88 | bwd_microstep: 4614.90 | bwd_inner_microstep: 4609.88 | bwd_allreduce_microstep: 4.90 | step_microstep: 43.85
-[2025-01-25 20:07:47,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.83 | bwd: 4614.93 | bwd_inner: 4609.88 | bwd_allreduce: 4.96 | step: 43.85
- 70%|███████   | 4077/5800 [11:21:17<3:19:04,  6.93s/it]                                                        {'loss': 0.0059, 'grad_norm': 7.701883792877197, 'learning_rate': 8.564584385825761e-06, 'epoch': 35.15}
- 70%|███████   | 4077/5800 [11:21:17<3:19:04,  6.93s/it]score1 tensor([[0.5977],
-        [0.3652],
-        [0.4004],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.3398, 0.3887, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:07:54,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 7.82
-[2025-01-25 20:07:54,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.87 | bwd_microstep: 4618.72 | bwd_inner_microstep: 4613.81 | bwd_allreduce_microstep: 4.82 | step_microstep: 46.99
-[2025-01-25 20:07:54,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.84 | bwd: 4618.75 | bwd_inner: 4613.81 | bwd_allreduce: 4.86 | step: 46.99
- 70%|███████   | 4078/5800 [11:21:24<3:18:41,  6.92s/it]                                                        {'loss': 0.021, 'grad_norm': 3.4642059803009033, 'learning_rate': 8.555423692243711e-06, 'epoch': 35.16}
- 70%|███████   | 4078/5800 [11:21:24<3:18:41,  6.92s/it]score1 tensor([[0.3887],
-        [0.4766],
-        [0.5508],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.4668, 0.5508, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:01,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 20:08:01,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.69 | bwd_microstep: 4574.67 | bwd_inner_microstep: 4569.21 | bwd_allreduce_microstep: 5.36 | step_microstep: 46.29
-[2025-01-25 20:08:01,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.65 | bwd: 4574.69 | bwd_inner: 4569.21 | bwd_allreduce: 5.41 | step: 46.30
- 70%|███████   | 4079/5800 [11:21:31<3:17:57,  6.90s/it]                                                        {'loss': 0.021, 'grad_norm': 1.4999667406082153, 'learning_rate': 8.546266567279144e-06, 'epoch': 35.16}
- 70%|███████   | 4079/5800 [11:21:31<3:17:57,  6.90s/it]score1 tensor([[0.4590],
-        [0.3711],
-        [0.4648],
-        [0.6758]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3789, 0.4785, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:08,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 20:08:08,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.27 | bwd_microstep: 4620.63 | bwd_inner_microstep: 4615.91 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.86
-[2025-01-25 20:08:08,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.24 | bwd: 4620.65 | bwd_inner: 4615.91 | bwd_allreduce: 4.68 | step: 42.87
- 70%|███████   | 4080/5800 [11:21:38<3:17:56,  6.91s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.102849006652832, 'learning_rate': 8.5371130137874e-06, 'epoch': 35.17}
- 70%|███████   | 4080/5800 [11:21:38<3:17:56,  6.91s/it]score1 tensor([[0.4062],
-        [0.5117],
-        [0.6484],
-        [0.3164]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5195, 0.6445, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:15,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 20:08:15,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.86 | bwd_microstep: 4617.66 | bwd_inner_microstep: 4612.90 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.48
-[2025-01-25 20:08:15,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.82 | bwd: 4617.68 | bwd_inner: 4612.90 | bwd_allreduce: 4.71 | step: 42.49
- 70%|███████   | 4081/5800 [11:21:45<3:17:45,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.165759325027466, 'learning_rate': 8.52796303462271e-06, 'epoch': 35.18}
- 70%|███████   | 4081/5800 [11:21:45<3:17:45,  6.90s/it]score1 tensor([[0.4258],
-        [0.4980],
-        [0.4590],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5156, 0.4609, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:22,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 20:08:22,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.53 | bwd_microstep: 4562.85 | bwd_inner_microstep: 4557.98 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.17
-[2025-01-25 20:08:22,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.49 | bwd: 4562.88 | bwd_inner: 4557.98 | bwd_allreduce: 4.83 | step: 42.18
- 70%|███████   | 4082/5800 [11:21:52<3:17:09,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 5.938370227813721, 'learning_rate': 8.51881663263821e-06, 'epoch': 35.19}
- 70%|███████   | 4082/5800 [11:21:52<3:17:09,  6.89s/it]score1 tensor([[0.6250],
-        [0.5703],
-        [0.5859],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5898, 0.5781, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:29,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 20:08:29,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.60 | bwd_microstep: 4614.14 | bwd_inner_microstep: 4609.37 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.32
-[2025-01-25 20:08:29,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.56 | bwd: 4614.16 | bwd_inner: 4609.37 | bwd_allreduce: 4.73 | step: 42.33
- 70%|███████   | 4083/5800 [11:21:59<3:17:05,  6.89s/it]                                                        {'loss': 0.0122, 'grad_norm': 0.7781352400779724, 'learning_rate': 8.509673810685893e-06, 'epoch': 35.2}
- 70%|███████   | 4083/5800 [11:21:59<3:17:05,  6.89s/it]score1 tensor([[0.4961],
-        [0.6133],
-        [0.3691],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6172, 0.3750, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:35,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 20:08:35,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.46 | bwd_microstep: 4572.57 | bwd_inner_microstep: 4567.61 | bwd_allreduce_microstep: 4.87 | step_microstep: 42.03
-[2025-01-25 20:08:35,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.44 | bwd: 4572.59 | bwd_inner: 4567.61 | bwd_allreduce: 4.92 | step: 42.04
- 70%|███████   | 4084/5800 [11:22:05<3:16:37,  6.87s/it]                                                        {'loss': 0.0029, 'grad_norm': 5.9812493324279785, 'learning_rate': 8.500534571616663e-06, 'epoch': 35.21}
- 70%|███████   | 4084/5800 [11:22:05<3:16:37,  6.87s/it]score1 tensor([[0.5156],
-        [0.4512],
-        [0.5430],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4570, 0.5508, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:42,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 20:08:42,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.96 | bwd_microstep: 4567.86 | bwd_inner_microstep: 4562.89 | bwd_allreduce_microstep: 4.89 | step_microstep: 40.71
-[2025-01-25 20:08:42,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4567.88 | bwd_inner: 4562.89 | bwd_allreduce: 4.93 | step: 40.72
- 70%|███████   | 4085/5800 [11:22:12<3:16:14,  6.87s/it]                                                        {'loss': 0.0083, 'grad_norm': 5.981919288635254, 'learning_rate': 8.491398918280278e-06, 'epoch': 35.22}
- 70%|███████   | 4085/5800 [11:22:12<3:16:14,  6.87s/it]score1 tensor([[0.5781],
-        [0.4590],
-        [0.4590],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4570, 0.4648, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:49,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 20:08:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.62 | bwd_microstep: 4615.81 | bwd_inner_microstep: 4610.61 | bwd_allreduce_microstep: 5.09 | step_microstep: 45.43
-[2025-01-25 20:08:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.58 | bwd: 4615.83 | bwd_inner: 4610.61 | bwd_allreduce: 5.15 | step: 45.44
- 70%|███████   | 4086/5800 [11:22:19<3:16:19,  6.87s/it]                                                        {'loss': 0.0034, 'grad_norm': 0.6581294536590576, 'learning_rate': 8.482266853525411e-06, 'epoch': 35.22}
- 70%|███████   | 4086/5800 [11:22:19<3:16:19,  6.87s/it]score1 tensor([[0.4961],
-        [0.6367],
-        [0.5117],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6328, 0.5352, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:08:56,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.36
-[2025-01-25 20:08:56,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.77 | bwd_microstep: 4620.98 | bwd_inner_microstep: 4615.38 | bwd_allreduce_microstep: 5.47 | step_microstep: 44.58
-[2025-01-25 20:08:56,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.74 | bwd: 4621.00 | bwd_inner: 4615.38 | bwd_allreduce: 5.54 | step: 44.58
- 70%|███████   | 4087/5800 [11:22:26<3:16:31,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.4833517372608185, 'learning_rate': 8.473138380199587e-06, 'epoch': 35.23}
- 70%|███████   | 4087/5800 [11:22:26<3:16:31,  6.88s/it]score1 tensor([[0.5078],
-        [0.5586],
-        [0.4375],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5625, 0.4531, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:03,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 20:09:03,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.87 | bwd_microstep: 4622.95 | bwd_inner_microstep: 4617.21 | bwd_allreduce_microstep: 5.62 | step_microstep: 43.40
-[2025-01-25 20:09:03,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.83 | bwd: 4622.98 | bwd_inner: 4617.21 | bwd_allreduce: 5.69 | step: 43.41
- 70%|███████   | 4088/5800 [11:22:33<3:16:32,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.7529754638671875, 'learning_rate': 8.464013501149221e-06, 'epoch': 35.24}
- 70%|███████   | 4088/5800 [11:22:33<3:16:32,  6.89s/it]score1 tensor([[0.4902],
-        [0.5625],
-        [0.4688],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5703, 0.4727, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:10,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 20:09:10,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4618.57 | bwd_inner_microstep: 4614.13 | bwd_allreduce_microstep: 4.36 | step_microstep: 41.03
-[2025-01-25 20:09:10,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4618.59 | bwd_inner: 4614.13 | bwd_allreduce: 4.40 | step: 41.04
- 70%|███████   | 4089/5800 [11:22:40<3:16:29,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.4332277774810791, 'learning_rate': 8.454892219219617e-06, 'epoch': 35.25}
- 70%|███████   | 4089/5800 [11:22:40<3:16:29,  6.89s/it]score1 tensor([[0.4609],
-        [0.6172],
-        [0.4805],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.6055, 0.4688, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:17,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.91 | optimizer_step: 4.37
-[2025-01-25 20:09:17,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.05 | bwd_microstep: 4615.80 | bwd_inner_microstep: 4610.73 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.65
-[2025-01-25 20:09:17,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.02 | bwd: 4615.82 | bwd_inner: 4610.73 | bwd_allreduce: 5.00 | step: 42.66
- 71%|███████   | 4090/5800 [11:22:47<3:16:23,  6.89s/it]                                                        {'loss': 0.0107, 'grad_norm': 8.169576644897461, 'learning_rate': 8.445774537254938e-06, 'epoch': 35.26}
- 71%|███████   | 4090/5800 [11:22:47<3:16:23,  6.89s/it]score1 tensor([[0.4531],
-        [0.3047],
-        [0.4648],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3105, 0.4570, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:24,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 20:09:24,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.22 | bwd_microstep: 4617.06 | bwd_inner_microstep: 4612.48 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.86
-[2025-01-25 20:09:24,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.18 | bwd: 4617.09 | bwd_inner: 4612.48 | bwd_allreduce: 4.54 | step: 42.89
- 71%|███████   | 4091/5800 [11:22:54<3:16:18,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 4.5973405838012695, 'learning_rate': 8.436660458098248e-06, 'epoch': 35.27}
- 71%|███████   | 4091/5800 [11:22:54<3:16:18,  6.89s/it]score1 tensor([[0.4980],
-        [0.5156],
-        [0.3730],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5039, 0.3652, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:31,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 20:09:31,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.69 | bwd_microstep: 4621.56 | bwd_inner_microstep: 4616.21 | bwd_allreduce_microstep: 5.25 | step_microstep: 44.07
-[2025-01-25 20:09:31,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.64 | bwd: 4621.58 | bwd_inner: 4616.21 | bwd_allreduce: 5.30 | step: 44.07
- 71%|███████   | 4092/5800 [11:23:01<3:16:16,  6.89s/it]                                                        {'loss': 0.0107, 'grad_norm': 7.919991970062256, 'learning_rate': 8.427549984591461e-06, 'epoch': 35.28}
- 71%|███████   | 4092/5800 [11:23:01<3:16:16,  6.89s/it]score1 tensor([[0.6211],
-        [0.4258],
-        [0.4316],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4316, 0.4277, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:37,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.37
-[2025-01-25 20:09:37,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.20 | bwd_microstep: 4617.95 | bwd_inner_microstep: 4612.92 | bwd_allreduce_microstep: 4.93 | step_microstep: 42.76
-[2025-01-25 20:09:37,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.15 | bwd: 4617.97 | bwd_inner: 4612.92 | bwd_allreduce: 4.97 | step: 42.77
- 71%|███████   | 4093/5800 [11:23:07<3:16:10,  6.90s/it]                                                        {'loss': 0.0112, 'grad_norm': 4.639245986938477, 'learning_rate': 8.418443119575397e-06, 'epoch': 35.28}
- 71%|███████   | 4093/5800 [11:23:07<3:16:10,  6.90s/it]score1 tensor([[0.5625],
-        [0.4922],
-        [0.5586],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4824, 0.5625, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:44,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 20:09:44,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.09 | bwd_microstep: 4619.12 | bwd_inner_microstep: 4614.26 | bwd_allreduce_microstep: 4.79 | step_microstep: 42.19
-[2025-01-25 20:09:44,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.03 | bwd: 4619.14 | bwd_inner: 4614.26 | bwd_allreduce: 4.82 | step: 42.20
- 71%|███████   | 4094/5800 [11:23:14<3:16:08,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.4152262508869171, 'learning_rate': 8.409339865889716e-06, 'epoch': 35.29}
- 71%|███████   | 4094/5800 [11:23:14<3:16:08,  6.90s/it]score1 tensor([[0.5859],
-        [0.4160],
-        [0.4180],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4297, 0.4004, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0220, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:51,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.37
-[2025-01-25 20:09:51,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.20 | bwd_microstep: 4618.91 | bwd_inner_microstep: 4610.17 | bwd_allreduce_microstep: 8.63 | step_microstep: 42.10
-[2025-01-25 20:09:51,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.16 | bwd: 4618.94 | bwd_inner: 4610.17 | bwd_allreduce: 8.69 | step: 42.11
- 71%|███████   | 4095/5800 [11:23:21<3:16:01,  6.90s/it]                                                        {'loss': 0.022, 'grad_norm': 0.6754809021949768, 'learning_rate': 8.400240226372988e-06, 'epoch': 35.3}
- 71%|███████   | 4095/5800 [11:23:21<3:16:01,  6.90s/it]score1 tensor([[0.3926],
-        [0.5664],
-        [0.4316],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5586, 0.4453, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:09:58,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 20:09:58,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.01 | bwd_microstep: 4613.07 | bwd_inner_microstep: 4608.02 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.74
-[2025-01-25 20:09:58,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.98 | bwd: 4613.10 | bwd_inner: 4608.02 | bwd_allreduce: 5.01 | step: 43.75
- 71%|███████   | 4096/5800 [11:23:28<3:15:53,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 0.7439060211181641, 'learning_rate': 8.391144203862625e-06, 'epoch': 35.31}
- 71%|███████   | 4096/5800 [11:23:28<3:15:53,  6.90s/it]score1 tensor([[0.3945],
-        [0.4199],
-        [0.5000],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4180, 0.5039, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:05,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 20:10:05,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.22 | bwd_microstep: 4617.78 | bwd_inner_microstep: 4612.77 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.29
-[2025-01-25 20:10:05,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.19 | bwd: 4617.80 | bwd_inner: 4612.77 | bwd_allreduce: 4.96 | step: 43.29
- 71%|███████   | 4097/5800 [11:23:35<3:15:47,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.35459214448928833, 'learning_rate': 8.382051801194938e-06, 'epoch': 35.32}
- 71%|███████   | 4097/5800 [11:23:35<3:15:47,  6.90s/it]score1 tensor([[0.5938],
-        [0.4727],
-        [0.4375],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.4785, 0.4434, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:12,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 20:10:12,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.50 | bwd_microstep: 4563.15 | bwd_inner_microstep: 4558.06 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.65
-[2025-01-25 20:10:12,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.47 | bwd: 4563.17 | bwd_inner: 4558.06 | bwd_allreduce: 5.04 | step: 43.65
- 71%|███████   | 4098/5800 [11:23:42<3:15:18,  6.88s/it]                                                        {'loss': 0.0054, 'grad_norm': 5.791418075561523, 'learning_rate': 8.37296302120508e-06, 'epoch': 35.33}
- 71%|███████   | 4098/5800 [11:23:42<3:15:18,  6.88s/it]score1 tensor([[0.4512],
-        [0.4395],
-        [0.5781],
-        [0.3867]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4609, 0.5703, 0.3867], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:19,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 20:10:19,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.48 | bwd_microstep: 4572.57 | bwd_inner_microstep: 4567.36 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.47
-[2025-01-25 20:10:19,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.45 | bwd: 4572.62 | bwd_inner: 4567.36 | bwd_allreduce: 5.15 | step: 43.47
- 71%|███████   | 4099/5800 [11:23:49<3:14:57,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.2767698764801025, 'learning_rate': 8.363877866727105e-06, 'epoch': 35.34}
- 71%|███████   | 4099/5800 [11:23:49<3:14:57,  6.88s/it]score1 tensor([[0.4844],
-        [0.3789],
-        [0.3340],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.3945, 0.3555, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:26,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 20:10:26,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.39 | bwd_microstep: 4625.11 | bwd_inner_microstep: 4619.93 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.64
-[2025-01-25 20:10:26,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.36 | bwd: 4625.13 | bwd_inner: 4619.93 | bwd_allreduce: 5.14 | step: 44.65
- 71%|███████   | 4100/5800 [11:23:56<3:15:09,  6.89s/it]                                                        {'loss': 0.0142, 'grad_norm': 3.6168599128723145, 'learning_rate': 8.354796340593909e-06, 'epoch': 35.34}
- 71%|███████   | 4100/5800 [11:23:56<3:15:09,  6.89s/it]score1 tensor([[0.6250],
-        [0.6250],
-        [0.4277],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.6172, 0.4395, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:33,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 20:10:33,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.56 | bwd_microstep: 4569.17 | bwd_inner_microstep: 4563.82 | bwd_allreduce_microstep: 5.27 | step_microstep: 43.35
-[2025-01-25 20:10:33,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.52 | bwd: 4569.19 | bwd_inner: 4563.83 | bwd_allreduce: 5.31 | step: 43.36
- 71%|███████   | 4101/5800 [11:24:03<3:14:43,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.6123864650726318, 'learning_rate': 8.345718445637286e-06, 'epoch': 35.35}
- 71%|███████   | 4101/5800 [11:24:03<3:14:43,  6.88s/it]score1 tensor([[0.6641],
-        [0.5703],
-        [0.3672],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5547, 0.3730, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:39,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 20:10:39,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.61 | bwd_microstep: 4623.39 | bwd_inner_microstep: 4617.95 | bwd_allreduce_microstep: 5.33 | step_microstep: 44.08
-[2025-01-25 20:10:39,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.57 | bwd: 4623.41 | bwd_inner: 4617.95 | bwd_allreduce: 5.39 | step: 44.08
- 71%|███████   | 4102/5800 [11:24:09<3:14:49,  6.88s/it]                                                        {'loss': 0.0181, 'grad_norm': 5.171261787414551, 'learning_rate': 8.33664418468786e-06, 'epoch': 35.36}
- 71%|███████   | 4102/5800 [11:24:09<3:14:49,  6.88s/it]score1 tensor([[0.6016],
-        [0.5742],
-        [0.5625],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5664, 0.5430, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:46,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 20:10:46,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4625.16 | bwd_inner_microstep: 4619.95 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.67
-[2025-01-25 20:10:46,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4625.21 | bwd_inner: 4619.95 | bwd_allreduce: 5.16 | step: 44.67
- 71%|███████   | 4103/5800 [11:24:16<3:14:55,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.6502920985221863, 'learning_rate': 8.327573560575162e-06, 'epoch': 35.37}
- 71%|███████   | 4103/5800 [11:24:16<3:14:55,  6.89s/it]score1 tensor([[0.4902],
-        [0.4512],
-        [0.5898],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4629, 0.5820, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:10:53,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 20:10:53,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.95 | bwd_microstep: 4616.62 | bwd_inner_microstep: 4611.23 | bwd_allreduce_microstep: 5.28 | step_microstep: 44.06
-[2025-01-25 20:10:53,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.92 | bwd: 4616.65 | bwd_inner: 4611.23 | bwd_allreduce: 5.34 | step: 44.06
- 71%|███████   | 4104/5800 [11:24:23<3:14:56,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.46532273292541504, 'learning_rate': 8.318506576127557e-06, 'epoch': 35.38}
- 71%|███████   | 4104/5800 [11:24:23<3:14:56,  6.90s/it]score1 tensor([[0.4414],
-        [0.6250],
-        [0.5352],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6172, 0.5312, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:00,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.40 | optimizer_step: 4.36
-[2025-01-25 20:11:00,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.11 | bwd_microstep: 4618.30 | bwd_inner_microstep: 4613.23 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.09
-[2025-01-25 20:11:00,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.06 | bwd: 4618.32 | bwd_inner: 4613.23 | bwd_allreduce: 5.02 | step: 45.10
- 71%|███████   | 4105/5800 [11:24:30<3:14:51,  6.90s/it]                                                        {'loss': 0.0107, 'grad_norm': 4.5233154296875, 'learning_rate': 8.309443234172297e-06, 'epoch': 35.39}
- 71%|███████   | 4105/5800 [11:24:30<3:14:51,  6.90s/it]score1 tensor([[0.4707],
-        [0.5039],
-        [0.2217],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.5000, 0.1787, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:07,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.36
-[2025-01-25 20:11:07,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.82 | bwd_microstep: 4625.45 | bwd_inner_microstep: 4620.37 | bwd_allreduce_microstep: 4.99 | step_microstep: 42.35
-[2025-01-25 20:11:07,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.78 | bwd: 4625.47 | bwd_inner: 4620.37 | bwd_allreduce: 5.03 | step: 42.36
- 71%|███████   | 4106/5800 [11:24:37<3:14:50,  6.90s/it]                                                        {'loss': 0.0186, 'grad_norm': 1.3801491260528564, 'learning_rate': 8.300383537535485e-06, 'epoch': 35.4}
- 71%|███████   | 4106/5800 [11:24:37<3:14:50,  6.90s/it]score1 tensor([[0.5117],
-        [0.4590],
-        [0.4453],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4512, 0.4414, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:14,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 20:11:14,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.83 | bwd_microstep: 4624.32 | bwd_inner_microstep: 4619.26 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.07
-[2025-01-25 20:11:14,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.80 | bwd: 4624.34 | bwd_inner: 4619.26 | bwd_allreduce: 5.01 | step: 43.08
- 71%|███████   | 4107/5800 [11:24:44<3:14:44,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.5309202075004578, 'learning_rate': 8.291327489042089e-06, 'epoch': 35.41}
- 71%|███████   | 4107/5800 [11:24:44<3:14:44,  6.90s/it]score1 tensor([[0.4805],
-        [0.4023],
-        [0.5039],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4023, 0.5117, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:21,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 20:11:21,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.35 | bwd_microstep: 4565.64 | bwd_inner_microstep: 4560.61 | bwd_allreduce_microstep: 4.95 | step_microstep: 43.79
-[2025-01-25 20:11:21,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.32 | bwd: 4565.66 | bwd_inner: 4560.61 | bwd_allreduce: 4.99 | step: 43.80
- 71%|███████   | 4108/5800 [11:24:51<3:14:11,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 5.88678503036499, 'learning_rate': 8.28227509151595e-06, 'epoch': 35.41}
- 71%|███████   | 4108/5800 [11:24:51<3:14:11,  6.89s/it]score1 tensor([[0.6719],
-        [0.4531],
-        [0.3750],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.4590, 0.3652, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:28,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 20:11:28,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.72 | bwd_microstep: 4618.51 | bwd_inner_microstep: 4613.23 | bwd_allreduce_microstep: 5.14 | step_microstep: 42.73
-[2025-01-25 20:11:28,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.69 | bwd: 4618.54 | bwd_inner: 4613.23 | bwd_allreduce: 5.21 | step: 42.74
- 71%|███████   | 4109/5800 [11:24:58<3:14:10,  6.89s/it]                                                        {'loss': 0.0098, 'grad_norm': 0.6240377426147461, 'learning_rate': 8.27322634777975e-06, 'epoch': 35.42}
- 71%|███████   | 4109/5800 [11:24:58<3:14:10,  6.89s/it]score1 tensor([[0.6484],
-        [0.4434],
-        [0.4492],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.4336, 0.4473, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:35,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 20:11:35,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.28 | bwd_microstep: 4626.94 | bwd_inner_microstep: 4621.87 | bwd_allreduce_microstep: 4.97 | step_microstep: 46.00
-[2025-01-25 20:11:35,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.25 | bwd: 4626.96 | bwd_inner: 4621.87 | bwd_allreduce: 5.02 | step: 46.01
- 71%|███████   | 4110/5800 [11:25:05<3:14:13,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.8181667327880859, 'learning_rate': 8.26418126065506e-06, 'epoch': 35.43}
- 71%|███████   | 4110/5800 [11:25:05<3:14:13,  6.90s/it]score1 tensor([[0.5430],
-        [0.4512],
-        [0.4512],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4473, 0.4551, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:42,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 20:11:42,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.78 | bwd_microstep: 4616.86 | bwd_inner_microstep: 4612.13 | bwd_allreduce_microstep: 4.65 | step_microstep: 42.91
-[2025-01-25 20:11:42,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.75 | bwd: 4616.89 | bwd_inner: 4612.13 | bwd_allreduce: 4.68 | step: 42.92
- 71%|███████   | 4111/5800 [11:25:12<3:14:06,  6.90s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.9470438957214355, 'learning_rate': 8.255139832962287e-06, 'epoch': 35.44}
- 71%|███████   | 4111/5800 [11:25:12<3:14:06,  6.90s/it]score1 tensor([[0.3828],
-        [0.6797],
-        [0.5039],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.6875, 0.5039, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:48,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.20 | optimizer_step: 4.37
-[2025-01-25 20:11:48,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.09 | bwd_microstep: 4578.53 | bwd_inner_microstep: 4573.54 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.23
-[2025-01-25 20:11:48,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.04 | bwd: 4578.55 | bwd_inner: 4573.54 | bwd_allreduce: 4.95 | step: 43.24
- 71%|███████   | 4112/5800 [11:25:18<3:13:45,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.0416932106018066, 'learning_rate': 8.246102067520699e-06, 'epoch': 35.45}
- 71%|███████   | 4112/5800 [11:25:18<3:13:45,  6.89s/it]score1 tensor([[0.4473],
-        [0.4414],
-        [0.5469],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4375, 0.5391, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:11:55,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 20:11:55,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.69 | bwd_microstep: 4616.72 | bwd_inner_microstep: 4611.50 | bwd_allreduce_microstep: 5.12 | step_microstep: 44.91
-[2025-01-25 20:11:55,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.65 | bwd: 4616.74 | bwd_inner: 4611.50 | bwd_allreduce: 5.17 | step: 44.94
- 71%|███████   | 4113/5800 [11:25:25<3:13:47,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.021713733673096, 'learning_rate': 8.237067967148437e-06, 'epoch': 35.46}
- 71%|███████   | 4113/5800 [11:25:25<3:13:47,  6.89s/it]score1 tensor([[0.4688],
-        [0.4141],
-        [0.6289],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4141, 0.6211, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:02,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 20:12:02,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.07 | bwd_microstep: 4536.81 | bwd_inner_microstep: 4531.79 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.03
-[2025-01-25 20:12:02,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.03 | bwd: 4536.84 | bwd_inner: 4531.79 | bwd_allreduce: 4.98 | step: 43.04
- 71%|███████   | 4114/5800 [11:25:32<3:13:01,  6.87s/it]                                                        {'loss': 0.0029, 'grad_norm': 0.5372175574302673, 'learning_rate': 8.228037534662485e-06, 'epoch': 35.47}
- 71%|███████   | 4114/5800 [11:25:32<3:13:01,  6.87s/it]score1 tensor([[0.5664],
-        [0.4805],
-        [0.6602],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5039, 0.6641, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:09,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 20:12:09,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.22 | bwd_microstep: 4560.57 | bwd_inner_microstep: 4554.76 | bwd_allreduce_microstep: 5.70 | step_microstep: 43.73
-[2025-01-25 20:12:09,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.18 | bwd: 4560.59 | bwd_inner: 4554.76 | bwd_allreduce: 5.76 | step: 43.73
- 71%|███████   | 4115/5800 [11:25:39<3:12:39,  6.86s/it]                                                        {'loss': 0.0078, 'grad_norm': 6.254603862762451, 'learning_rate': 8.219010772878697e-06, 'epoch': 35.47}
- 71%|███████   | 4115/5800 [11:25:39<3:12:39,  6.86s/it]score1 tensor([[0.4707],
-        [0.4434],
-        [0.5078],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.4453, 0.5156, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:16,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 20:12:16,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.61 | bwd_microstep: 4621.74 | bwd_inner_microstep: 4616.52 | bwd_allreduce_microstep: 5.10 | step_microstep: 42.68
-[2025-01-25 20:12:16,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.57 | bwd: 4621.76 | bwd_inner: 4616.52 | bwd_allreduce: 5.16 | step: 42.69
- 71%|███████   | 4116/5800 [11:25:46<3:12:55,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 4.007570743560791, 'learning_rate': 8.20998768461176e-06, 'epoch': 35.48}
- 71%|███████   | 4116/5800 [11:25:46<3:12:55,  6.87s/it]score1 tensor([[0.6211],
-        [0.5391],
-        [0.4082],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5430, 0.3945, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:23,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 20:12:23,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.43 | bwd_microstep: 4617.01 | bwd_inner_microstep: 4611.39 | bwd_allreduce_microstep: 5.54 | step_microstep: 43.43
-[2025-01-25 20:12:23,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.40 | bwd: 4617.04 | bwd_inner: 4611.39 | bwd_allreduce: 5.57 | step: 43.44
- 71%|███████   | 4117/5800 [11:25:53<3:12:59,  6.88s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.8897042274475098, 'learning_rate': 8.200968272675245e-06, 'epoch': 35.49}
- 71%|███████   | 4117/5800 [11:25:53<3:12:59,  6.88s/it]score1 tensor([[0.4395],
-        [0.5195],
-        [0.5977],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5039, 0.6055, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:30,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 20:12:30,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.31 | bwd_microstep: 4573.28 | bwd_inner_microstep: 4567.66 | bwd_allreduce_microstep: 5.53 | step_microstep: 48.69
-[2025-01-25 20:12:30,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.27 | bwd: 4573.31 | bwd_inner: 4567.66 | bwd_allreduce: 5.57 | step: 48.70
- 71%|███████   | 4118/5800 [11:26:00<3:12:47,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.3712459206581116, 'learning_rate': 8.191952539881554e-06, 'epoch': 35.5}
- 71%|███████   | 4118/5800 [11:26:00<3:12:47,  6.88s/it]score1 tensor([[0.5430],
-        [0.4531],
-        [0.4980],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4688, 0.4941, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:36,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 20:12:37,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.19 | bwd_microstep: 4565.09 | bwd_inner_microstep: 4560.20 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.97
-[2025-01-25 20:12:37,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.16 | bwd: 4565.12 | bwd_inner: 4560.20 | bwd_allreduce: 4.85 | step: 43.98
- 71%|███████   | 4119/5800 [11:26:06<3:12:24,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.1656253337860107, 'learning_rate': 8.18294048904194e-06, 'epoch': 35.51}
- 71%|███████   | 4119/5800 [11:26:06<3:12:24,  6.87s/it]score1 tensor([[0.5742],
-        [0.6211],
-        [0.4082],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6289, 0.4043, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:43,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 20:12:43,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.57 | bwd_microstep: 4622.97 | bwd_inner_microstep: 4617.56 | bwd_allreduce_microstep: 5.32 | step_microstep: 44.85
-[2025-01-25 20:12:43,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.54 | bwd: 4622.99 | bwd_inner: 4617.56 | bwd_allreduce: 5.36 | step: 44.86
- 71%|███████   | 4120/5800 [11:26:13<3:12:40,  6.88s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.6652921438217163, 'learning_rate': 8.173932122966535e-06, 'epoch': 35.52}
- 71%|███████   | 4120/5800 [11:26:13<3:12:40,  6.88s/it]score1 tensor([[0.4863],
-        [0.4785],
-        [0.5586],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4863, 0.5391, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:50,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 20:12:50,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.72 | bwd_microstep: 4575.89 | bwd_inner_microstep: 4570.60 | bwd_allreduce_microstep: 5.18 | step_microstep: 43.50
-[2025-01-25 20:12:50,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.65 | bwd: 4575.91 | bwd_inner: 4570.60 | bwd_allreduce: 5.24 | step: 43.51
- 71%|███████   | 4121/5800 [11:26:20<3:12:27,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 1.7968982458114624, 'learning_rate': 8.164927444464284e-06, 'epoch': 35.53}
- 71%|███████   | 4121/5800 [11:26:20<3:12:27,  6.88s/it]score1 tensor([[0.3789],
-        [0.5391],
-        [0.5117],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5312, 0.5117, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:12:57,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 20:12:57,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.65 | bwd_microstep: 4540.98 | bwd_inner_microstep: 4536.02 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.98
-[2025-01-25 20:12:57,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.62 | bwd: 4541.00 | bwd_inner: 4536.02 | bwd_allreduce: 4.91 | step: 42.98
- 71%|███████   | 4122/5800 [11:26:27<3:11:52,  6.86s/it]                                                        {'loss': 0.0029, 'grad_norm': 3.8242902755737305, 'learning_rate': 8.155926456343022e-06, 'epoch': 35.53}
- 71%|███████   | 4122/5800 [11:26:27<3:11:52,  6.86s/it]score1 tensor([[0.5234],
-        [0.3027],
-        [0.5352],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3086, 0.5430, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:04,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 20:13:04,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.72 | bwd_microstep: 4623.47 | bwd_inner_microstep: 4618.27 | bwd_allreduce_microstep: 5.08 | step_microstep: 41.81
-[2025-01-25 20:13:04,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.68 | bwd: 4623.50 | bwd_inner: 4618.27 | bwd_allreduce: 5.14 | step: 41.81
- 71%|███████   | 4123/5800 [11:26:34<3:12:05,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.8230068683624268, 'learning_rate': 8.146929161409393e-06, 'epoch': 35.54}
- 71%|███████   | 4123/5800 [11:26:34<3:12:05,  6.87s/it]score1 tensor([[0.6211],
-        [0.6719],
-        [0.4375],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.6641, 0.4297, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:11,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 20:13:11,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.59 | bwd_microstep: 4574.58 | bwd_inner_microstep: 4569.48 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.88
-[2025-01-25 20:13:11,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.55 | bwd: 4574.60 | bwd_inner: 4569.48 | bwd_allreduce: 5.05 | step: 44.89
- 71%|███████   | 4124/5800 [11:26:41<3:11:48,  6.87s/it]                                                        {'loss': 0.0068, 'grad_norm': 6.581103801727295, 'learning_rate': 8.13793556246893e-06, 'epoch': 35.55}
- 71%|███████   | 4124/5800 [11:26:41<3:11:48,  6.87s/it]score1 tensor([[0.5625],
-        [0.4141],
-        [0.4277],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4160, 0.4277, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:18,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 20:13:18,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.67 | bwd_microstep: 4573.39 | bwd_inner_microstep: 4568.21 | bwd_allreduce_microstep: 5.07 | step_microstep: 42.35
-[2025-01-25 20:13:18,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.61 | bwd: 4573.41 | bwd_inner: 4568.21 | bwd_allreduce: 5.13 | step: 42.36
- 71%|███████   | 4125/5800 [11:26:48<3:11:37,  6.86s/it]                                                        {'loss': 0.0034, 'grad_norm': 2.080033540725708, 'learning_rate': 8.128945662325984e-06, 'epoch': 35.56}
- 71%|███████   | 4125/5800 [11:26:48<3:11:37,  6.86s/it]score1 tensor([[0.6055],
-        [0.6016],
-        [0.3906],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.5977, 0.3926, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:25,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 20:13:25,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.29 | bwd_microstep: 4566.59 | bwd_inner_microstep: 4561.29 | bwd_allreduce_microstep: 5.19 | step_microstep: 47.11
-[2025-01-25 20:13:25,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.26 | bwd: 4566.61 | bwd_inner: 4561.29 | bwd_allreduce: 5.24 | step: 47.12
- 71%|███████   | 4126/5800 [11:26:55<3:11:22,  6.86s/it]                                                        {'loss': 0.0024, 'grad_norm': 2.6907005310058594, 'learning_rate': 8.119959463783754e-06, 'epoch': 35.57}
- 71%|███████   | 4126/5800 [11:26:55<3:11:22,  6.86s/it]score1 tensor([[0.4473],
-        [0.6016],
-        [0.4961],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.5781, 0.4844, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:31,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 20:13:31,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.63 | bwd_microstep: 4614.66 | bwd_inner_microstep: 4609.29 | bwd_allreduce_microstep: 5.28 | step_microstep: 48.22
-[2025-01-25 20:13:31,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.60 | bwd: 4614.71 | bwd_inner: 4609.29 | bwd_allreduce: 5.33 | step: 48.23
- 71%|███████   | 4127/5800 [11:27:01<3:11:32,  6.87s/it]                                                        {'loss': 0.0112, 'grad_norm': 4.429743766784668, 'learning_rate': 8.110976969644304e-06, 'epoch': 35.58}
- 71%|███████   | 4127/5800 [11:27:01<3:11:32,  6.87s/it]score1 tensor([[0.5703],
-        [0.4473],
-        [0.5352],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4336, 0.5430, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:38,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 20:13:38,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.30 | bwd_microstep: 4616.87 | bwd_inner_microstep: 4611.78 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.06
-[2025-01-25 20:13:38,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.25 | bwd: 4616.90 | bwd_inner: 4611.78 | bwd_allreduce: 5.03 | step: 44.07
- 71%|███████   | 4128/5800 [11:27:08<3:11:41,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.7931342124938965, 'learning_rate': 8.101998182708521e-06, 'epoch': 35.59}
- 71%|███████   | 4128/5800 [11:27:08<3:11:41,  6.88s/it]score1 tensor([[0.4316],
-        [0.5273],
-        [0.5234],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5391, 0.5156, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:45,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 20:13:45,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.88 | bwd_microstep: 4615.33 | bwd_inner_microstep: 4610.24 | bwd_allreduce_microstep: 5.01 | step_microstep: 46.96
-[2025-01-25 20:13:45,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.85 | bwd: 4615.35 | bwd_inner: 4610.24 | bwd_allreduce: 5.05 | step: 46.97
- 71%|███████   | 4129/5800 [11:27:15<3:11:46,  6.89s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.42523646354675293, 'learning_rate': 8.093023105776163e-06, 'epoch': 35.59}
- 71%|███████   | 4129/5800 [11:27:15<3:11:46,  6.89s/it]score1 tensor([[0.4688],
-        [0.5078],
-        [0.4531],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4961, 0.4414, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:52,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 20:13:52,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.47 | bwd_microstep: 4623.56 | bwd_inner_microstep: 4618.54 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.51
-[2025-01-25 20:13:52,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.41 | bwd: 4623.59 | bwd_inner: 4618.54 | bwd_allreduce: 4.98 | step: 43.51
- 71%|███████   | 4130/5800 [11:27:22<3:11:44,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 7.65839958190918, 'learning_rate': 8.084051741645796e-06, 'epoch': 35.6}
- 71%|███████   | 4130/5800 [11:27:22<3:11:44,  6.89s/it]score1 tensor([[0.5430],
-        [0.5039],
-        [0.4297],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5117, 0.4473, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:13:59,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.37
-[2025-01-25 20:13:59,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.35 | bwd_microstep: 4616.81 | bwd_inner_microstep: 4611.40 | bwd_allreduce_microstep: 5.27 | step_microstep: 42.60
-[2025-01-25 20:13:59,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.32 | bwd: 4616.84 | bwd_inner: 4611.40 | bwd_allreduce: 5.35 | step: 42.61
- 71%|███████   | 4131/5800 [11:27:29<3:11:40,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.5270558595657349, 'learning_rate': 8.075084093114853e-06, 'epoch': 35.61}
- 71%|███████   | 4131/5800 [11:27:29<3:11:40,  6.89s/it]score1 tensor([[0.4531],
-        [0.5781],
-        [0.5703],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5898, 0.5781, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:06,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.96 | optimizer_step: 4.36
-[2025-01-25 20:14:06,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.70 | bwd_microstep: 4621.03 | bwd_inner_microstep: 4615.71 | bwd_allreduce_microstep: 5.19 | step_microstep: 43.54
-[2025-01-25 20:14:06,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.67 | bwd: 4621.06 | bwd_inner: 4615.71 | bwd_allreduce: 5.26 | step: 43.54
- 71%|███████   | 4132/5800 [11:27:36<3:11:39,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 8.049455642700195, 'learning_rate': 8.066120162979615e-06, 'epoch': 35.62}
- 71%|███████   | 4132/5800 [11:27:36<3:11:39,  6.89s/it]score1 tensor([[0.4766],
-        [0.5781],
-        [0.4805],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6016, 0.4844, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:13,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.00 | optimizer_step: 4.37
-[2025-01-25 20:14:13,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.15 | bwd_microstep: 4622.92 | bwd_inner_microstep: 4617.88 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.94
-[2025-01-25 20:14:13,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.10 | bwd: 4622.95 | bwd_inner: 4617.88 | bwd_allreduce: 4.99 | step: 42.95
- 71%|███████▏  | 4133/5800 [11:27:43<3:11:37,  6.90s/it]                                                        {'loss': 0.0122, 'grad_norm': 8.159327507019043, 'learning_rate': 8.057159954035175e-06, 'epoch': 35.63}
- 71%|███████▏  | 4133/5800 [11:27:43<3:11:37,  6.90s/it]score1 tensor([[0.5508],
-        [0.6523],
-        [0.3457],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6445, 0.3613, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:20,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 20:14:20,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.98 | bwd_microstep: 4612.31 | bwd_inner_microstep: 4606.75 | bwd_allreduce_microstep: 5.47 | step_microstep: 48.84
-[2025-01-25 20:14:20,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.95 | bwd: 4612.33 | bwd_inner: 4606.74 | bwd_allreduce: 5.52 | step: 48.85
- 71%|███████▏  | 4134/5800 [11:27:50<3:11:29,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 1.0710749626159668, 'learning_rate': 8.048203469075493e-06, 'epoch': 35.64}
- 71%|███████▏  | 4134/5800 [11:27:50<3:11:29,  6.90s/it]score1 tensor([[0.5469],
-        [0.3926],
-        [0.5625],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4062, 0.5781, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:27,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.65 | optimizer_step: 4.37
-[2025-01-25 20:14:27,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.32 | bwd_microstep: 4629.01 | bwd_inner_microstep: 4624.14 | bwd_allreduce_microstep: 4.78 | step_microstep: 46.65
-[2025-01-25 20:14:27,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.29 | bwd: 4629.03 | bwd_inner: 4624.14 | bwd_allreduce: 4.82 | step: 46.66
- 71%|███████▏  | 4135/5800 [11:27:57<3:11:30,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 8.37330150604248, 'learning_rate': 8.039250710893342e-06, 'epoch': 35.65}
- 71%|███████▏  | 4135/5800 [11:27:57<3:11:30,  6.90s/it]score1 tensor([[0.3418],
-        [0.5977],
-        [0.3906],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.6094, 0.3984, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:34,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.15 | optimizer_step: 4.36
-[2025-01-25 20:14:34,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.29 | bwd_microstep: 4623.84 | bwd_inner_microstep: 4618.72 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.38
-[2025-01-25 20:14:34,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.26 | bwd: 4623.87 | bwd_inner: 4618.72 | bwd_allreduce: 5.06 | step: 43.39
- 71%|███████▏  | 4136/5800 [11:28:04<3:11:27,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 7.4967474937438965, 'learning_rate': 8.030301682280366e-06, 'epoch': 35.66}
- 71%|███████▏  | 4136/5800 [11:28:04<3:11:27,  6.90s/it]score1 tensor([[0.5781],
-        [0.4434],
-        [0.4082],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4590, 0.4160, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:40,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 20:14:40,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4625.45 | bwd_inner_microstep: 4620.16 | bwd_allreduce_microstep: 5.20 | step_microstep: 44.38
-[2025-01-25 20:14:40,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4625.49 | bwd_inner: 4620.15 | bwd_allreduce: 5.24 | step: 44.38
- 71%|███████▏  | 4137/5800 [11:28:10<3:11:21,  6.90s/it]                                                        {'loss': 0.0127, 'grad_norm': 3.8369534015655518, 'learning_rate': 8.021356386027018e-06, 'epoch': 35.66}
- 71%|███████▏  | 4137/5800 [11:28:10<3:11:21,  6.90s/it]score1 tensor([[0.4883],
-        [0.4688],
-        [0.4277],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4746, 0.4453, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:47,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 20:14:47,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.11 | bwd_microstep: 4616.69 | bwd_inner_microstep: 4611.59 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.73
-[2025-01-25 20:14:47,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.08 | bwd: 4616.71 | bwd_inner: 4611.59 | bwd_allreduce: 5.05 | step: 43.74
- 71%|███████▏  | 4138/5800 [11:28:17<3:11:25,  6.91s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.9012076258659363, 'learning_rate': 8.012414824922587e-06, 'epoch': 35.67}
- 71%|███████▏  | 4138/5800 [11:28:17<3:11:25,  6.91s/it]score1 tensor([[0.5547],
-        [0.6406],
-        [0.3594],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.6484, 0.3457, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:14:54,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 20:14:54,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.00 | bwd_microstep: 4580.90 | bwd_inner_microstep: 4575.74 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.74
-[2025-01-25 20:14:54,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.95 | bwd: 4580.92 | bwd_inner: 4575.74 | bwd_allreduce: 5.09 | step: 45.74
- 71%|███████▏  | 4139/5800 [11:28:24<3:11:00,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 1.568564534187317, 'learning_rate': 8.003477001755226e-06, 'epoch': 35.68}
- 71%|███████▏  | 4139/5800 [11:28:24<3:11:00,  6.90s/it]score1 tensor([[0.4531],
-        [0.5859],
-        [0.3867],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5742, 0.3789, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:15:01,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.36
-[2025-01-25 20:15:01,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.61 | bwd_microstep: 4612.99 | bwd_inner_microstep: 4607.42 | bwd_allreduce_microstep: 5.46 | step_microstep: 44.61
-[2025-01-25 20:15:01,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.56 | bwd: 4613.01 | bwd_inner: 4607.42 | bwd_allreduce: 5.52 | step: 44.62
- 71%|███████▏  | 4140/5800 [11:28:31<3:10:50,  6.90s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.5307512283325195, 'learning_rate': 7.994542919311886e-06, 'epoch': 35.69}
- 71%|███████▏  | 4140/5800 [11:28:31<3:10:50,  6.90s/it]evaluate!
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6602]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1309, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1562, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1641, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0762, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1621, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6562]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4453]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6445]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4258]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1719, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0625, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1230, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6761974590064878
-PLCC_score: 0.6714556721750863
-KRCC_score: 0.491993503702106
-SRCC_level: 0.6761974590064878
-PLCC_level: 0.6714556721750863
-KRCC_level: 0.491993503702106
-score1 tensor([[0.4395],
-        [0.4531],
-        [0.5430],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4551, 0.5469, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:11,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 20:25:11,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.89 | bwd_microstep: 4595.91 | bwd_inner_microstep: 4590.73 | bwd_allreduce_microstep: 5.08 | step_microstep: 43.81
-[2025-01-25 20:25:11,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.85 | bwd: 4595.94 | bwd_inner: 4590.73 | bwd_allreduce: 5.13 | step: 43.81
- 71%|███████▏  | 4141/5800 [11:38:41<86:31:10, 187.75s/it]                                                          {'loss': 0.0098, 'grad_norm': 0.47940805554389954, 'learning_rate': 7.985612580378383e-06, 'epoch': 35.7}
- 71%|███████▏  | 4141/5800 [11:38:41<86:31:10, 187.75s/it]score1 tensor([[0.5508],
-        [0.5703],
-        [0.3652],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5625, 0.3672, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:18,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.58 | optimizer_step: 4.37
-[2025-01-25 20:25:18,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.30 | bwd_microstep: 4590.27 | bwd_inner_microstep: 4585.23 | bwd_allreduce_microstep: 4.95 | step_microstep: 51.87
-[2025-01-25 20:25:18,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.27 | bwd: 4590.30 | bwd_inner: 4585.23 | bwd_allreduce: 4.99 | step: 51.88
- 71%|███████▏  | 4142/5800 [11:38:48<61:28:26, 133.48s/it]                                                          {'loss': 0.0054, 'grad_norm': 0.9411638975143433, 'learning_rate': 7.976685987739338e-06, 'epoch': 35.71}
- 71%|███████▏  | 4142/5800 [11:38:48<61:28:26, 133.48s/it]score1 tensor([[0.5078],
-        [0.4043],
-        [0.4102],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4023, 0.4004, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:25,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 20:25:25,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.83 | bwd_microstep: 4539.82 | bwd_inner_microstep: 4534.61 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.52
-[2025-01-25 20:25:25,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.78 | bwd: 4539.85 | bwd_inner: 4534.61 | bwd_allreduce: 5.16 | step: 45.53
- 71%|███████▏  | 4143/5800 [11:38:55<43:56:35, 95.47s/it]                                                          {'loss': 0.0054, 'grad_norm': 5.386568546295166, 'learning_rate': 7.967763144178234e-06, 'epoch': 35.72}
- 71%|███████▏  | 4143/5800 [11:38:55<43:56:35, 95.47s/it]score1 tensor([[0.4688],
-        [0.5039],
-        [0.4668],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4941, 0.4570, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:31,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 20:25:31,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.97 | bwd_microstep: 4586.17 | bwd_inner_microstep: 4581.48 | bwd_allreduce_microstep: 4.61 | step_microstep: 43.43
-[2025-01-25 20:25:31,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.93 | bwd: 4586.19 | bwd_inner: 4581.48 | bwd_allreduce: 4.65 | step: 43.44
- 71%|███████▏  | 4144/5800 [11:39:01<31:41:07, 68.88s/it]                                                         {'loss': 0.0073, 'grad_norm': 7.679704666137695, 'learning_rate': 7.958844052477356e-06, 'epoch': 35.72}
- 71%|███████▏  | 4144/5800 [11:39:01<31:41:07, 68.88s/it]score1 tensor([[0.5703],
-        [0.5586],
-        [0.4258],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5742, 0.4121, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:38,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 20:25:38,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.61 | bwd_microstep: 4598.50 | bwd_inner_microstep: 4593.70 | bwd_allreduce_microstep: 4.66 | step_microstep: 40.63
-[2025-01-25 20:25:38,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.58 | bwd: 4598.53 | bwd_inner: 4593.70 | bwd_allreduce: 4.72 | step: 40.64
- 71%|███████▏  | 4145/5800 [11:39:08<23:06:39, 50.27s/it]                                                         {'loss': 0.0146, 'grad_norm': 3.765890121459961, 'learning_rate': 7.94992871541783e-06, 'epoch': 35.73}
- 71%|███████▏  | 4145/5800 [11:39:08<23:06:39, 50.27s/it]score1 tensor([[0.4160],
-        [0.6289],
-        [0.6172],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.6367, 0.6250, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:45,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 20:25:45,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.20 | bwd_microstep: 4603.65 | bwd_inner_microstep: 4598.79 | bwd_allreduce_microstep: 4.75 | step_microstep: 49.45
-[2025-01-25 20:25:45,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.17 | bwd: 4603.67 | bwd_inner: 4598.79 | bwd_allreduce: 4.81 | step: 49.46
- 71%|███████▏  | 4146/5800 [11:39:15<17:06:51, 37.25s/it]                                                         {'loss': 0.0073, 'grad_norm': 4.748857021331787, 'learning_rate': 7.941017135779623e-06, 'epoch': 35.74}
- 71%|███████▏  | 4146/5800 [11:39:15<17:06:51, 37.25s/it]score1 tensor([[0.6250],
-        [0.6680],
-        [0.4102],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6836, 0.3984, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:52,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 20:25:52,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.14 | bwd_microstep: 4567.67 | bwd_inner_microstep: 4562.63 | bwd_allreduce_microstep: 4.94 | step_microstep: 42.02
-[2025-01-25 20:25:52,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.11 | bwd: 4567.69 | bwd_inner: 4562.63 | bwd_allreduce: 4.99 | step: 42.03
- 72%|███████▏  | 4147/5800 [11:39:22<12:54:57, 28.13s/it]                                                         {'loss': 0.0127, 'grad_norm': 3.020280122756958, 'learning_rate': 7.932109316341508e-06, 'epoch': 35.75}
- 72%|███████▏  | 4147/5800 [11:39:22<12:54:57, 28.13s/it]score1 tensor([[0.4062],
-        [0.4121],
-        [0.5273],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4199, 0.5352, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:25:59,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.37
-[2025-01-25 20:25:59,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.45 | bwd_microstep: 4613.81 | bwd_inner_microstep: 4609.49 | bwd_allreduce_microstep: 4.23 | step_microstep: 41.50
-[2025-01-25 20:25:59,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.41 | bwd: 4613.83 | bwd_inner: 4609.49 | bwd_allreduce: 4.27 | step: 41.51
- 72%|███████▏  | 4148/5800 [11:39:29<9:58:55, 21.75s/it]                                                         {'loss': 0.0059, 'grad_norm': 3.8633859157562256, 'learning_rate': 7.92320525988111e-06, 'epoch': 35.76}
- 72%|███████▏  | 4148/5800 [11:39:29<9:58:55, 21.75s/it]score1 tensor([[0.4941],
-        [0.4531],
-        [0.4863],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4590, 0.4922, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:06,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:26:06,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.07 | bwd_microstep: 4617.26 | bwd_inner_microstep: 4612.38 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.74
-[2025-01-25 20:26:06,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.03 | bwd: 4617.28 | bwd_inner: 4612.39 | bwd_allreduce: 4.80 | step: 45.75
- 72%|███████▏  | 4149/5800 [11:39:36<7:55:53, 17.29s/it]                                                        {'loss': 0.0039, 'grad_norm': 3.970958709716797, 'learning_rate': 7.914304969174863e-06, 'epoch': 35.77}
- 72%|███████▏  | 4149/5800 [11:39:36<7:55:53, 17.29s/it]score1 tensor([[0.5547],
-        [0.6328],
-        [0.3809],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6484, 0.3809, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:13,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 20:26:13,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.10 | bwd_microstep: 4539.92 | bwd_inner_microstep: 4535.03 | bwd_allreduce_microstep: 4.80 | step_microstep: 41.93
-[2025-01-25 20:26:13,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.06 | bwd: 4539.94 | bwd_inner: 4535.03 | bwd_allreduce: 4.84 | step: 41.93
- 72%|███████▏  | 4150/5800 [11:39:42<6:29:04, 14.15s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.38490819931030273, 'learning_rate': 7.905408446998027e-06, 'epoch': 35.78}
- 72%|███████▏  | 4150/5800 [11:39:42<6:29:04, 14.15s/it]score1 tensor([[0.3945],
-        [0.6953],
-        [0.4902],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.6953, 0.4883, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:19,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.36
-[2025-01-25 20:26:19,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.91 | bwd_microstep: 4556.25 | bwd_inner_microstep: 4551.90 | bwd_allreduce_microstep: 4.27 | step_microstep: 47.26
-[2025-01-25 20:26:19,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.88 | bwd: 4556.27 | bwd_inner: 4551.90 | bwd_allreduce: 4.30 | step: 47.26
- 72%|███████▏  | 4151/5800 [11:39:49<5:28:25, 11.95s/it]                                                        {'loss': 0.0029, 'grad_norm': 1.7033309936523438, 'learning_rate': 7.896515696124703e-06, 'epoch': 35.78}
- 72%|███████▏  | 4151/5800 [11:39:49<5:28:25, 11.95s/it]score1 tensor([[0.6562],
-        [0.6211],
-        [0.6172],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.6094, 0.6094, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:26,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 20:26:26,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.50 | bwd_microstep: 4613.49 | bwd_inner_microstep: 4608.49 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.80
-[2025-01-25 20:26:26,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.46 | bwd: 4613.51 | bwd_inner: 4608.49 | bwd_allreduce: 4.96 | step: 44.82
- 72%|███████▏  | 4152/5800 [11:39:56<4:46:28, 10.43s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.290314674377441, 'learning_rate': 7.88762671932779e-06, 'epoch': 35.79}
- 72%|███████▏  | 4152/5800 [11:39:56<4:46:28, 10.43s/it]score1 tensor([[0.4844],
-        [0.4824],
-        [0.5312],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4785, 0.5352, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:33,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 20:26:33,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.63 | bwd_microstep: 4616.69 | bwd_inner_microstep: 4611.63 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.21
-[2025-01-25 20:26:33,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.58 | bwd: 4616.71 | bwd_inner: 4611.63 | bwd_allreduce: 5.02 | step: 45.22
- 72%|███████▏  | 4153/5800 [11:40:03<4:17:09,  9.37s/it]                                                        {'loss': 0.0059, 'grad_norm': 4.207347393035889, 'learning_rate': 7.878741519379041e-06, 'epoch': 35.8}
- 72%|███████▏  | 4153/5800 [11:40:03<4:17:09,  9.37s/it]score1 tensor([[0.5312],
-        [0.4414],
-        [0.6250],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4473, 0.6133, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:40,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 20:26:40,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.84 | bwd_microstep: 4609.72 | bwd_inner_microstep: 4604.71 | bwd_allreduce_microstep: 4.92 | step_microstep: 52.25
-[2025-01-25 20:26:40,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.81 | bwd: 4609.74 | bwd_inner: 4604.71 | bwd_allreduce: 4.96 | step: 52.26
- 72%|███████▏  | 4154/5800 [11:40:10<3:56:36,  8.62s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.5078296661376953, 'learning_rate': 7.869860099049003e-06, 'epoch': 35.81}
- 72%|███████▏  | 4154/5800 [11:40:10<3:56:36,  8.62s/it]score1 tensor([[0.5938],
-        [0.4844],
-        [0.5156],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4707, 0.5039, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:47,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 20:26:47,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.20 | bwd_microstep: 4617.88 | bwd_inner_microstep: 4612.45 | bwd_allreduce_microstep: 5.33 | step_microstep: 43.62
-[2025-01-25 20:26:47,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.15 | bwd: 4617.91 | bwd_inner: 4612.45 | bwd_allreduce: 5.38 | step: 43.62
- 72%|███████▏  | 4155/5800 [11:40:17<3:42:15,  8.11s/it]                                                        {'loss': 0.0161, 'grad_norm': 4.275269031524658, 'learning_rate': 7.860982461107068e-06, 'epoch': 35.82}
- 72%|███████▏  | 4155/5800 [11:40:17<3:42:15,  8.11s/it]score1 tensor([[0.4199],
-        [0.3711],
-        [0.4785],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.2812, 0.4805, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0249, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:26:54,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.36
-[2025-01-25 20:26:54,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.66 | bwd_microstep: 4614.13 | bwd_inner_microstep: 4609.07 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.42
-[2025-01-25 20:26:54,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.61 | bwd: 4614.15 | bwd_inner: 4609.07 | bwd_allreduce: 5.01 | step: 43.42
- 72%|███████▏  | 4156/5800 [11:40:24<3:32:07,  7.74s/it]                                                        {'loss': 0.0249, 'grad_norm': 0.7964985966682434, 'learning_rate': 7.852108608321432e-06, 'epoch': 35.83}
- 72%|███████▏  | 4156/5800 [11:40:24<3:32:07,  7.74s/it]score1 tensor([[0.5703],
-        [0.5547],
-        [0.5859],
-        [0.7109]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5508, 0.5977, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:01,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 20:27:01,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.29 | bwd_microstep: 4620.15 | bwd_inner_microstep: 4614.20 | bwd_allreduce_microstep: 5.86 | step_microstep: 52.38
-[2025-01-25 20:27:01,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.26 | bwd: 4620.18 | bwd_inner: 4614.20 | bwd_allreduce: 5.90 | step: 52.38
- 72%|███████▏  | 4157/5800 [11:40:31<3:25:08,  7.49s/it]                                                        {'loss': 0.0059, 'grad_norm': 4.641010761260986, 'learning_rate': 7.843238543459109e-06, 'epoch': 35.84}
- 72%|███████▏  | 4157/5800 [11:40:31<3:25:08,  7.49s/it]score1 tensor([[0.5195],
-        [0.6328],
-        [0.5938],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6289, 0.6055, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:08,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 20:27:08,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.66 | bwd_microstep: 4620.14 | bwd_inner_microstep: 4615.02 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.11
-[2025-01-25 20:27:08,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.61 | bwd: 4620.16 | bwd_inner: 4615.02 | bwd_allreduce: 5.07 | step: 44.12
- 72%|███████▏  | 4158/5800 [11:40:38<3:20:08,  7.31s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.136329174041748, 'learning_rate': 7.834372269285954e-06, 'epoch': 35.84}
- 72%|███████▏  | 4158/5800 [11:40:38<3:20:08,  7.31s/it]score1 tensor([[0.4980],
-        [0.4629],
-        [0.5273],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4551, 0.5273, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:14,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 20:27:14,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.98 | bwd_microstep: 4573.95 | bwd_inner_microstep: 4568.49 | bwd_allreduce_microstep: 5.32 | step_microstep: 43.80
-[2025-01-25 20:27:14,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.95 | bwd: 4573.98 | bwd_inner: 4568.49 | bwd_allreduce: 5.39 | step: 43.81
- 72%|███████▏  | 4159/5800 [11:40:44<3:16:11,  7.17s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.0029687881469727, 'learning_rate': 7.82550978856661e-06, 'epoch': 35.85}
- 72%|███████▏  | 4159/5800 [11:40:44<3:16:11,  7.17s/it]score1 tensor([[0.4258],
-        [0.4629],
-        [0.5078],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4648, 0.5156, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:21,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.46 | optimizer_step: 4.37
-[2025-01-25 20:27:21,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.64 | bwd_microstep: 4614.03 | bwd_inner_microstep: 4608.35 | bwd_allreduce_microstep: 5.59 | step_microstep: 45.42
-[2025-01-25 20:27:21,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4614.05 | bwd_inner: 4608.35 | bwd_allreduce: 5.64 | step: 45.42
- 72%|███████▏  | 4160/5800 [11:40:51<3:13:54,  7.09s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.9987130165100098, 'learning_rate': 7.816651104064567e-06, 'epoch': 35.86}
- 72%|███████▏  | 4160/5800 [11:40:51<3:13:54,  7.09s/it]score1 tensor([[0.5430],
-        [0.4668],
-        [0.4316],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4805, 0.4316, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:28,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 20:27:28,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4582.46 | bwd_inner_microstep: 4577.42 | bwd_allreduce_microstep: 4.92 | step_microstep: 46.37
-[2025-01-25 20:27:28,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4582.48 | bwd_inner: 4577.42 | bwd_allreduce: 4.98 | step: 46.38
- 72%|███████▏  | 4161/5800 [11:40:58<3:11:55,  7.03s/it]                                                        {'loss': 0.0049, 'grad_norm': 5.842111110687256, 'learning_rate': 7.80779621854211e-06, 'epoch': 35.87}
- 72%|███████▏  | 4161/5800 [11:40:58<3:11:55,  7.03s/it]score1 tensor([[0.5117],
-        [0.5195],
-        [0.4883],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.5273, 0.4805, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:35,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 20:27:35,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.91 | bwd_microstep: 4617.41 | bwd_inner_microstep: 4612.44 | bwd_allreduce_microstep: 4.86 | step_microstep: 43.80
-[2025-01-25 20:27:35,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.88 | bwd: 4617.43 | bwd_inner: 4612.44 | bwd_allreduce: 4.92 | step: 43.81
- 72%|███████▏  | 4162/5800 [11:41:05<3:10:40,  6.98s/it]                                                        {'loss': 0.0142, 'grad_norm': 4.053166389465332, 'learning_rate': 7.798945134760337e-06, 'epoch': 35.88}
- 72%|███████▏  | 4162/5800 [11:41:05<3:10:40,  6.98s/it]score1 tensor([[0.5000],
-        [0.4473],
-        [0.5156],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4688, 0.5234, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:42,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.36
-[2025-01-25 20:27:42,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.75 | bwd_microstep: 4626.60 | bwd_inner_microstep: 4621.53 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.71
-[2025-01-25 20:27:42,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.72 | bwd: 4626.63 | bwd_inner: 4621.53 | bwd_allreduce: 5.02 | step: 43.72
- 72%|███████▏  | 4163/5800 [11:41:12<3:09:54,  6.96s/it]                                                        {'loss': 0.0137, 'grad_norm': 7.87991189956665, 'learning_rate': 7.790097855479177e-06, 'epoch': 35.89}
- 72%|███████▏  | 4163/5800 [11:41:12<3:09:54,  6.96s/it]score1 tensor([[0.5312],
-        [0.4062],
-        [0.3965],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4082, 0.4062, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:49,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 20:27:49,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.53 | bwd_microstep: 4627.98 | bwd_inner_microstep: 4623.03 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.05
-[2025-01-25 20:27:49,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4628.00 | bwd_inner: 4623.03 | bwd_allreduce: 4.90 | step: 44.06
- 72%|███████▏  | 4164/5800 [11:41:19<3:09:21,  6.94s/it]                                                        {'loss': 0.0127, 'grad_norm': 7.578120231628418, 'learning_rate': 7.781254383457376e-06, 'epoch': 35.9}
- 72%|███████▏  | 4164/5800 [11:41:19<3:09:21,  6.94s/it]score1 tensor([[0.3281],
-        [0.3926],
-        [0.5352],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.4004, 0.5508, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:27:56,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 20:27:56,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4614.08 | bwd_inner_microstep: 4608.68 | bwd_allreduce_microstep: 5.30 | step_microstep: 48.76
-[2025-01-25 20:27:56,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4614.11 | bwd_inner: 4608.68 | bwd_allreduce: 5.35 | step: 48.77
- 72%|███████▏  | 4165/5800 [11:41:26<3:08:49,  6.93s/it]                                                        {'loss': 0.0132, 'grad_norm': 7.414991855621338, 'learning_rate': 7.772414721452461e-06, 'epoch': 35.91}
- 72%|███████▏  | 4165/5800 [11:41:26<3:08:49,  6.93s/it]score1 tensor([[0.5117],
-        [0.6719],
-        [0.6172],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6875, 0.6211, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:03,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 20:28:03,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.47 | bwd_microstep: 4581.38 | bwd_inner_microstep: 4576.23 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.34
-[2025-01-25 20:28:03,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.44 | bwd: 4581.41 | bwd_inner: 4576.23 | bwd_allreduce: 5.09 | step: 45.35
- 72%|███████▏  | 4166/5800 [11:41:33<3:08:10,  6.91s/it]                                                        {'loss': 0.0107, 'grad_norm': 6.843353748321533, 'learning_rate': 7.763578872220811e-06, 'epoch': 35.91}
- 72%|███████▏  | 4166/5800 [11:41:33<3:08:10,  6.91s/it]score1 tensor([[0.6016],
-        [0.5273],
-        [0.5273],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5391, 0.5469, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:10,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 20:28:10,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.36 | bwd_microstep: 4620.48 | bwd_inner_microstep: 4613.32 | bwd_allreduce_microstep: 7.02 | step_microstep: 45.20
-[2025-01-25 20:28:10,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.32 | bwd: 4620.50 | bwd_inner: 4613.32 | bwd_allreduce: 7.07 | step: 45.21
- 72%|███████▏  | 4167/5800 [11:41:40<3:07:57,  6.91s/it]                                                        {'loss': 0.0137, 'grad_norm': 8.371145248413086, 'learning_rate': 7.754746838517584e-06, 'epoch': 35.92}
- 72%|███████▏  | 4167/5800 [11:41:40<3:07:57,  6.91s/it]score1 tensor([[0.5312],
-        [0.5312],
-        [0.4707],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.5508, 0.4980, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:16,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 20:28:16,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.08 | bwd_microstep: 4627.17 | bwd_inner_microstep: 4618.57 | bwd_allreduce_microstep: 8.44 | step_microstep: 47.50
-[2025-01-25 20:28:16,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.04 | bwd: 4627.20 | bwd_inner: 4618.57 | bwd_allreduce: 8.54 | step: 47.50
- 72%|███████▏  | 4168/5800 [11:41:46<3:07:50,  6.91s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.5666874647140503, 'learning_rate': 7.745918623096773e-06, 'epoch': 35.93}
- 72%|███████▏  | 4168/5800 [11:41:46<3:07:50,  6.91s/it]score1 tensor([[0.4766],
-        [0.5352],
-        [0.4961],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5312, 0.4863, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:23,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 20:28:23,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.87 | bwd_microstep: 4616.22 | bwd_inner_microstep: 4610.82 | bwd_allreduce_microstep: 5.28 | step_microstep: 48.59
-[2025-01-25 20:28:23,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.83 | bwd: 4616.27 | bwd_inner: 4610.82 | bwd_allreduce: 5.33 | step: 48.61
- 72%|███████▏  | 4169/5800 [11:41:53<3:07:44,  6.91s/it]                                                        {'loss': 0.0059, 'grad_norm': 4.112548828125, 'learning_rate': 7.737094228711153e-06, 'epoch': 35.94}
- 72%|███████▏  | 4169/5800 [11:41:53<3:07:44,  6.91s/it]score1 tensor([[0.4863],
-        [0.6016],
-        [0.5898],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6094, 0.5898, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:30,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 20:28:30,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.62 | bwd_microstep: 4541.59 | bwd_inner_microstep: 4536.14 | bwd_allreduce_microstep: 5.35 | step_microstep: 47.62
-[2025-01-25 20:28:30,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.58 | bwd: 4541.61 | bwd_inner: 4536.14 | bwd_allreduce: 5.40 | step: 47.63
- 72%|███████▏  | 4170/5800 [11:42:00<3:06:58,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.7657455205917358, 'learning_rate': 7.728273658112341e-06, 'epoch': 35.95}
- 72%|███████▏  | 4170/5800 [11:42:00<3:06:58,  6.88s/it]score1 tensor([[0.4922],
-        [0.5820],
-        [0.3652],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5547, 0.3672, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:37,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:28:37,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.59 | bwd_microstep: 4616.46 | bwd_inner_microstep: 4610.97 | bwd_allreduce_microstep: 5.40 | step_microstep: 46.56
-[2025-01-25 20:28:37,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4616.50 | bwd_inner: 4610.97 | bwd_allreduce: 5.45 | step: 46.57
- 72%|███████▏  | 4171/5800 [11:42:07<3:06:55,  6.89s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.6344605088233948, 'learning_rate': 7.719456914050729e-06, 'epoch': 35.96}
- 72%|███████▏  | 4171/5800 [11:42:07<3:06:55,  6.89s/it]score1 tensor([[0.6289],
-        [0.4473],
-        [0.4805],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4336, 0.4766, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:44,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 6.43 | optimizer_step: 4.36
-[2025-01-25 20:28:44,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4623.29 | bwd_inner_microstep: 4613.99 | bwd_allreduce_microstep: 9.21 | step_microstep: 49.97
-[2025-01-25 20:28:44,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4623.32 | bwd_inner: 4613.99 | bwd_allreduce: 9.26 | step: 49.97
- 72%|███████▏  | 4172/5800 [11:42:14<3:07:01,  6.89s/it]                                                        {'loss': 0.0103, 'grad_norm': 8.031909942626953, 'learning_rate': 7.710643999275542e-06, 'epoch': 35.97}
- 72%|███████▏  | 4172/5800 [11:42:14<3:07:01,  6.89s/it]score1 tensor([[0.4609],
-        [0.5117],
-        [0.5312],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4980, 0.5312, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:51,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 20:28:51,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.33 | bwd_microstep: 4539.06 | bwd_inner_microstep: 4533.94 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.49
-[2025-01-25 20:28:51,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.29 | bwd: 4539.08 | bwd_inner: 4533.94 | bwd_allreduce: 5.06 | step: 43.51
- 72%|███████▏  | 4173/5800 [11:42:21<3:06:18,  6.87s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.9598233699798584, 'learning_rate': 7.70183491653479e-06, 'epoch': 35.97}
- 72%|███████▏  | 4173/5800 [11:42:21<3:06:18,  6.87s/it]score1 tensor([[0.4883],
-        [0.5625],
-        [0.4688],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.6055, 0.4609, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0210, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:28:58,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 20:28:58,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.87 | bwd_microstep: 4613.98 | bwd_inner_microstep: 4608.55 | bwd_allreduce_microstep: 5.32 | step_microstep: 48.56
-[2025-01-25 20:28:58,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.84 | bwd: 4614.01 | bwd_inner: 4608.55 | bwd_allreduce: 5.37 | step: 48.57
- 72%|███████▏  | 4174/5800 [11:42:28<3:06:22,  6.88s/it]                                                        {'loss': 0.021, 'grad_norm': 4.002481460571289, 'learning_rate': 7.69302966857531e-06, 'epoch': 35.98}
- 72%|███████▏  | 4174/5800 [11:42:28<3:06:22,  6.88s/it]score1 tensor([[0.5664],
-        [0.5625],
-        [0.4355],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5625, 0.4180, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:29:05,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 20:29:05,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4568.35 | bwd_inner_microstep: 4563.19 | bwd_allreduce_microstep: 5.08 | step_microstep: 45.54
-[2025-01-25 20:29:05,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.12 | bwd: 4568.37 | bwd_inner: 4563.19 | bwd_allreduce: 5.12 | step: 45.54
- 72%|███████▏  | 4175/5800 [11:42:35<3:06:02,  6.87s/it]                                                        {'loss': 0.0073, 'grad_norm': 2.696834087371826, 'learning_rate': 7.684228258142722e-06, 'epoch': 35.99}
- 72%|███████▏  | 4175/5800 [11:42:35<3:06:02,  6.87s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:29:09,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 20:29:09,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 575.41 | bwd_microstep: 1220.94 | bwd_inner_microstep: 1215.92 | bwd_allreduce_microstep: 4.91 | step_microstep: 46.57
-[2025-01-25 20:29:09,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 575.38 | bwd: 1220.97 | bwd_inner: 1215.92 | bwd_allreduce: 4.97 | step: 46.57
- 72%|███████▏  | 4176/5800 [11:42:39<2:45:46,  6.12s/it]                                                        {'loss': 0.0117, 'grad_norm': 7.490439414978027, 'learning_rate': 7.675430687981454e-06, 'epoch': 36.0}
- 72%|███████▏  | 4176/5800 [11:42:39<2:45:46,  6.12s/it][2025-01-25 20:29:14,398] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:29:24,868] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:29:35,722] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:29:46,374] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4473],
-        [0.3770],
-        [0.4824],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.3750, 0.4941, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:02,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 20:30:02,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.64 | bwd_microstep: 4594.71 | bwd_inner_microstep: 4589.12 | bwd_allreduce_microstep: 5.47 | step_microstep: 51.22
-[2025-01-25 20:30:02,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.61 | bwd: 4594.74 | bwd_inner: 4589.12 | bwd_allreduce: 5.54 | step: 51.23
- 72%|███████▏  | 4177/5800 [11:43:32<9:07:42, 20.25s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.644432544708252, 'learning_rate': 7.666636960834755e-06, 'epoch': 36.01}
- 72%|███████▏  | 4177/5800 [11:43:32<9:07:42, 20.25s/it]score1 tensor([[0.6211],
-        [0.5312],
-        [0.4141],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5273, 0.4004, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:09,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:30:09,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.41 | bwd_microstep: 4534.89 | bwd_inner_microstep: 4529.50 | bwd_allreduce_microstep: 5.32 | step_microstep: 45.66
-[2025-01-25 20:30:09,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.36 | bwd: 4534.92 | bwd_inner: 4529.50 | bwd_allreduce: 5.36 | step: 45.66
- 72%|███████▏  | 4178/5800 [11:43:39<7:18:15, 16.21s/it]                                                        {'loss': 0.0078, 'grad_norm': 1.9658764600753784, 'learning_rate': 7.657847079444647e-06, 'epoch': 36.02}
- 72%|███████▏  | 4178/5800 [11:43:39<7:18:15, 16.21s/it]score1 tensor([[0.3828],
-        [0.4824],
-        [0.5586],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.4844, 0.5820, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:16,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 20:30:16,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.92 | bwd_microstep: 4584.52 | bwd_inner_microstep: 4579.04 | bwd_allreduce_microstep: 5.38 | step_microstep: 42.63
-[2025-01-25 20:30:16,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.89 | bwd: 4584.54 | bwd_inner: 4579.04 | bwd_allreduce: 5.43 | step: 42.63
- 72%|███████▏  | 4179/5800 [11:43:46<6:02:00, 13.40s/it]                                                        {'loss': 0.0098, 'grad_norm': 4.227005481719971, 'learning_rate': 7.64906104655198e-06, 'epoch': 36.03}
- 72%|███████▏  | 4179/5800 [11:43:46<6:02:00, 13.40s/it]score1 tensor([[0.4043],
-        [0.5234],
-        [0.5508],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5352, 0.5664, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:23,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 20:30:23,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.19 | bwd_microstep: 4605.40 | bwd_inner_microstep: 4599.40 | bwd_allreduce_microstep: 5.87 | step_microstep: 52.41
-[2025-01-25 20:30:23,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.16 | bwd: 4605.43 | bwd_inner: 4599.40 | bwd_allreduce: 5.95 | step: 52.42
- 72%|███████▏  | 4180/5800 [11:43:53<5:08:54, 11.44s/it]                                                        {'loss': 0.0127, 'grad_norm': 4.117626190185547, 'learning_rate': 7.640278864896386e-06, 'epoch': 36.03}
- 72%|███████▏  | 4180/5800 [11:43:53<5:08:54, 11.44s/it]score1 tensor([[0.5000],
-        [0.6211],
-        [0.6367],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6094, 0.6445, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:30,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 20:30:30,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.78 | bwd_microstep: 4599.29 | bwd_inner_microstep: 4594.13 | bwd_allreduce_microstep: 5.03 | step_microstep: 45.27
-[2025-01-25 20:30:30,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.75 | bwd: 4599.31 | bwd_inner: 4594.14 | bwd_allreduce: 5.11 | step: 45.28
- 72%|███████▏  | 4181/5800 [11:44:00<4:31:39, 10.07s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.9707398414611816, 'learning_rate': 7.631500537216296e-06, 'epoch': 36.04}
- 72%|███████▏  | 4181/5800 [11:44:00<4:31:39, 10.07s/it]score1 tensor([[0.6562],
-        [0.5195],
-        [0.4805],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.5195, 0.4785, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:36,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.94 | optimizer_step: 4.37
-[2025-01-25 20:30:36,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.94 | bwd_microstep: 4550.28 | bwd_inner_microstep: 4545.02 | bwd_allreduce_microstep: 5.17 | step_microstep: 49.79
-[2025-01-25 20:30:36,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.91 | bwd: 4550.30 | bwd_inner: 4545.02 | bwd_allreduce: 5.22 | step: 49.80
- 72%|███████▏  | 4182/5800 [11:44:06<4:05:14,  9.09s/it]                                                        {'loss': 0.0034, 'grad_norm': 2.2170238494873047, 'learning_rate': 7.62272606624896e-06, 'epoch': 36.05}
- 72%|███████▏  | 4182/5800 [11:44:06<4:05:14,  9.09s/it]score1 tensor([[0.4414],
-        [0.5312],
-        [0.6406],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5312, 0.6367, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:43,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 20:30:43,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.33 | bwd_microstep: 4562.21 | bwd_inner_microstep: 4556.62 | bwd_allreduce_microstep: 5.46 | step_microstep: 48.44
-[2025-01-25 20:30:43,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.30 | bwd: 4562.23 | bwd_inner: 4556.62 | bwd_allreduce: 5.55 | step: 48.45
- 72%|███████▏  | 4183/5800 [11:44:13<3:46:46,  8.41s/it]                                                        {'loss': 0.0034, 'grad_norm': 1.9585129022598267, 'learning_rate': 7.61395545473039e-06, 'epoch': 36.06}
- 72%|███████▏  | 4183/5800 [11:44:13<3:46:46,  8.41s/it]score1 tensor([[0.4629],
-        [0.6289],
-        [0.6641],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6289, 0.6719, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:50,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 20:30:50,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.87 | bwd_microstep: 4550.28 | bwd_inner_microstep: 4545.32 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.79
-[2025-01-25 20:30:50,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.84 | bwd: 4550.31 | bwd_inner: 4545.32 | bwd_allreduce: 4.92 | step: 43.79
- 72%|███████▏  | 4184/5800 [11:44:20<3:33:47,  7.94s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.3888306617736816, 'learning_rate': 7.605188705395434e-06, 'epoch': 36.07}
- 72%|███████▏  | 4184/5800 [11:44:20<3:33:47,  7.94s/it]score1 tensor([[0.5742],
-        [0.5391],
-        [0.3438],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5430, 0.2812, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:30:57,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:30:57,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.57 | bwd_microstep: 4560.05 | bwd_inner_microstep: 4555.24 | bwd_allreduce_microstep: 4.72 | step_microstep: 50.65
-[2025-01-25 20:30:57,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.54 | bwd: 4560.08 | bwd_inner: 4555.24 | bwd_allreduce: 4.76 | step: 50.66
- 72%|███████▏  | 4185/5800 [11:44:27<3:24:48,  7.61s/it]                                                        {'loss': 0.0254, 'grad_norm': 1.7841498851776123, 'learning_rate': 7.596425820977702e-06, 'epoch': 36.08}
- 72%|███████▏  | 4185/5800 [11:44:27<3:24:48,  7.61s/it]score1 tensor([[0.3750],
-        [0.5195],
-        [0.5391],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.5273, 0.5547, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0181, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:04,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 20:31:04,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.88 | bwd_microstep: 4562.74 | bwd_inner_microstep: 4557.78 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.87
-[2025-01-25 20:31:04,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.84 | bwd: 4562.77 | bwd_inner: 4557.78 | bwd_allreduce: 4.92 | step: 44.88
- 72%|███████▏  | 4186/5800 [11:44:34<3:18:24,  7.38s/it]                                                        {'loss': 0.0181, 'grad_norm': 2.559882164001465, 'learning_rate': 7.587666804209628e-06, 'epoch': 36.09}
- 72%|███████▏  | 4186/5800 [11:44:34<3:18:24,  7.38s/it]score1 tensor([[0.5469],
-        [0.5781],
-        [0.4258],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5898, 0.4277, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:11,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 20:31:11,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.18 | bwd_microstep: 4616.05 | bwd_inner_microstep: 4610.87 | bwd_allreduce_microstep: 5.08 | step_microstep: 42.61
-[2025-01-25 20:31:11,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.15 | bwd: 4616.07 | bwd_inner: 4610.87 | bwd_allreduce: 5.13 | step: 42.61
- 72%|███████▏  | 4187/5800 [11:44:41<3:14:21,  7.23s/it]                                                        {'loss': 0.0083, 'grad_norm': 3.783374071121216, 'learning_rate': 7.578911657822415e-06, 'epoch': 36.09}
- 72%|███████▏  | 4187/5800 [11:44:41<3:14:21,  7.23s/it]score1 tensor([[0.5156],
-        [0.4121],
-        [0.4707],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4180, 0.4941, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:17,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 20:31:17,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.07 | bwd_microstep: 4563.15 | bwd_inner_microstep: 4557.70 | bwd_allreduce_microstep: 5.34 | step_microstep: 56.58
-[2025-01-25 20:31:17,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.04 | bwd: 4563.18 | bwd_inner: 4557.70 | bwd_allreduce: 5.40 | step: 56.57
- 72%|███████▏  | 4188/5800 [11:44:47<3:11:09,  7.12s/it]                                                        {'loss': 0.0083, 'grad_norm': 1.6780253648757935, 'learning_rate': 7.5701603845460654e-06, 'epoch': 36.1}
- 72%|███████▏  | 4188/5800 [11:44:47<3:11:09,  7.12s/it]score1 tensor([[0.4824],
-        [0.4688],
-        [0.4746],
-        [0.3477]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4766, 0.4941, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:24,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 20:31:24,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.19 | bwd_microstep: 4619.56 | bwd_inner_microstep: 4614.45 | bwd_allreduce_microstep: 4.96 | step_microstep: 44.77
-[2025-01-25 20:31:24,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.16 | bwd: 4619.59 | bwd_inner: 4614.45 | bwd_allreduce: 5.05 | step: 44.78
- 72%|███████▏  | 4189/5800 [11:44:54<3:09:22,  7.05s/it]                                                        {'loss': 0.0127, 'grad_norm': 3.604856252670288, 'learning_rate': 7.561412987109391e-06, 'epoch': 36.11}
- 72%|███████▏  | 4189/5800 [11:44:54<3:09:22,  7.05s/it]score1 tensor([[0.4746],
-        [0.3711],
-        [0.4531],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.3750, 0.4551, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:31,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 20:31:31,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.07 | bwd_microstep: 4579.78 | bwd_inner_microstep: 4574.44 | bwd_allreduce_microstep: 5.23 | step_microstep: 48.72
-[2025-01-25 20:31:31,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.04 | bwd: 4579.80 | bwd_inner: 4574.44 | bwd_allreduce: 5.29 | step: 48.72
- 72%|███████▏  | 4190/5800 [11:45:01<3:07:44,  7.00s/it]                                                        {'loss': 0.0029, 'grad_norm': 1.6618191003799438, 'learning_rate': 7.552669468239971e-06, 'epoch': 36.12}
- 72%|███████▏  | 4190/5800 [11:45:01<3:07:44,  7.00s/it]score1 tensor([[0.6133],
-        [0.4902],
-        [0.4141],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4805, 0.4180, 0.3887], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:38,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 20:31:38,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.03 | bwd_microstep: 4569.17 | bwd_inner_microstep: 4564.08 | bwd_allreduce_microstep: 5.01 | step_microstep: 45.68
-[2025-01-25 20:31:38,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.99 | bwd: 4569.20 | bwd_inner: 4564.08 | bwd_allreduce: 5.05 | step: 45.68
- 72%|███████▏  | 4191/5800 [11:45:08<3:06:24,  6.95s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.6382328271865845, 'learning_rate': 7.543929830664194e-06, 'epoch': 36.13}
- 72%|███████▏  | 4191/5800 [11:45:08<3:06:24,  6.95s/it]score1 tensor([[0.6953],
-        [0.4434],
-        [0.6406],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.4473, 0.6406, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:45,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 20:31:45,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.44 | bwd_microstep: 4580.47 | bwd_inner_microstep: 4575.23 | bwd_allreduce_microstep: 5.15 | step_microstep: 48.36
-[2025-01-25 20:31:45,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.41 | bwd: 4580.49 | bwd_inner: 4575.23 | bwd_allreduce: 5.19 | step: 48.37
- 72%|███████▏  | 4192/5800 [11:45:15<3:05:34,  6.92s/it]                                                        {'loss': 0.0049, 'grad_norm': 2.9901506900787354, 'learning_rate': 7.535194077107228e-06, 'epoch': 36.14}
- 72%|███████▏  | 4192/5800 [11:45:15<3:05:34,  6.92s/it]score1 tensor([[0.4766],
-        [0.6211],
-        [0.4863],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.6055, 0.4746, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:52,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 20:31:52,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4574.33 | bwd_inner_microstep: 4569.45 | bwd_allreduce_microstep: 4.80 | step_microstep: 43.88
-[2025-01-25 20:31:52,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.70 | bwd: 4574.35 | bwd_inner: 4569.44 | bwd_allreduce: 4.85 | step: 43.89
- 72%|███████▏  | 4193/5800 [11:45:22<3:04:55,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.312450885772705, 'learning_rate': 7.526462210293027e-06, 'epoch': 36.15}
- 72%|███████▏  | 4193/5800 [11:45:22<3:04:55,  6.90s/it]score1 tensor([[0.4961],
-        [0.4531],
-        [0.5078],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4551, 0.4922, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:31:59,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:31:59,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.71 | bwd_microstep: 4629.28 | bwd_inner_microstep: 4624.42 | bwd_allreduce_microstep: 4.78 | step_microstep: 44.39
-[2025-01-25 20:31:59,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.67 | bwd: 4629.31 | bwd_inner: 4624.42 | bwd_allreduce: 4.82 | step: 44.40
- 72%|███████▏  | 4194/5800 [11:45:29<3:04:48,  6.90s/it]                                                        {'loss': 0.0122, 'grad_norm': 4.542669296264648, 'learning_rate': 7.517734232944349e-06, 'epoch': 36.16}
- 72%|███████▏  | 4194/5800 [11:45:29<3:04:48,  6.90s/it]score1 tensor([[0.5391],
-        [0.4902],
-        [0.3984],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4668, 0.4043, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:06,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 20:32:06,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4610.78 | bwd_inner_microstep: 4605.62 | bwd_allreduce_microstep: 5.05 | step_microstep: 43.40
-[2025-01-25 20:32:06,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.23 | bwd: 4610.81 | bwd_inner: 4605.62 | bwd_allreduce: 5.11 | step: 43.41
- 72%|███████▏  | 4195/5800 [11:45:36<3:04:34,  6.90s/it]                                                        {'loss': 0.0142, 'grad_norm': 4.54415225982666, 'learning_rate': 7.509010147782718e-06, 'epoch': 36.16}
- 72%|███████▏  | 4195/5800 [11:45:36<3:04:34,  6.90s/it]score1 tensor([[0.6055],
-        [0.5000],
-        [0.5234],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4863, 0.5000, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:12,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.02 | optimizer_step: 4.36
-[2025-01-25 20:32:12,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.58 | bwd_microstep: 4569.68 | bwd_inner_microstep: 4564.50 | bwd_allreduce_microstep: 5.08 | step_microstep: 44.75
-[2025-01-25 20:32:12,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.55 | bwd: 4569.71 | bwd_inner: 4564.50 | bwd_allreduce: 5.13 | step: 44.75
- 72%|███████▏  | 4196/5800 [11:45:42<3:04:05,  6.89s/it]                                                        {'loss': 0.0137, 'grad_norm': 6.0903096199035645, 'learning_rate': 7.500289957528466e-06, 'epoch': 36.17}
- 72%|███████▏  | 4196/5800 [11:45:42<3:04:05,  6.89s/it]score1 tensor([[0.3711],
-        [0.5508],
-        [0.4414],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5469, 0.4453, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:19,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 20:32:19,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.60 | bwd_microstep: 4626.05 | bwd_inner_microstep: 4621.05 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.39
-[2025-01-25 20:32:19,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.54 | bwd: 4626.08 | bwd_inner: 4621.05 | bwd_allreduce: 4.93 | step: 43.39
- 72%|███████▏  | 4197/5800 [11:45:49<3:04:16,  6.90s/it]                                                        {'loss': 0.0083, 'grad_norm': 4.331280708312988, 'learning_rate': 7.491573664900686e-06, 'epoch': 36.18}
- 72%|███████▏  | 4197/5800 [11:45:49<3:04:16,  6.90s/it]score1 tensor([[0.5742],
-        [0.6016],
-        [0.6523],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6016, 0.6445, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:26,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 20:32:26,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.89 | bwd_microstep: 4573.84 | bwd_inner_microstep: 4566.87 | bwd_allreduce_microstep: 6.87 | step_microstep: 46.21
-[2025-01-25 20:32:26,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.86 | bwd: 4573.87 | bwd_inner: 4566.87 | bwd_allreduce: 6.92 | step: 46.21
- 72%|███████▏  | 4198/5800 [11:45:56<3:03:46,  6.88s/it]                                                        {'loss': 0.0049, 'grad_norm': 6.695840835571289, 'learning_rate': 7.482861272617288e-06, 'epoch': 36.19}
- 72%|███████▏  | 4198/5800 [11:45:56<3:03:46,  6.88s/it]score1 tensor([[0.4199],
-        [0.4004],
-        [0.3379],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.4004, 0.3555, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:33,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 20:32:33,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4539.88 | bwd_inner_microstep: 4535.13 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.17
-[2025-01-25 20:32:33,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.68 | bwd: 4539.91 | bwd_inner: 4535.13 | bwd_allreduce: 4.72 | step: 43.18
- 72%|███████▏  | 4199/5800 [11:46:03<3:03:08,  6.86s/it]                                                        {'loss': 0.0083, 'grad_norm': 3.848975896835327, 'learning_rate': 7.474152783394926e-06, 'epoch': 36.2}
- 72%|███████▏  | 4199/5800 [11:46:03<3:03:08,  6.86s/it]score1 tensor([[0.5117],
-        [0.4727],
-        [0.5547],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4648, 0.5625, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:40,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 20:32:40,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.03 | bwd_microstep: 4611.27 | bwd_inner_microstep: 4606.24 | bwd_allreduce_microstep: 4.93 | step_microstep: 43.32
-[2025-01-25 20:32:40,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.98 | bwd: 4611.30 | bwd_inner: 4606.24 | bwd_allreduce: 4.98 | step: 43.34
- 72%|███████▏  | 4200/5800 [11:46:10<3:03:15,  6.87s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.9061262607574463, 'learning_rate': 7.465448199949077e-06, 'epoch': 36.21}
- 72%|███████▏  | 4200/5800 [11:46:10<3:03:15,  6.87s/it]score1 tensor([[0.5117],
-        [0.5938],
-        [0.5469],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5820, 0.5664, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0151, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:47,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 20:32:47,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.43 | bwd_microstep: 4619.06 | bwd_inner_microstep: 4613.90 | bwd_allreduce_microstep: 5.07 | step_microstep: 45.11
-[2025-01-25 20:32:47,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.40 | bwd: 4619.09 | bwd_inner: 4613.90 | bwd_allreduce: 5.12 | step: 45.11
- 72%|███████▏  | 4201/5800 [11:46:17<3:03:19,  6.88s/it]                                                        {'loss': 0.0151, 'grad_norm': 0.46012577414512634, 'learning_rate': 7.456747524993966e-06, 'epoch': 36.22}
- 72%|███████▏  | 4201/5800 [11:46:17<3:03:19,  6.88s/it]score1 tensor([[0.6133],
-        [0.5938],
-        [0.5586],
-        [0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6016, 0.5664, 0.3906], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:32:54,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 20:32:54,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.81 | bwd_microstep: 4573.61 | bwd_inner_microstep: 4568.65 | bwd_allreduce_microstep: 4.88 | step_microstep: 44.66
-[2025-01-25 20:32:54,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.78 | bwd: 4573.64 | bwd_inner: 4568.65 | bwd_allreduce: 4.92 | step: 44.66
- 72%|███████▏  | 4202/5800 [11:46:24<3:03:00,  6.87s/it]                                                        {'loss': 0.0049, 'grad_norm': 6.648177146911621, 'learning_rate': 7.448050761242627e-06, 'epoch': 36.22}
- 72%|███████▏  | 4202/5800 [11:46:24<3:03:00,  6.87s/it]score1 tensor([[0.5273],
-        [0.6719],
-        [0.4453],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6797, 0.4473, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:00,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 20:33:00,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.49 | bwd_microstep: 4571.26 | bwd_inner_microstep: 4566.38 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.99
-[2025-01-25 20:33:00,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.45 | bwd: 4571.29 | bwd_inner: 4566.38 | bwd_allreduce: 4.84 | step: 43.00
- 72%|███████▏  | 4203/5800 [11:46:30<3:02:41,  6.86s/it]                                                        {'loss': 0.0034, 'grad_norm': 6.350813388824463, 'learning_rate': 7.43935791140685e-06, 'epoch': 36.23}
- 72%|███████▏  | 4203/5800 [11:46:30<3:02:41,  6.86s/it]score1 tensor([[0.5625],
-        [0.4258],
-        [0.6055],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4297, 0.6094, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:07,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:33:07,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.02 | bwd_microstep: 4582.74 | bwd_inner_microstep: 4577.82 | bwd_allreduce_microstep: 4.80 | step_microstep: 42.45
-[2025-01-25 20:33:07,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.99 | bwd: 4582.76 | bwd_inner: 4577.82 | bwd_allreduce: 4.86 | step: 42.46
- 72%|███████▏  | 4204/5800 [11:46:37<3:02:31,  6.86s/it]                                                        {'loss': 0.0039, 'grad_norm': 6.349061012268066, 'learning_rate': 7.430668978197222e-06, 'epoch': 36.24}
- 72%|███████▏  | 4204/5800 [11:46:37<3:02:31,  6.86s/it]score1 tensor([[0.5820],
-        [0.4746],
-        [0.6445],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4688, 0.6602, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:14,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 20:33:14,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.11 | bwd_microstep: 4614.79 | bwd_inner_microstep: 4609.85 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.05
-[2025-01-25 20:33:14,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.06 | bwd: 4614.81 | bwd_inner: 4609.85 | bwd_allreduce: 4.89 | step: 44.05
- 72%|███████▎  | 4205/5800 [11:46:44<3:02:37,  6.87s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.4898735284805298, 'learning_rate': 7.421983964323109e-06, 'epoch': 36.25}
- 72%|███████▎  | 4205/5800 [11:46:44<3:02:37,  6.87s/it]score1 tensor([[0.6016],
-        [0.4902],
-        [0.3691],
-        [0.1807]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4941, 0.3652, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:21,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 20:33:21,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.55 | bwd_microstep: 4613.45 | bwd_inner_microstep: 4608.41 | bwd_allreduce_microstep: 4.95 | step_microstep: 50.14
-[2025-01-25 20:33:21,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.52 | bwd: 4613.47 | bwd_inner: 4608.41 | bwd_allreduce: 4.99 | step: 50.14
- 73%|███████▎  | 4206/5800 [11:46:51<3:02:46,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.8966038227081299, 'learning_rate': 7.413302872492645e-06, 'epoch': 36.26}
- 73%|███████▎  | 4206/5800 [11:46:51<3:02:46,  6.88s/it]score1 tensor([[0.4688],
-        [0.6797],
-        [0.5391],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6797, 0.5508, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:28,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 20:33:28,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.06 | bwd_microstep: 4569.67 | bwd_inner_microstep: 4564.57 | bwd_allreduce_microstep: 5.00 | step_microstep: 53.97
-[2025-01-25 20:33:28,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.03 | bwd: 4569.70 | bwd_inner: 4564.58 | bwd_allreduce: 5.05 | step: 53.98
- 73%|███████▎  | 4207/5800 [11:46:58<3:02:32,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 6.31688928604126, 'learning_rate': 7.404625705412736e-06, 'epoch': 36.27}
- 73%|███████▎  | 4207/5800 [11:46:58<3:02:32,  6.88s/it]score1 tensor([[0.5117],
-        [0.5312],
-        [0.5195],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5508, 0.5273, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:35,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 20:33:35,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4579.53 | bwd_inner_microstep: 4574.66 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.69
-[2025-01-25 20:33:35,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.11 | bwd: 4579.55 | bwd_inner: 4574.66 | bwd_allreduce: 4.82 | step: 43.70
- 73%|███████▎  | 4208/5800 [11:47:05<3:02:16,  6.87s/it]                                                        {'loss': 0.0078, 'grad_norm': 6.154433727264404, 'learning_rate': 7.395952465789089e-06, 'epoch': 36.28}
- 73%|███████▎  | 4208/5800 [11:47:05<3:02:16,  6.87s/it]score1 tensor([[0.5195],
-        [0.5078],
-        [0.5781],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5078, 0.5703, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:42,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 20:33:42,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.76 | bwd_microstep: 4565.19 | bwd_inner_microstep: 4560.18 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.40
-[2025-01-25 20:33:42,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.72 | bwd: 4565.22 | bwd_inner: 4560.18 | bwd_allreduce: 4.97 | step: 43.41
- 73%|███████▎  | 4209/5800 [11:47:12<3:01:56,  6.86s/it]                                                        {'loss': 0.0034, 'grad_norm': 4.130805492401123, 'learning_rate': 7.3872831563261524e-06, 'epoch': 36.28}
- 73%|███████▎  | 4209/5800 [11:47:12<3:01:56,  6.86s/it]score1 tensor([[0.4141],
-        [0.5039],
-        [0.5078],
-        [0.7109]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5039, 0.5078, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:49,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 20:33:49,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.25 | bwd_microstep: 4532.72 | bwd_inner_microstep: 4527.61 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.57
-[2025-01-25 20:33:49,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.21 | bwd: 4532.75 | bwd_inner: 4527.61 | bwd_allreduce: 5.06 | step: 43.58
- 73%|███████▎  | 4210/5800 [11:47:18<3:01:26,  6.85s/it]                                                        {'loss': 0.0029, 'grad_norm': 0.9792575836181641, 'learning_rate': 7.378617779727188e-06, 'epoch': 36.29}
- 73%|███████▎  | 4210/5800 [11:47:18<3:01:26,  6.85s/it]score1 tensor([[0.4414],
-        [0.4199],
-        [0.5352],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4141, 0.5391, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:33:55,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 20:33:55,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.91 | bwd_microstep: 4623.85 | bwd_inner_microstep: 4618.88 | bwd_allreduce_microstep: 4.87 | step_microstep: 44.70
-[2025-01-25 20:33:55,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.88 | bwd: 4623.89 | bwd_inner: 4618.88 | bwd_allreduce: 4.92 | step: 44.71
- 73%|███████▎  | 4211/5800 [11:47:25<3:01:44,  6.86s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.7136824131011963, 'learning_rate': 7.369956338694197e-06, 'epoch': 36.3}
- 73%|███████▎  | 4211/5800 [11:47:25<3:01:44,  6.86s/it]score1 tensor([[0.3066],
-        [0.4941],
-        [0.5508],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.4961, 0.5469, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:02,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 20:34:02,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.61 | bwd_microstep: 4621.54 | bwd_inner_microstep: 4616.45 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.73
-[2025-01-25 20:34:02,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.58 | bwd: 4621.57 | bwd_inner: 4616.45 | bwd_allreduce: 5.04 | step: 43.74
- 73%|███████▎  | 4212/5800 [11:47:32<3:01:53,  6.87s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.5889080166816711, 'learning_rate': 7.3612988359279655e-06, 'epoch': 36.31}
- 73%|███████▎  | 4212/5800 [11:47:32<3:01:53,  6.87s/it]score1 tensor([[0.5977],
-        [0.5312],
-        [0.3691],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5273, 0.3613, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:09,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 20:34:09,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.64 | bwd_microstep: 4621.84 | bwd_inner_microstep: 4617.02 | bwd_allreduce_microstep: 4.73 | step_microstep: 40.09
-[2025-01-25 20:34:09,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.62 | bwd: 4621.87 | bwd_inner: 4617.02 | bwd_allreduce: 4.78 | step: 40.09
- 73%|███████▎  | 4213/5800 [11:47:39<3:01:53,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.5242292881011963, 'learning_rate': 7.352645274128063e-06, 'epoch': 36.32}
- 73%|███████▎  | 4213/5800 [11:47:39<3:01:53,  6.88s/it]score1 tensor([[0.5703],
-        [0.6836],
-        [0.4707],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.6875, 0.4688, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:16,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 20:34:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.50 | bwd_microstep: 4620.91 | bwd_inner_microstep: 4615.53 | bwd_allreduce_microstep: 5.29 | step_microstep: 47.90
-[2025-01-25 20:34:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.47 | bwd: 4620.94 | bwd_inner: 4615.53 | bwd_allreduce: 5.33 | step: 47.88
- 73%|███████▎  | 4214/5800 [11:47:46<3:01:56,  6.88s/it]                                                        {'loss': 0.0054, 'grad_norm': 4.578063488006592, 'learning_rate': 7.3439956559928085e-06, 'epoch': 36.33}
- 73%|███████▎  | 4214/5800 [11:47:46<3:01:56,  6.88s/it]score1 tensor([[0.4727],
-        [0.6719],
-        [0.4570],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.6641, 0.4590, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:23,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 20:34:23,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.40 | bwd_microstep: 4616.27 | bwd_inner_microstep: 4610.80 | bwd_allreduce_microstep: 5.33 | step_microstep: 48.59
-[2025-01-25 20:34:23,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.37 | bwd: 4616.30 | bwd_inner: 4610.80 | bwd_allreduce: 5.40 | step: 48.59
- 73%|███████▎  | 4215/5800 [11:47:53<3:01:57,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.5789585709571838, 'learning_rate': 7.335349984219315e-06, 'epoch': 36.34}
- 73%|███████▎  | 4215/5800 [11:47:53<3:01:57,  6.89s/it]score1 tensor([[0.5391],
-        [0.5000],
-        [0.5352],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5195, 0.5469, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:30,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 20:34:30,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.60 | bwd_microstep: 4615.09 | bwd_inner_microstep: 4610.37 | bwd_allreduce_microstep: 4.64 | step_microstep: 41.96
-[2025-01-25 20:34:30,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.56 | bwd: 4615.12 | bwd_inner: 4610.37 | bwd_allreduce: 4.68 | step: 41.96
- 73%|███████▎  | 4216/5800 [11:48:00<3:01:53,  6.89s/it]                                                        {'loss': 0.0132, 'grad_norm': 3.928318977355957, 'learning_rate': 7.3267082615034415e-06, 'epoch': 36.34}
- 73%|███████▎  | 4216/5800 [11:48:00<3:01:53,  6.89s/it]score1 tensor([[0.4082],
-        [0.5977],
-        [0.5469],
-        [0.3438]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4141, 0.6016, 0.5508, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:37,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 20:34:37,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4621.74 | bwd_inner_microstep: 4616.32 | bwd_allreduce_microstep: 5.27 | step_microstep: 46.92
-[2025-01-25 20:34:37,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4621.76 | bwd_inner: 4616.32 | bwd_allreduce: 5.35 | step: 46.95
- 73%|███████▎  | 4217/5800 [11:48:07<3:01:53,  6.89s/it]                                                        {'loss': 0.0039, 'grad_norm': 7.751802444458008, 'learning_rate': 7.318070490539839e-06, 'epoch': 36.35}
- 73%|███████▎  | 4217/5800 [11:48:07<3:01:53,  6.89s/it]score1 tensor([[0.4473],
-        [0.5742],
-        [0.3750],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.5625, 0.3672, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:44,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:34:44,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.63 | bwd_microstep: 4629.22 | bwd_inner_microstep: 4624.31 | bwd_allreduce_microstep: 4.81 | step_microstep: 45.88
-[2025-01-25 20:34:44,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.59 | bwd: 4629.25 | bwd_inner: 4624.31 | bwd_allreduce: 4.87 | step: 45.88
- 73%|███████▎  | 4218/5800 [11:48:14<3:01:53,  6.90s/it]                                                        {'loss': 0.0122, 'grad_norm': 7.643836975097656, 'learning_rate': 7.309436674021908e-06, 'epoch': 36.36}
- 73%|███████▎  | 4218/5800 [11:48:14<3:01:53,  6.90s/it]score1 tensor([[0.5352],
-        [0.6211],
-        [0.5508],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.6094, 0.5508, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:51,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 20:34:51,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.25 | bwd_microstep: 4574.88 | bwd_inner_microstep: 4570.24 | bwd_allreduce_microstep: 4.56 | step_microstep: 48.57
-[2025-01-25 20:34:51,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.22 | bwd: 4574.90 | bwd_inner: 4570.24 | bwd_allreduce: 4.60 | step: 48.58
- 73%|███████▎  | 4219/5800 [11:48:21<3:01:27,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 6.51568078994751, 'learning_rate': 7.300806814641816e-06, 'epoch': 36.37}
- 73%|███████▎  | 4219/5800 [11:48:21<3:01:27,  6.89s/it]score1 tensor([[0.5547],
-        [0.4180],
-        [0.4766],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4082, 0.4648, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:34:57,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 20:34:57,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.47 | bwd_microstep: 4629.22 | bwd_inner_microstep: 4624.76 | bwd_allreduce_microstep: 4.38 | step_microstep: 38.28
-[2025-01-25 20:34:57,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.44 | bwd: 4629.24 | bwd_inner: 4624.76 | bwd_allreduce: 4.41 | step: 38.29
- 73%|███████▎  | 4220/5800 [11:48:27<3:01:28,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 7.8903961181640625, 'learning_rate': 7.292180915090516e-06, 'epoch': 36.38}
- 73%|███████▎  | 4220/5800 [11:48:27<3:01:28,  6.89s/it]score1 tensor([[0.5352],
-        [0.4707],
-        [0.4082],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4551, 0.4043, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:04,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 20:35:04,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.28 | bwd_microstep: 4583.12 | bwd_inner_microstep: 4577.87 | bwd_allreduce_microstep: 5.16 | step_microstep: 45.69
-[2025-01-25 20:35:04,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.25 | bwd: 4583.15 | bwd_inner: 4577.87 | bwd_allreduce: 5.21 | step: 45.69
- 73%|███████▎  | 4221/5800 [11:48:34<3:01:10,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.688916802406311, 'learning_rate': 7.283558978057699e-06, 'epoch': 36.39}
- 73%|███████▎  | 4221/5800 [11:48:34<3:01:10,  6.88s/it]score1 tensor([[0.4980],
-        [0.5234],
-        [0.6367],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5273, 0.6562, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:11,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 20:35:11,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.26 | bwd_microstep: 4619.76 | bwd_inner_microstep: 4614.86 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.22
-[2025-01-25 20:35:11,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.23 | bwd: 4619.79 | bwd_inner: 4614.86 | bwd_allreduce: 4.87 | step: 45.23
- 73%|███████▎  | 4222/5800 [11:48:41<3:01:07,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.8512906432151794, 'learning_rate': 7.27494100623185e-06, 'epoch': 36.4}
- 73%|███████▎  | 4222/5800 [11:48:41<3:01:07,  6.89s/it]score1 tensor([[0.4590],
-        [0.5156],
-        [0.5664],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5156, 0.5664, 0.3340], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:18,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 20:35:18,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.91 | bwd_microstep: 4543.74 | bwd_inner_microstep: 4538.95 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.24
-[2025-01-25 20:35:18,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.88 | bwd: 4543.76 | bwd_inner: 4538.95 | bwd_allreduce: 4.75 | step: 41.24
- 73%|███████▎  | 4223/5800 [11:48:48<3:00:28,  6.87s/it]                                                        {'loss': 0.0044, 'grad_norm': 0.4270106852054596, 'learning_rate': 7.266327002300193e-06, 'epoch': 36.41}
- 73%|███████▎  | 4223/5800 [11:48:48<3:00:28,  6.87s/it]score1 tensor([[0.5469],
-        [0.3789],
-        [0.3496],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3750, 0.3398, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:25,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 20:35:25,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.77 | bwd_microstep: 4627.00 | bwd_inner_microstep: 4621.88 | bwd_allreduce_microstep: 5.02 | step_microstep: 53.26
-[2025-01-25 20:35:25,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.74 | bwd: 4627.03 | bwd_inner: 4621.88 | bwd_allreduce: 5.07 | step: 53.27
- 73%|███████▎  | 4224/5800 [11:48:55<3:00:43,  6.88s/it]                                                        {'loss': 0.0112, 'grad_norm': 1.1301685571670532, 'learning_rate': 7.257716968948721e-06, 'epoch': 36.41}
- 73%|███████▎  | 4224/5800 [11:48:55<3:00:43,  6.88s/it]score1 tensor([[0.4609],
-        [0.5586],
-        [0.4062],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4668, 0.5469, 0.4062, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:32,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:35:32,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.58 | bwd_microstep: 4580.42 | bwd_inner_microstep: 4575.59 | bwd_allreduce_microstep: 4.72 | step_microstep: 43.32
-[2025-01-25 20:35:32,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4580.45 | bwd_inner: 4575.59 | bwd_allreduce: 4.77 | step: 43.32
- 73%|███████▎  | 4225/5800 [11:49:02<3:00:28,  6.88s/it]                                                        {'loss': 0.0073, 'grad_norm': 2.241309881210327, 'learning_rate': 7.2491109088622e-06, 'epoch': 36.42}
- 73%|███████▎  | 4225/5800 [11:49:02<3:00:28,  6.88s/it]score1 tensor([[0.5000],
-        [0.3652],
-        [0.5508],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.3652, 0.5625, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:39,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 20:35:39,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.56 | bwd_microstep: 4577.15 | bwd_inner_microstep: 4572.22 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.77
-[2025-01-25 20:35:39,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.53 | bwd: 4577.18 | bwd_inner: 4572.22 | bwd_allreduce: 4.89 | step: 45.77
- 73%|███████▎  | 4226/5800 [11:49:09<3:00:14,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.237234115600586, 'learning_rate': 7.24050882472414e-06, 'epoch': 36.43}
- 73%|███████▎  | 4226/5800 [11:49:09<3:00:14,  6.87s/it]score1 tensor([[0.5898],
-        [0.5234],
-        [0.4355],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5977, 0.5312, 0.4512, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:46,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 20:35:46,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.19 | bwd_microstep: 4614.64 | bwd_inner_microstep: 4609.54 | bwd_allreduce_microstep: 5.00 | step_microstep: 43.99
-[2025-01-25 20:35:46,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.16 | bwd: 4614.68 | bwd_inner: 4609.54 | bwd_allreduce: 5.05 | step: 43.99
- 73%|███████▎  | 4227/5800 [11:49:16<3:00:16,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 8.070841789245605, 'learning_rate': 7.23191071921683e-06, 'epoch': 36.44}
- 73%|███████▎  | 4227/5800 [11:49:16<3:00:16,  6.88s/it]score1 tensor([[0.5547],
-        [0.6367],
-        [0.6016],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6523, 0.6094, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:52,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 20:35:52,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.63 | bwd_microstep: 4623.85 | bwd_inner_microstep: 4618.92 | bwd_allreduce_microstep: 4.83 | step_microstep: 45.69
-[2025-01-25 20:35:52,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4623.88 | bwd_inner: 4618.92 | bwd_allreduce: 4.89 | step: 45.69
- 73%|███████▎  | 4228/5800 [11:49:22<3:00:19,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 8.504878997802734, 'learning_rate': 7.223316595021297e-06, 'epoch': 36.45}
- 73%|███████▎  | 4228/5800 [11:49:22<3:00:19,  6.88s/it]score1 tensor([[0.5117],
-        [0.5391],
-        [0.4453],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5508, 0.4551, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:35:59,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 20:35:59,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.90 | bwd_microstep: 4617.17 | bwd_inner_microstep: 4612.07 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.34
-[2025-01-25 20:35:59,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.86 | bwd: 4617.20 | bwd_inner: 4612.07 | bwd_allreduce: 5.05 | step: 44.34
- 73%|███████▎  | 4229/5800 [11:49:29<3:00:15,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 7.895949840545654, 'learning_rate': 7.214726454817351e-06, 'epoch': 36.46}
- 73%|███████▎  | 4229/5800 [11:49:29<3:00:15,  6.88s/it]score1 tensor([[0.4883],
-        [0.3809],
-        [0.4141],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.3926, 0.4141, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:06,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 20:36:06,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.39 | bwd_microstep: 4548.09 | bwd_inner_microstep: 4542.94 | bwd_allreduce_microstep: 5.07 | step_microstep: 45.48
-[2025-01-25 20:36:06,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.36 | bwd: 4548.12 | bwd_inner: 4542.94 | bwd_allreduce: 5.11 | step: 45.48
- 73%|███████▎  | 4230/5800 [11:49:36<2:59:40,  6.87s/it]                                                        {'loss': 0.0044, 'grad_norm': 0.4226534366607666, 'learning_rate': 7.206140301283542e-06, 'epoch': 36.47}
- 73%|███████▎  | 4230/5800 [11:49:36<2:59:40,  6.87s/it]score1 tensor([[0.5078],
-        [0.4805],
-        [0.5117],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4785, 0.5117, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:13,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 20:36:13,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.17 | bwd_microstep: 4544.84 | bwd_inner_microstep: 4539.24 | bwd_allreduce_microstep: 5.50 | step_microstep: 43.98
-[2025-01-25 20:36:13,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.14 | bwd: 4544.87 | bwd_inner: 4539.24 | bwd_allreduce: 5.56 | step: 43.99
- 73%|███████▎  | 4231/5800 [11:49:43<2:59:12,  6.85s/it]                                                        {'loss': 0.0024, 'grad_norm': 3.994960069656372, 'learning_rate': 7.197558137097171e-06, 'epoch': 36.47}
- 73%|███████▎  | 4231/5800 [11:49:43<2:59:12,  6.85s/it]score1 tensor([[0.4922],
-        [0.6094],
-        [0.4512],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6133, 0.4316, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:20,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 20:36:20,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.66 | bwd_microstep: 4614.11 | bwd_inner_microstep: 4609.28 | bwd_allreduce_microstep: 4.73 | step_microstep: 42.88
-[2025-01-25 20:36:20,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.63 | bwd: 4614.14 | bwd_inner: 4609.28 | bwd_allreduce: 4.78 | step: 42.89
- 73%|███████▎  | 4232/5800 [11:49:50<2:59:22,  6.86s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.8204931616783142, 'learning_rate': 7.188979964934322e-06, 'epoch': 36.48}
- 73%|███████▎  | 4232/5800 [11:49:50<2:59:22,  6.86s/it]score1 tensor([[0.6875],
-        [0.5703],
-        [0.4453],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.5547, 0.4297, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:27,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 20:36:27,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.66 | bwd_microstep: 4619.08 | bwd_inner_microstep: 4613.83 | bwd_allreduce_microstep: 5.17 | step_microstep: 50.31
-[2025-01-25 20:36:27,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.61 | bwd: 4619.12 | bwd_inner: 4613.83 | bwd_allreduce: 5.21 | step: 50.32
- 73%|███████▎  | 4233/5800 [11:49:57<2:59:33,  6.87s/it]                                                        {'loss': 0.0098, 'grad_norm': 8.789752960205078, 'learning_rate': 7.180405787469804e-06, 'epoch': 36.49}
- 73%|███████▎  | 4233/5800 [11:49:57<2:59:33,  6.87s/it]score1 tensor([[0.6250],
-        [0.6406],
-        [0.5508],
-        [0.3184]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6328, 0.6289, 0.5508, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:34,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 20:36:34,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.30 | bwd_microstep: 4574.88 | bwd_inner_microstep: 4569.96 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.29
-[2025-01-25 20:36:34,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.26 | bwd: 4574.91 | bwd_inner: 4569.96 | bwd_allreduce: 4.88 | step: 44.30
- 73%|███████▎  | 4234/5800 [11:50:04<2:59:17,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.5744717121124268, 'learning_rate': 7.171835607377206e-06, 'epoch': 36.5}
- 73%|███████▎  | 4234/5800 [11:50:04<2:59:17,  6.87s/it]score1 tensor([[0.5195],
-        [0.5078],
-        [0.6094],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4980, 0.5938, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:41,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 20:36:41,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4628.21 | bwd_inner_microstep: 4623.41 | bwd_allreduce_microstep: 4.72 | step_microstep: 42.08
-[2025-01-25 20:36:41,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.26 | bwd: 4628.24 | bwd_inner: 4623.41 | bwd_allreduce: 4.76 | step: 42.09
- 73%|███████▎  | 4235/5800 [11:50:11<2:59:27,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.452803611755371, 'learning_rate': 7.163269427328849e-06, 'epoch': 36.51}
- 73%|███████▎  | 4235/5800 [11:50:11<2:59:27,  6.88s/it]score1 tensor([[0.4648],
-        [0.5625],
-        [0.5312],
-        [0.3809]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5625, 0.5352, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:47,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 20:36:47,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.16 | bwd_microstep: 4542.89 | bwd_inner_microstep: 4537.60 | bwd_allreduce_microstep: 5.18 | step_microstep: 43.79
-[2025-01-25 20:36:47,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.13 | bwd: 4542.91 | bwd_inner: 4537.60 | bwd_allreduce: 5.23 | step: 43.80
- 73%|███████▎  | 4236/5800 [11:50:17<2:58:52,  6.86s/it]                                                        {'loss': 0.0029, 'grad_norm': 0.33395177125930786, 'learning_rate': 7.15470724999582e-06, 'epoch': 36.52}
- 73%|███████▎  | 4236/5800 [11:50:17<2:58:52,  6.86s/it]score1 tensor([[0.5508],
-        [0.5078],
-        [0.4766],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4902, 0.4785, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:36:54,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 20:36:54,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.34 | bwd_microstep: 4626.34 | bwd_inner_microstep: 4621.27 | bwd_allreduce_microstep: 4.99 | step_microstep: 41.63
-[2025-01-25 20:36:54,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.31 | bwd: 4626.36 | bwd_inner: 4621.27 | bwd_allreduce: 5.03 | step: 41.64
- 73%|███████▎  | 4237/5800 [11:50:24<2:59:02,  6.87s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.4117385447025299, 'learning_rate': 7.146149078047964e-06, 'epoch': 36.53}
- 73%|███████▎  | 4237/5800 [11:50:24<2:59:02,  6.87s/it]score1 tensor([[0.6602],
-        [0.3789],
-        [0.3398],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.3789, 0.3438, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:01,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 20:37:01,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.63 | bwd_microstep: 4576.19 | bwd_inner_microstep: 4571.22 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.80
-[2025-01-25 20:37:01,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4576.21 | bwd_inner: 4571.22 | bwd_allreduce: 4.93 | step: 42.81
- 73%|███████▎  | 4238/5800 [11:50:31<2:58:47,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 1.3196214437484741, 'learning_rate': 7.137594914153852e-06, 'epoch': 36.53}
- 73%|███████▎  | 4238/5800 [11:50:31<2:58:47,  6.87s/it]score1 tensor([[0.4668],
-        [0.5352],
-        [0.5234],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.5039, 0.5117, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:08,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 20:37:08,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.39 | bwd_microstep: 4624.76 | bwd_inner_microstep: 4619.47 | bwd_allreduce_microstep: 5.16 | step_microstep: 44.93
-[2025-01-25 20:37:08,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.35 | bwd: 4624.79 | bwd_inner: 4619.47 | bwd_allreduce: 5.22 | step: 44.94
- 73%|███████▎  | 4239/5800 [11:50:38<2:58:59,  6.88s/it]                                                        {'loss': 0.0127, 'grad_norm': 8.1657133102417, 'learning_rate': 7.129044760980832e-06, 'epoch': 36.54}
- 73%|███████▎  | 4239/5800 [11:50:38<2:58:59,  6.88s/it]score1 tensor([[0.4434],
-        [0.4023],
-        [0.6211],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4160, 0.6094, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:15,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:37:15,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.77 | bwd_microstep: 4624.87 | bwd_inner_microstep: 4619.10 | bwd_allreduce_microstep: 5.61 | step_microstep: 46.88
-[2025-01-25 20:37:15,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.73 | bwd: 4624.90 | bwd_inner: 4619.10 | bwd_allreduce: 5.66 | step: 46.89
- 73%|███████▎  | 4240/5800 [11:50:45<2:59:02,  6.89s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.4034645557403564, 'learning_rate': 7.1204986211949826e-06, 'epoch': 36.55}
- 73%|███████▎  | 4240/5800 [11:50:45<2:59:02,  6.89s/it]score1 tensor([[0.5469],
-        [0.4199],
-        [0.4453],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4277, 0.4258, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:22,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 20:37:22,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.73 | bwd_microstep: 4581.52 | bwd_inner_microstep: 4576.71 | bwd_allreduce_microstep: 4.73 | step_microstep: 44.32
-[2025-01-25 20:37:22,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.69 | bwd: 4581.54 | bwd_inner: 4576.71 | bwd_allreduce: 4.76 | step: 44.33
- 73%|███████▎  | 4241/5800 [11:50:52<2:58:43,  6.88s/it]                                                        {'loss': 0.0098, 'grad_norm': 2.215137004852295, 'learning_rate': 7.111956497461146e-06, 'epoch': 36.56}
- 73%|███████▎  | 4241/5800 [11:50:52<2:58:43,  6.88s/it]score1 tensor([[0.5469],
-        [0.4941],
-        [0.5039],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4844, 0.4961, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:29,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:37:29,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.28 | bwd_microstep: 4585.03 | bwd_inner_microstep: 4579.84 | bwd_allreduce_microstep: 5.09 | step_microstep: 44.65
-[2025-01-25 20:37:29,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.24 | bwd: 4585.05 | bwd_inner: 4579.84 | bwd_allreduce: 5.14 | step: 44.66
- 73%|███████▎  | 4242/5800 [11:50:59<2:58:30,  6.87s/it]                                                        {'loss': 0.0073, 'grad_norm': 6.124460697174072, 'learning_rate': 7.103418392442902e-06, 'epoch': 36.57}
- 73%|███████▎  | 4242/5800 [11:50:59<2:58:30,  6.87s/it]score1 tensor([[0.6367],
-        [0.4473],
-        [0.6406],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4434, 0.6445, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:36,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 20:37:36,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.99 | bwd_microstep: 4585.66 | bwd_inner_microstep: 4580.21 | bwd_allreduce_microstep: 5.35 | step_microstep: 43.91
-[2025-01-25 20:37:36,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.96 | bwd: 4585.69 | bwd_inner: 4580.21 | bwd_allreduce: 5.39 | step: 43.91
- 73%|███████▎  | 4243/5800 [11:51:05<2:58:15,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 2.276664972305298, 'learning_rate': 7.094884308802572e-06, 'epoch': 36.58}
- 73%|███████▎  | 4243/5800 [11:51:05<2:58:15,  6.87s/it]score1 tensor([[0.5430],
-        [0.4570],
-        [0.4570],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4453, 0.4492, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:42,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 20:37:42,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.79 | bwd_microstep: 4621.63 | bwd_inner_microstep: 4616.30 | bwd_allreduce_microstep: 5.21 | step_microstep: 47.29
-[2025-01-25 20:37:42,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.76 | bwd: 4621.65 | bwd_inner: 4616.29 | bwd_allreduce: 5.27 | step: 47.30
- 73%|███████▎  | 4244/5800 [11:51:12<2:58:23,  6.88s/it]                                                        {'loss': 0.0073, 'grad_norm': 7.751935958862305, 'learning_rate': 7.086354249201244e-06, 'epoch': 36.59}
- 73%|███████▎  | 4244/5800 [11:51:12<2:58:23,  6.88s/it]score1 tensor([[0.4902],
-        [0.5742],
-        [0.5664],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.5625, 0.5625, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:49,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 20:37:49,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.02 | bwd_microstep: 4614.93 | bwd_inner_microstep: 4610.16 | bwd_allreduce_microstep: 4.70 | step_microstep: 45.44
-[2025-01-25 20:37:49,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.99 | bwd: 4614.95 | bwd_inner: 4610.16 | bwd_allreduce: 4.73 | step: 45.45
- 73%|███████▎  | 4245/5800 [11:51:19<2:58:27,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 8.207440376281738, 'learning_rate': 7.077828216298726e-06, 'epoch': 36.59}
- 73%|███████▎  | 4245/5800 [11:51:19<2:58:27,  6.89s/it]score1 tensor([[0.6016],
-        [0.4922],
-        [0.5469],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4844, 0.5352, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:37:56,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 20:37:56,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.56 | bwd_microstep: 4617.43 | bwd_inner_microstep: 4612.50 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.99
-[2025-01-25 20:37:56,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.52 | bwd: 4617.46 | bwd_inner: 4612.50 | bwd_allreduce: 4.89 | step: 44.00
- 73%|███████▎  | 4246/5800 [11:51:26<2:58:24,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.3870081305503845, 'learning_rate': 7.069306212753595e-06, 'epoch': 36.6}
- 73%|███████▎  | 4246/5800 [11:51:26<2:58:24,  6.89s/it]score1 tensor([[0.5000],
-        [0.4980],
-        [0.5195],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4980, 0.5234, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:03,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 20:38:03,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.84 | bwd_microstep: 4578.04 | bwd_inner_microstep: 4573.29 | bwd_allreduce_microstep: 4.68 | step_microstep: 45.42
-[2025-01-25 20:38:03,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.80 | bwd: 4578.07 | bwd_inner: 4573.29 | bwd_allreduce: 4.72 | step: 45.42
- 73%|███████▎  | 4247/5800 [11:51:33<2:58:01,  6.88s/it]                                                        {'loss': 0.0098, 'grad_norm': 6.317594051361084, 'learning_rate': 7.060788241223147e-06, 'epoch': 36.61}
- 73%|███████▎  | 4247/5800 [11:51:33<2:58:01,  6.88s/it]score1 tensor([[0.4043],
-        [0.3730],
-        [0.4551],
-        [0.6719]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.3750, 0.4492, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:10,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 20:38:10,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.02 | bwd_microstep: 4612.95 | bwd_inner_microstep: 4607.46 | bwd_allreduce_microstep: 5.40 | step_microstep: 47.02
-[2025-01-25 20:38:10,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.97 | bwd: 4612.97 | bwd_inner: 4607.46 | bwd_allreduce: 5.44 | step: 47.03
- 73%|███████▎  | 4248/5800 [11:51:40<2:58:02,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 1.1578946113586426, 'learning_rate': 7.052274304363449e-06, 'epoch': 36.62}
- 73%|███████▎  | 4248/5800 [11:51:40<2:58:02,  6.88s/it]score1 tensor([[0.5859],
-        [0.6406],
-        [0.5664],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.6367, 0.5664, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:17,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.48 | optimizer_step: 4.37
-[2025-01-25 20:38:17,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.35 | bwd_microstep: 4575.12 | bwd_inner_microstep: 4568.67 | bwd_allreduce_microstep: 6.33 | step_microstep: 50.33
-[2025-01-25 20:38:17,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.32 | bwd: 4575.17 | bwd_inner: 4568.67 | bwd_allreduce: 6.39 | step: 50.33
- 73%|███████▎  | 4249/5800 [11:51:47<2:57:47,  6.88s/it]                                                        {'loss': 0.0049, 'grad_norm': 2.585153579711914, 'learning_rate': 7.043764404829283e-06, 'epoch': 36.63}
- 73%|███████▎  | 4249/5800 [11:51:47<2:57:47,  6.88s/it]score1 tensor([[0.4238],
-        [0.5039],
-        [0.5586],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4961, 0.6055, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0200, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:24,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 20:38:24,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.82 | bwd_microstep: 4616.02 | bwd_inner_microstep: 4610.60 | bwd_allreduce_microstep: 5.33 | step_microstep: 49.91
-[2025-01-25 20:38:24,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.79 | bwd: 4616.04 | bwd_inner: 4610.60 | bwd_allreduce: 5.37 | step: 49.91
- 73%|███████▎  | 4250/5800 [11:51:54<2:57:55,  6.89s/it]                                                        {'loss': 0.02, 'grad_norm': 4.236022472381592, 'learning_rate': 7.0352585452741796e-06, 'epoch': 36.64}
- 73%|███████▎  | 4250/5800 [11:51:54<2:57:55,  6.89s/it]score1 tensor([[0.3438],
-        [0.5469],
-        [0.4238],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.5430, 0.4121, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:31,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.68 | optimizer_step: 4.38
-[2025-01-25 20:38:31,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.06 | bwd_microstep: 4625.68 | bwd_inner_microstep: 4619.48 | bwd_allreduce_microstep: 6.03 | step_microstep: 49.73
-[2025-01-25 20:38:31,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.03 | bwd: 4625.75 | bwd_inner: 4619.48 | bwd_allreduce: 6.12 | step: 49.73
- 73%|███████▎  | 4251/5800 [11:52:01<2:57:59,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.128523349761963, 'learning_rate': 7.026756728350428e-06, 'epoch': 36.65}
- 73%|███████▎  | 4251/5800 [11:52:01<2:57:59,  6.89s/it]score1 tensor([[0.4375],
-        [0.5391],
-        [0.4844],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5664, 0.4863, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:38,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 8.55
-[2025-01-25 20:38:38,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.54 | bwd_microstep: 4618.41 | bwd_inner_microstep: 4613.07 | bwd_allreduce_microstep: 5.23 | step_microstep: 51.68
-[2025-01-25 20:38:38,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.51 | bwd: 4618.44 | bwd_inner: 4613.07 | bwd_allreduce: 5.27 | step: 51.69
- 73%|███████▎  | 4252/5800 [11:52:08<2:58:00,  6.90s/it]                                                        {'loss': 0.0122, 'grad_norm': 7.654620170593262, 'learning_rate': 7.018258956709025e-06, 'epoch': 36.66}
- 73%|███████▎  | 4252/5800 [11:52:08<2:58:00,  6.90s/it]score1 tensor([[0.4355],
-        [0.4551],
-        [0.5508],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4688, 0.5547, 0.6406], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:44,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 20:38:44,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.55 | bwd_microstep: 4624.13 | bwd_inner_microstep: 4619.29 | bwd_allreduce_microstep: 4.75 | step_microstep: 49.02
-[2025-01-25 20:38:44,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.52 | bwd: 4624.15 | bwd_inner: 4619.29 | bwd_allreduce: 4.79 | step: 49.02
- 73%|███████▎  | 4253/5800 [11:52:14<2:57:59,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.5084495544433594, 'learning_rate': 7.0097652329997414e-06, 'epoch': 36.66}
- 73%|███████▎  | 4253/5800 [11:52:14<2:57:59,  6.90s/it]score1 tensor([[0.6289],
-        [0.6328],
-        [0.4355],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.6484, 0.4238, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:51,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 20:38:51,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.78 | bwd_microstep: 4617.93 | bwd_inner_microstep: 4612.63 | bwd_allreduce_microstep: 5.17 | step_microstep: 46.85
-[2025-01-25 20:38:51,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.72 | bwd: 4617.96 | bwd_inner: 4612.63 | bwd_allreduce: 5.23 | step: 46.86
- 73%|███████▎  | 4254/5800 [11:52:21<2:57:52,  6.90s/it]                                                        {'loss': 0.0117, 'grad_norm': 1.4772673845291138, 'learning_rate': 7.001275559871057e-06, 'epoch': 36.67}
- 73%|███████▎  | 4254/5800 [11:52:21<2:57:52,  6.90s/it]score1 tensor([[0.5742],
-        [0.5430],
-        [0.4395],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5469, 0.4473, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:38:58,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:38:58,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.88 | bwd_microstep: 4621.58 | bwd_inner_microstep: 4616.37 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.75
-[2025-01-25 20:38:58,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.83 | bwd: 4621.60 | bwd_inner: 4616.37 | bwd_allreduce: 5.12 | step: 44.76
- 73%|███████▎  | 4255/5800 [11:52:28<2:57:47,  6.90s/it]                                                        {'loss': 0.0107, 'grad_norm': 4.133635520935059, 'learning_rate': 6.992789939970193e-06, 'epoch': 36.68}
- 73%|███████▎  | 4255/5800 [11:52:28<2:57:47,  6.90s/it]score1 tensor([[0.5312],
-        [0.5156],
-        [0.5391],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5195, 0.5469, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:05,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 20:39:05,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.34 | bwd_microstep: 4623.80 | bwd_inner_microstep: 4618.66 | bwd_allreduce_microstep: 5.03 | step_microstep: 45.21
-[2025-01-25 20:39:05,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.31 | bwd: 4623.82 | bwd_inner: 4618.66 | bwd_allreduce: 5.09 | step: 45.21
- 73%|███████▎  | 4256/5800 [11:52:35<2:57:36,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 4.404428958892822, 'learning_rate': 6.984308375943128e-06, 'epoch': 36.69}
- 73%|███████▎  | 4256/5800 [11:52:35<2:57:36,  6.90s/it]score1 tensor([[0.4180],
-        [0.5586],
-        [0.5977],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.5586, 0.6133, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:12,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 20:39:12,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.22 | bwd_microstep: 4566.67 | bwd_inner_microstep: 4561.85 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.43
-[2025-01-25 20:39:12,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.19 | bwd: 4566.69 | bwd_inner: 4561.85 | bwd_allreduce: 4.78 | step: 43.44
- 73%|███████▎  | 4257/5800 [11:52:42<2:57:01,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 2.636570930480957, 'learning_rate': 6.975830870434548e-06, 'epoch': 36.7}
- 73%|███████▎  | 4257/5800 [11:52:42<2:57:01,  6.88s/it]score1 tensor([[0.3496],
-        [0.4668],
-        [0.4102],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3418, 0.4648, 0.4121, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:19,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 20:39:19,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.48 | bwd_microstep: 4615.30 | bwd_inner_microstep: 4610.74 | bwd_allreduce_microstep: 4.46 | step_microstep: 42.91
-[2025-01-25 20:39:19,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.45 | bwd: 4615.33 | bwd_inner: 4610.74 | bwd_allreduce: 4.50 | step: 42.91
- 73%|███████▎  | 4258/5800 [11:52:49<2:57:00,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.8319473266601562, 'learning_rate': 6.967357426087895e-06, 'epoch': 36.71}
- 73%|███████▎  | 4258/5800 [11:52:49<2:57:00,  6.89s/it]score1 tensor([[0.4473],
-        [0.4102],
-        [0.3906],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.4004, 0.3711, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:26,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 20:39:26,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.20 | bwd_microstep: 4623.79 | bwd_inner_microstep: 4618.65 | bwd_allreduce_microstep: 5.05 | step_microstep: 46.38
-[2025-01-25 20:39:26,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.16 | bwd: 4623.82 | bwd_inner: 4618.65 | bwd_allreduce: 5.09 | step: 46.40
- 73%|███████▎  | 4259/5800 [11:52:56<2:57:02,  6.89s/it]                                                        {'loss': 0.0122, 'grad_norm': 7.515286922454834, 'learning_rate': 6.958888045545329e-06, 'epoch': 36.72}
- 73%|███████▎  | 4259/5800 [11:52:56<2:57:02,  6.89s/it]score1 tensor([[0.4609],
-        [0.5000],
-        [0.6992],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4922, 0.6953, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:33,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 20:39:33,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.83 | bwd_microstep: 4619.18 | bwd_inner_microstep: 4614.54 | bwd_allreduce_microstep: 4.55 | step_microstep: 43.88
-[2025-01-25 20:39:33,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.80 | bwd: 4619.20 | bwd_inner: 4614.54 | bwd_allreduce: 4.59 | step: 43.89
- 73%|███████▎  | 4260/5800 [11:53:03<2:56:56,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 8.28699016571045, 'learning_rate': 6.950422731447759e-06, 'epoch': 36.72}
- 73%|███████▎  | 4260/5800 [11:53:03<2:56:56,  6.89s/it]score1 tensor([[0.5742],
-        [0.5430],
-        [0.4707],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5391, 0.4453, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:40,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 20:39:40,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.76 | bwd_microstep: 4619.92 | bwd_inner_microstep: 4614.46 | bwd_allreduce_microstep: 5.33 | step_microstep: 47.63
-[2025-01-25 20:39:40,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.73 | bwd: 4619.95 | bwd_inner: 4614.46 | bwd_allreduce: 5.39 | step: 47.64
- 73%|███████▎  | 4261/5800 [11:53:10<2:56:56,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.5426236987113953, 'learning_rate': 6.941961486434809e-06, 'epoch': 36.73}
- 73%|███████▎  | 4261/5800 [11:53:10<2:56:56,  6.90s/it]score1 tensor([[0.4277],
-        [0.6211],
-        [0.4785],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.6133, 0.4629, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:47,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 20:39:47,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.04 | bwd_microstep: 4614.34 | bwd_inner_microstep: 4609.18 | bwd_allreduce_microstep: 5.06 | step_microstep: 43.91
-[2025-01-25 20:39:47,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.00 | bwd: 4614.37 | bwd_inner: 4609.18 | bwd_allreduce: 5.11 | step: 43.91
- 73%|███████▎  | 4262/5800 [11:53:16<2:56:43,  6.89s/it]                                                        {'loss': 0.0127, 'grad_norm': 8.081253051757812, 'learning_rate': 6.933504313144841e-06, 'epoch': 36.74}
- 73%|███████▎  | 4262/5800 [11:53:16<2:56:43,  6.89s/it]score1 tensor([[0.5078],
-        [0.4375],
-        [0.5156],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4414, 0.5117, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:39:53,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 20:39:53,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4622.89 | bwd_inner_microstep: 4615.24 | bwd_allreduce_microstep: 7.55 | step_microstep: 43.35
-[2025-01-25 20:39:53,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.60 | bwd: 4622.92 | bwd_inner: 4615.24 | bwd_allreduce: 7.60 | step: 43.36
- 74%|███████▎  | 4263/5800 [11:53:23<2:56:40,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.5256568789482117, 'learning_rate': 6.925051214214955e-06, 'epoch': 36.75}
- 74%|███████▎  | 4263/5800 [11:53:23<2:56:40,  6.90s/it]score1 tensor([[0.4668],
-        [0.4980],
-        [0.4902],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4785, 0.4922, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:00,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:40:00,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.80 | bwd_microstep: 4611.70 | bwd_inner_microstep: 4606.40 | bwd_allreduce_microstep: 5.21 | step_microstep: 46.15
-[2025-01-25 20:40:00,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.77 | bwd: 4611.73 | bwd_inner: 4606.40 | bwd_allreduce: 5.25 | step: 46.16
- 74%|███████▎  | 4264/5800 [11:53:30<2:56:29,  6.89s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.7214345932006836, 'learning_rate': 6.916602192280965e-06, 'epoch': 36.76}
- 74%|███████▎  | 4264/5800 [11:53:30<2:56:29,  6.89s/it]score1 tensor([[0.4258],
-        [0.5000],
-        [0.3906],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4961, 0.3750, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:07,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:40:07,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.10 | bwd_microstep: 4618.65 | bwd_inner_microstep: 4613.83 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.12
-[2025-01-25 20:40:07,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.07 | bwd: 4618.68 | bwd_inner: 4613.83 | bwd_allreduce: 4.77 | step: 43.12
- 74%|███████▎  | 4265/5800 [11:53:37<2:56:27,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.40786901116371155, 'learning_rate': 6.908157249977437e-06, 'epoch': 36.77}
- 74%|███████▎  | 4265/5800 [11:53:37<2:56:27,  6.90s/it]score1 tensor([[0.3965],
-        [0.5859],
-        [0.4824],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.5938, 0.4863, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:14,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:40:14,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.66 | bwd_microstep: 4622.09 | bwd_inner_microstep: 4617.41 | bwd_allreduce_microstep: 4.59 | step_microstep: 42.36
-[2025-01-25 20:40:14,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.63 | bwd: 4622.13 | bwd_inner: 4617.41 | bwd_allreduce: 4.63 | step: 42.36
- 74%|███████▎  | 4266/5800 [11:53:44<2:56:20,  6.90s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.7410370111465454, 'learning_rate': 6.899716389937634e-06, 'epoch': 36.78}
- 74%|███████▎  | 4266/5800 [11:53:44<2:56:20,  6.90s/it]score1 tensor([[0.4316],
-        [0.5703],
-        [0.5117],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5781, 0.5156, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:21,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 20:40:21,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.95 | bwd_microstep: 4623.56 | bwd_inner_microstep: 4618.46 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.47
-[2025-01-25 20:40:21,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.91 | bwd: 4623.58 | bwd_inner: 4618.46 | bwd_allreduce: 5.04 | step: 42.48
- 74%|███████▎  | 4267/5800 [11:53:51<2:56:13,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 8.092981338500977, 'learning_rate': 6.891279614793575e-06, 'epoch': 36.78}
- 74%|███████▎  | 4267/5800 [11:53:51<2:56:13,  6.90s/it]score1 tensor([[0.5195],
-        [0.5430],
-        [0.4902],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5508, 0.4863, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:28,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 20:40:28,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.68 | bwd_microstep: 4622.77 | bwd_inner_microstep: 4618.12 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.36
-[2025-01-25 20:40:28,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.65 | bwd: 4622.79 | bwd_inner: 4618.12 | bwd_allreduce: 4.61 | step: 43.36
- 74%|███████▎  | 4268/5800 [11:53:58<2:56:09,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.065124034881592, 'learning_rate': 6.882846927175984e-06, 'epoch': 36.79}
- 74%|███████▎  | 4268/5800 [11:53:58<2:56:09,  6.90s/it]score1 tensor([[0.4277],
-        [0.4238],
-        [0.5469],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4316, 0.4277, 0.5391, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:35,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 20:40:35,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.95 | bwd_microstep: 4616.54 | bwd_inner_microstep: 4611.12 | bwd_allreduce_microstep: 5.34 | step_microstep: 45.03
-[2025-01-25 20:40:35,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.92 | bwd: 4616.56 | bwd_inner: 4611.12 | bwd_allreduce: 5.38 | step: 45.02
- 74%|███████▎  | 4269/5800 [11:54:05<2:55:57,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.6616399884223938, 'learning_rate': 6.87441832971433e-06, 'epoch': 36.8}
- 74%|███████▎  | 4269/5800 [11:54:05<2:55:57,  6.90s/it]score1 tensor([[0.4395],
-        [0.6445],
-        [0.6133],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.6445, 0.6172, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:42,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:40:42,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.41 | bwd_microstep: 4570.33 | bwd_inner_microstep: 4565.18 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.07
-[2025-01-25 20:40:42,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.37 | bwd: 4570.36 | bwd_inner: 4565.18 | bwd_allreduce: 5.10 | step: 43.08
- 74%|███████▎  | 4270/5800 [11:54:12<2:55:29,  6.88s/it]                                                        {'loss': 0.0034, 'grad_norm': 2.6150166988372803, 'learning_rate': 6.865993825036781e-06, 'epoch': 36.81}
- 74%|███████▎  | 4270/5800 [11:54:12<2:55:29,  6.88s/it]score1 tensor([[0.3828],
-        [0.5977],
-        [0.4727],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.6016, 0.4844, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:49,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.99 | optimizer_step: 4.36
-[2025-01-25 20:40:49,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.91 | bwd_microstep: 4616.87 | bwd_inner_microstep: 4611.90 | bwd_allreduce_microstep: 4.88 | step_microstep: 46.39
-[2025-01-25 20:40:49,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.88 | bwd: 4616.90 | bwd_inner: 4611.90 | bwd_allreduce: 4.92 | step: 46.39
- 74%|███████▎  | 4271/5800 [11:54:19<2:55:31,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.8982012271881104, 'learning_rate': 6.8575734157702604e-06, 'epoch': 36.82}
- 74%|███████▎  | 4271/5800 [11:54:19<2:55:31,  6.89s/it]score1 tensor([[0.6172],
-        [0.5859],
-        [0.3672],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5977, 0.3789, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:40:55,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 20:40:55,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.66 | bwd_microstep: 4611.83 | bwd_inner_microstep: 4607.18 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.83
-[2025-01-25 20:40:55,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.63 | bwd: 4611.86 | bwd_inner: 4607.18 | bwd_allreduce: 4.62 | step: 42.83
- 74%|███████▎  | 4272/5800 [11:54:25<2:55:25,  6.89s/it]                                                        {'loss': 0.0107, 'grad_norm': 3.5154967308044434, 'learning_rate': 6.849157104540381e-06, 'epoch': 36.83}
- 74%|███████▎  | 4272/5800 [11:54:25<2:55:25,  6.89s/it]score1 tensor([[0.4609],
-        [0.4238],
-        [0.4082],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4082, 0.4180, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:02,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 20:41:02,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.59 | bwd_microstep: 4617.93 | bwd_inner_microstep: 4613.26 | bwd_allreduce_microstep: 4.60 | step_microstep: 43.46
-[2025-01-25 20:41:02,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.55 | bwd: 4617.96 | bwd_inner: 4613.26 | bwd_allreduce: 4.63 | step: 43.46
- 74%|███████▎  | 4273/5800 [11:54:32<2:55:22,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.3933800756931305, 'learning_rate': 6.840744893971512e-06, 'epoch': 36.84}
- 74%|███████▎  | 4273/5800 [11:54:32<2:55:22,  6.89s/it]score1 tensor([[0.5547],
-        [0.4648],
-        [0.4590],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4648, 0.4609, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:09,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 20:41:09,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.43 | bwd_microstep: 4537.89 | bwd_inner_microstep: 4532.94 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.02
-[2025-01-25 20:41:09,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.40 | bwd: 4537.91 | bwd_inner: 4532.94 | bwd_allreduce: 4.89 | step: 43.02
- 74%|███████▎  | 4274/5800 [11:54:39<2:54:37,  6.87s/it]                                                        {'loss': 0.0029, 'grad_norm': 0.29050329327583313, 'learning_rate': 6.832336786686711e-06, 'epoch': 36.84}
- 74%|███████▎  | 4274/5800 [11:54:39<2:54:37,  6.87s/it]score1 tensor([[0.5781],
-        [0.4824],
-        [0.4551],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.4727, 0.4551, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:16,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 20:41:16,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.18 | bwd_microstep: 4547.74 | bwd_inner_microstep: 4542.90 | bwd_allreduce_microstep: 4.75 | step_microstep: 44.19
-[2025-01-25 20:41:16,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4547.77 | bwd_inner: 4542.90 | bwd_allreduce: 4.79 | step: 44.20
- 74%|███████▎  | 4275/5800 [11:54:46<2:54:14,  6.86s/it]                                                        {'loss': 0.0044, 'grad_norm': 4.158930778503418, 'learning_rate': 6.823932785307785e-06, 'epoch': 36.85}
- 74%|███████▎  | 4275/5800 [11:54:46<2:54:14,  6.86s/it]score1 tensor([[0.5000],
-        [0.5312],
-        [0.4629],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.5195, 0.4531, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:23,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 20:41:23,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.02 | bwd_microstep: 4622.58 | bwd_inner_microstep: 4617.78 | bwd_allreduce_microstep: 4.70 | step_microstep: 47.11
-[2025-01-25 20:41:23,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.99 | bwd: 4622.60 | bwd_inner: 4617.78 | bwd_allreduce: 4.76 | step: 47.12
- 74%|███████▎  | 4276/5800 [11:54:53<2:54:31,  6.87s/it]                                                        {'loss': 0.0107, 'grad_norm': 8.012539863586426, 'learning_rate': 6.815532892455234e-06, 'epoch': 36.86}
- 74%|███████▎  | 4276/5800 [11:54:53<2:54:31,  6.87s/it]score1 tensor([[0.7109],
-        [0.6641],
-        [0.4473],
-        [0.6992]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.6484, 0.4590, 0.6875], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:30,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 20:41:30,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.84 | bwd_microstep: 4618.38 | bwd_inner_microstep: 4613.47 | bwd_allreduce_microstep: 4.81 | step_microstep: 42.46
-[2025-01-25 20:41:30,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.80 | bwd: 4618.40 | bwd_inner: 4613.47 | bwd_allreduce: 4.86 | step: 42.47
- 74%|███████▎  | 4277/5800 [11:55:00<2:54:36,  6.88s/it]                                                        {'loss': 0.0107, 'grad_norm': 5.648890018463135, 'learning_rate': 6.807137110748305e-06, 'epoch': 36.87}
- 74%|███████▎  | 4277/5800 [11:55:00<2:54:36,  6.88s/it]score1 tensor([[0.4121],
-        [0.5430],
-        [0.5938],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5352, 0.5898, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:37,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.43 | optimizer_step: 4.36
-[2025-01-25 20:41:37,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.92 | bwd_microstep: 4620.40 | bwd_inner_microstep: 4615.30 | bwd_allreduce_microstep: 4.99 | step_microstep: 47.19
-[2025-01-25 20:41:37,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.89 | bwd: 4620.42 | bwd_inner: 4615.30 | bwd_allreduce: 5.05 | step: 47.20
- 74%|███████▍  | 4278/5800 [11:55:07<2:54:39,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 8.08924674987793, 'learning_rate': 6.798745442804935e-06, 'epoch': 36.88}
- 74%|███████▍  | 4278/5800 [11:55:07<2:54:39,  6.89s/it]score1 tensor([[0.5859],
-        [0.4844],
-        [0.4570],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5039, 0.4492, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:44,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 20:41:44,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.86 | bwd_microstep: 4640.83 | bwd_inner_microstep: 4635.89 | bwd_allreduce_microstep: 4.85 | step_microstep: 48.80
-[2025-01-25 20:41:44,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.83 | bwd: 4640.86 | bwd_inner: 4635.89 | bwd_allreduce: 4.90 | step: 48.81
- 74%|███████▍  | 4279/5800 [11:55:14<2:54:53,  6.90s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.514791011810303, 'learning_rate': 6.790357891241803e-06, 'epoch': 36.89}
- 74%|███████▍  | 4279/5800 [11:55:14<2:54:53,  6.90s/it]score1 tensor([[0.6211],
-        [0.4844],
-        [0.4707],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4883, 0.4688, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:51,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 20:41:51,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.57 | bwd_microstep: 4643.21 | bwd_inner_microstep: 4638.13 | bwd_allreduce_microstep: 5.01 | step_microstep: 42.79
-[2025-01-25 20:41:51,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.53 | bwd: 4643.24 | bwd_inner: 4638.13 | bwd_allreduce: 5.04 | step: 42.80
- 74%|███████▍  | 4280/5800 [11:55:21<2:54:59,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.7079099416732788, 'learning_rate': 6.781974458674287e-06, 'epoch': 36.9}
- 74%|███████▍  | 4280/5800 [11:55:21<2:54:59,  6.91s/it]score1 tensor([[0.4844],
-        [0.6484],
-        [0.4102],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6562, 0.4004, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:41:57,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 20:41:57,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.13 | bwd_microstep: 4582.79 | bwd_inner_microstep: 4578.01 | bwd_allreduce_microstep: 4.69 | step_microstep: 40.93
-[2025-01-25 20:41:57,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.11 | bwd: 4582.81 | bwd_inner: 4578.01 | bwd_allreduce: 4.73 | step: 40.93
- 74%|███████▍  | 4281/5800 [11:55:27<2:54:34,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 2.688140630722046, 'learning_rate': 6.773595147716483e-06, 'epoch': 36.91}
- 74%|███████▍  | 4281/5800 [11:55:27<2:54:34,  6.90s/it]score1 tensor([[0.4199],
-        [0.4609],
-        [0.5508],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4609, 0.5430, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:04,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 20:42:04,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.82 | bwd_microstep: 4580.56 | bwd_inner_microstep: 4575.43 | bwd_allreduce_microstep: 5.02 | step_microstep: 44.93
-[2025-01-25 20:42:04,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.78 | bwd: 4580.59 | bwd_inner: 4575.43 | bwd_allreduce: 5.08 | step: 44.94
- 74%|███████▍  | 4282/5800 [11:55:34<2:54:12,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.5695242881774902, 'learning_rate': 6.765219960981215e-06, 'epoch': 36.91}
- 74%|███████▍  | 4282/5800 [11:55:34<2:54:12,  6.89s/it]score1 tensor([[0.4707],
-        [0.5039],
-        [0.4961],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4980, 0.5078, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:11,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 20:42:11,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.63 | bwd_microstep: 4633.39 | bwd_inner_microstep: 4627.92 | bwd_allreduce_microstep: 5.35 | step_microstep: 50.69
-[2025-01-25 20:42:11,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4633.42 | bwd_inner: 4627.92 | bwd_allreduce: 5.42 | step: 50.70
- 74%|███████▍  | 4283/5800 [11:55:41<2:54:22,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.90861439704895, 'learning_rate': 6.756848901080002e-06, 'epoch': 36.92}
- 74%|███████▍  | 4283/5800 [11:55:41<2:54:22,  6.90s/it]score1 tensor([[0.5273],
-        [0.3711],
-        [0.4023],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.3730, 0.4062, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:18,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 20:42:18,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4584.42 | bwd_inner_microstep: 4579.83 | bwd_allreduce_microstep: 4.52 | step_microstep: 42.84
-[2025-01-25 20:42:18,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.72 | bwd: 4584.45 | bwd_inner: 4579.83 | bwd_allreduce: 4.56 | step: 42.85
- 74%|███████▍  | 4284/5800 [11:55:48<2:53:59,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 5.480127811431885, 'learning_rate': 6.7484819706230955e-06, 'epoch': 36.93}
- 74%|███████▍  | 4284/5800 [11:55:48<2:53:59,  6.89s/it]score1 tensor([[0.4316],
-        [0.4238],
-        [0.4922],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4180, 0.5117, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:25,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 20:42:25,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.79 | bwd_microstep: 4589.88 | bwd_inner_microstep: 4585.17 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.34
-[2025-01-25 20:42:25,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.76 | bwd: 4589.90 | bwd_inner: 4585.17 | bwd_allreduce: 4.67 | step: 43.35
- 74%|███████▍  | 4285/5800 [11:55:55<2:53:46,  6.88s/it]                                                        {'loss': 0.0112, 'grad_norm': 2.0217881202697754, 'learning_rate': 6.740119172219444e-06, 'epoch': 36.94}
- 74%|███████▍  | 4285/5800 [11:55:55<2:53:46,  6.88s/it]score1 tensor([[0.3145],
-        [0.5742],
-        [0.5039],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.5703, 0.5039, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:32,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 20:42:32,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4542.31 | bwd_inner_microstep: 4537.48 | bwd_allreduce_microstep: 4.75 | step_microstep: 44.83
-[2025-01-25 20:42:32,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.87 | bwd: 4542.34 | bwd_inner: 4537.48 | bwd_allreduce: 4.79 | step: 44.83
- 74%|███████▍  | 4286/5800 [11:56:02<2:53:11,  6.86s/it]                                                        {'loss': 0.002, 'grad_norm': 3.7560386657714844, 'learning_rate': 6.7317605084767076e-06, 'epoch': 36.95}
- 74%|███████▍  | 4286/5800 [11:56:02<2:53:11,  6.86s/it]score1 tensor([[0.5469],
-        [0.4609],
-        [0.6172],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4512, 0.6211, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:39,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 20:42:39,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.02 | bwd_microstep: 4635.27 | bwd_inner_microstep: 4630.59 | bwd_allreduce_microstep: 4.60 | step_microstep: 43.66
-[2025-01-25 20:42:39,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.99 | bwd: 4635.29 | bwd_inner: 4630.59 | bwd_allreduce: 4.64 | step: 43.67
- 74%|███████▍  | 4287/5800 [11:56:09<2:53:27,  6.88s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.5453500151634216, 'learning_rate': 6.7234059820012745e-06, 'epoch': 36.96}
- 74%|███████▍  | 4287/5800 [11:56:09<2:53:27,  6.88s/it]score1 tensor([[0.5547],
-        [0.3828],
-        [0.4473],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.3789, 0.4492, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:46,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:42:46,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.84 | bwd_microstep: 4642.54 | bwd_inner_microstep: 4637.68 | bwd_allreduce_microstep: 4.75 | step_microstep: 45.04
-[2025-01-25 20:42:46,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.82 | bwd: 4642.56 | bwd_inner: 4637.68 | bwd_allreduce: 4.81 | step: 45.04
- 74%|███████▍  | 4288/5800 [11:56:16<2:53:42,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 4.310001373291016, 'learning_rate': 6.715055595398219e-06, 'epoch': 36.97}
- 74%|███████▍  | 4288/5800 [11:56:16<2:53:42,  6.89s/it]score1 tensor([[0.3945],
-        [0.6680],
-        [0.5352],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4043, 0.6562, 0.5391, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:52,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 20:42:52,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.46 | bwd_microstep: 4630.94 | bwd_inner_microstep: 4625.60 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.47
-[2025-01-25 20:42:52,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.43 | bwd: 4630.97 | bwd_inner: 4625.60 | bwd_allreduce: 5.29 | step: 45.46
- 74%|███████▍  | 4289/5800 [11:56:22<2:53:50,  6.90s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.4093852043151855, 'learning_rate': 6.70670935127135e-06, 'epoch': 36.97}
- 74%|███████▍  | 4289/5800 [11:56:22<2:53:50,  6.90s/it]score1 tensor([[0.4727],
-        [0.5234],
-        [0.4316],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5430, 0.4336, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:42:59,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 20:42:59,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4580.39 | bwd_inner_microstep: 4574.98 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.08
-[2025-01-25 20:42:59,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4580.43 | bwd_inner: 4574.98 | bwd_allreduce: 5.30 | step: 45.09
- 74%|███████▍  | 4290/5800 [11:56:29<2:53:25,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 5.983875751495361, 'learning_rate': 6.698367252223157e-06, 'epoch': 36.98}
- 74%|███████▍  | 4290/5800 [11:56:29<2:53:25,  6.89s/it]score1 tensor([[0.4570],
-        [0.5625],
-        [0.5820],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5742, 0.5781, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:43:06,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 20:43:06,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4590.58 | bwd_inner_microstep: 4586.02 | bwd_allreduce_microstep: 4.41 | step_microstep: 42.47
-[2025-01-25 20:43:06,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.72 | bwd: 4590.61 | bwd_inner: 4586.02 | bwd_allreduce: 4.45 | step: 42.47
- 74%|███████▍  | 4291/5800 [11:56:36<2:53:10,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 1.9009343385696411, 'learning_rate': 6.690029300854863e-06, 'epoch': 36.99}
- 74%|███████▍  | 4291/5800 [11:56:36<2:53:10,  6.89s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:43:11,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 20:43:11,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 575.25 | bwd_microstep: 1224.70 | bwd_inner_microstep: 1219.93 | bwd_allreduce_microstep: 4.66 | step_microstep: 48.25
-[2025-01-25 20:43:11,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 575.20 | bwd: 1224.72 | bwd_inner: 1219.93 | bwd_allreduce: 4.72 | step: 48.26
- 74%|███████▍  | 4292/5800 [11:56:41<2:34:15,  6.14s/it]                                                        {'loss': 0.0039, 'grad_norm': 7.571781158447266, 'learning_rate': 6.681695499766383e-06, 'epoch': 37.0}
- 74%|███████▍  | 4292/5800 [11:56:41<2:34:15,  6.14s/it][2025-01-25 20:43:15,467] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:43:27,137] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:43:37,633] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 20:43:48,148] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4355],
-        [0.5039],
-        [0.6289],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4922, 0.6133, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:05,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:44:05,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2141.91 | bwd_microstep: 4598.25 | bwd_inner_microstep: 4592.57 | bwd_allreduce_microstep: 5.59 | step_microstep: 46.70
-[2025-01-25 20:44:05,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2141.87 | bwd: 4598.27 | bwd_inner: 4592.57 | bwd_allreduce: 5.64 | step: 46.71
- 74%|███████▍  | 4293/5800 [11:57:35<8:35:20, 20.52s/it]                                                        {'loss': 0.0093, 'grad_norm': 8.304221153259277, 'learning_rate': 6.673365851556332e-06, 'epoch': 37.01}
- 74%|███████▍  | 4293/5800 [11:57:35<8:35:20, 20.52s/it]score1 tensor([[0.5781],
-        [0.6172],
-        [0.5469],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.6211, 0.5430, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:12,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 20:44:12,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.08 | bwd_microstep: 4580.49 | bwd_inner_microstep: 4575.69 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.82
-[2025-01-25 20:44:12,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.04 | bwd: 4580.52 | bwd_inner: 4575.69 | bwd_allreduce: 4.76 | step: 42.82
- 74%|███████▍  | 4294/5800 [11:57:42<6:51:59, 16.41s/it]                                                        {'loss': 0.0039, 'grad_norm': 4.259773254394531, 'learning_rate': 6.665040358822053e-06, 'epoch': 37.02}
- 74%|███████▍  | 4294/5800 [11:57:42<6:51:59, 16.41s/it]score1 tensor([[0.6016],
-        [0.4531],
-        [0.4766],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4453, 0.4727, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:18,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 20:44:18,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.33 | bwd_microstep: 4583.41 | bwd_inner_microstep: 4578.66 | bwd_allreduce_microstep: 4.67 | step_microstep: 50.31
-[2025-01-25 20:44:18,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.30 | bwd: 4583.43 | bwd_inner: 4578.66 | bwd_allreduce: 4.70 | step: 50.31
- 74%|███████▍  | 4295/5800 [11:57:48<5:39:42, 13.54s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.6403722763061523, 'learning_rate': 6.656719024159564e-06, 'epoch': 37.03}
- 74%|███████▍  | 4295/5800 [11:57:48<5:39:42, 13.54s/it]score1 tensor([[0.6641],
-        [0.6875],
-        [0.6250],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.7031, 0.6133, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:25,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:44:25,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.42 | bwd_microstep: 4593.82 | bwd_inner_microstep: 4587.29 | bwd_allreduce_microstep: 6.41 | step_microstep: 49.63
-[2025-01-25 20:44:25,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.39 | bwd: 4593.85 | bwd_inner: 4587.29 | bwd_allreduce: 6.47 | step: 49.64
- 74%|███████▍  | 4296/5800 [11:57:55<4:49:14, 11.54s/it]                                                        {'loss': 0.0146, 'grad_norm': 4.323612213134766, 'learning_rate': 6.6484018501636174e-06, 'epoch': 37.03}
- 74%|███████▍  | 4296/5800 [11:57:55<4:49:14, 11.54s/it]score1 tensor([[0.6484],
-        [0.6094],
-        [0.4902],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.6133, 0.4863, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:32,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 20:44:32,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.37 | bwd_microstep: 4548.61 | bwd_inner_microstep: 4543.61 | bwd_allreduce_microstep: 4.92 | step_microstep: 43.11
-[2025-01-25 20:44:32,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.33 | bwd: 4548.63 | bwd_inner: 4543.61 | bwd_allreduce: 4.96 | step: 43.11
- 74%|███████▍  | 4297/5800 [11:58:02<4:13:32, 10.12s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.7221524715423584, 'learning_rate': 6.6400888394276365e-06, 'epoch': 37.04}
- 74%|███████▍  | 4297/5800 [11:58:02<4:13:32, 10.12s/it]score1 tensor([[0.5195],
-        [0.4375],
-        [0.4668],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4180, 0.4609, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:39,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 20:44:39,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.62 | bwd_microstep: 4604.10 | bwd_inner_microstep: 4599.31 | bwd_allreduce_microstep: 4.71 | step_microstep: 41.98
-[2025-01-25 20:44:39,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.59 | bwd: 4604.12 | bwd_inner: 4599.31 | bwd_allreduce: 4.75 | step: 41.98
- 74%|███████▍  | 4298/5800 [11:58:09<3:48:54,  9.14s/it]                                                        {'loss': 0.0088, 'grad_norm': 7.546811580657959, 'learning_rate': 6.631779994543776e-06, 'epoch': 37.05}
- 74%|███████▍  | 4298/5800 [11:58:09<3:48:54,  9.14s/it]score1 tensor([[0.4238],
-        [0.5664],
-        [0.3379],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5625, 0.3418, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:46,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 20:44:46,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.57 | bwd_microstep: 4608.11 | bwd_inner_microstep: 4603.28 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.69
-[2025-01-25 20:44:46,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.53 | bwd: 4608.14 | bwd_inner: 4603.28 | bwd_allreduce: 4.79 | step: 42.70
- 74%|███████▍  | 4299/5800 [11:58:16<3:31:45,  8.46s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.3551695346832275, 'learning_rate': 6.623475318102872e-06, 'epoch': 37.06}
- 74%|███████▍  | 4299/5800 [11:58:16<3:31:45,  8.46s/it]score1 tensor([[0.4844],
-        [0.4336],
-        [0.4746],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4395, 0.4727, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:44:53,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 20:44:53,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.18 | bwd_microstep: 4613.99 | bwd_inner_microstep: 4608.09 | bwd_allreduce_microstep: 5.78 | step_microstep: 48.16
-[2025-01-25 20:44:53,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.15 | bwd: 4614.02 | bwd_inner: 4608.08 | bwd_allreduce: 5.85 | step: 48.17
- 74%|███████▍  | 4300/5800 [11:58:23<3:19:46,  7.99s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.47950825095176697, 'learning_rate': 6.615174812694461e-06, 'epoch': 37.07}
- 74%|███████▍  | 4300/5800 [11:58:23<3:19:46,  7.99s/it]score1 tensor([[0.4219],
-        [0.4805],
-        [0.4102],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4160, 0.4727, 0.4062, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:00,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:45:00,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.19 | bwd_microstep: 4612.58 | bwd_inner_microstep: 4607.58 | bwd_allreduce_microstep: 4.91 | step_microstep: 43.94
-[2025-01-25 20:45:00,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.16 | bwd: 4612.60 | bwd_inner: 4607.58 | bwd_allreduce: 4.96 | step: 43.96
- 74%|███████▍  | 4301/5800 [11:58:30<3:11:21,  7.66s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.595515251159668, 'learning_rate': 6.606878480906798e-06, 'epoch': 37.08}
- 74%|███████▍  | 4301/5800 [11:58:30<3:11:21,  7.66s/it]score1 tensor([[0.5469],
-        [0.5078],
-        [0.4180],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5117, 0.4219, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:06,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.13 | optimizer_step: 4.36
-[2025-01-25 20:45:06,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.51 | bwd_microstep: 4613.47 | bwd_inner_microstep: 4608.75 | bwd_allreduce_microstep: 4.65 | step_microstep: 45.50
-[2025-01-25 20:45:06,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2151.47 | bwd: 4613.49 | bwd_inner: 4608.75 | bwd_allreduce: 4.68 | step: 45.51
- 74%|███████▍  | 4302/5800 [11:58:36<3:05:28,  7.43s/it]                                                        {'loss': 0.0049, 'grad_norm': 7.912044525146484, 'learning_rate': 6.598586325326808e-06, 'epoch': 37.09}
- 74%|███████▍  | 4302/5800 [11:58:36<3:05:28,  7.43s/it]score1 tensor([[0.3809],
-        [0.5742],
-        [0.4668],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3906, 0.5781, 0.4844, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:13,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 20:45:13,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.58 | bwd_microstep: 4618.15 | bwd_inner_microstep: 4613.05 | bwd_allreduce_microstep: 5.01 | step_microstep: 47.11
-[2025-01-25 20:45:13,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.54 | bwd: 4618.18 | bwd_inner: 4613.05 | bwd_allreduce: 5.05 | step: 47.12
- 74%|███████▍  | 4303/5800 [11:58:43<3:01:20,  7.27s/it]                                                        {'loss': 0.0088, 'grad_norm': 8.023941040039062, 'learning_rate': 6.5902983485401425e-06, 'epoch': 37.09}
- 74%|███████▍  | 4303/5800 [11:58:43<3:01:20,  7.27s/it]score1 tensor([[0.4277],
-        [0.3438],
-        [0.4609],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.3340, 0.4688, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:20,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.75 | optimizer_step: 4.37
-[2025-01-25 20:45:20,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.42 | bwd_microstep: 4619.01 | bwd_inner_microstep: 4610.16 | bwd_allreduce_microstep: 8.74 | step_microstep: 49.20
-[2025-01-25 20:45:20,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.38 | bwd: 4619.03 | bwd_inner: 4610.16 | bwd_allreduce: 8.80 | step: 49.21
- 74%|███████▍  | 4304/5800 [11:58:50<2:58:29,  7.16s/it]                                                        {'loss': 0.0127, 'grad_norm': 4.088408946990967, 'learning_rate': 6.5820145531311245e-06, 'epoch': 37.1}
- 74%|███████▍  | 4304/5800 [11:58:50<2:58:29,  7.16s/it]score1 tensor([[0.4941],
-        [0.4043],
-        [0.4727],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4082, 0.4629, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:27,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 20:45:27,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.78 | bwd_microstep: 4562.67 | bwd_inner_microstep: 4557.35 | bwd_allreduce_microstep: 5.21 | step_microstep: 49.64
-[2025-01-25 20:45:27,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.74 | bwd: 4562.69 | bwd_inner: 4557.35 | bwd_allreduce: 5.27 | step: 49.65
- 74%|███████▍  | 4305/5800 [11:58:57<2:56:03,  7.07s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.0735175609588623, 'learning_rate': 6.5737349416827965e-06, 'epoch': 37.11}
- 74%|███████▍  | 4305/5800 [11:58:57<2:56:03,  7.07s/it]score1 tensor([[0.5703],
-        [0.4863],
-        [0.4590],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4922, 0.4648, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:34,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 20:45:34,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.59 | bwd_microstep: 4620.00 | bwd_inner_microstep: 4614.83 | bwd_allreduce_microstep: 5.07 | step_microstep: 46.36
-[2025-01-25 20:45:34,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.56 | bwd: 4620.03 | bwd_inner: 4614.83 | bwd_allreduce: 5.12 | step: 46.37
- 74%|███████▍  | 4306/5800 [11:59:04<2:54:41,  7.02s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.6474385261535645, 'learning_rate': 6.565459516776875e-06, 'epoch': 37.12}
- 74%|███████▍  | 4306/5800 [11:59:04<2:54:41,  7.02s/it]score1 tensor([[0.4082],
-        [0.6406],
-        [0.5977],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.6406, 0.6094, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:41,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 20:45:41,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.46 | bwd_microstep: 4562.33 | bwd_inner_microstep: 4557.44 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.97
-[2025-01-25 20:45:41,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.41 | bwd: 4562.36 | bwd_inner: 4557.44 | bwd_allreduce: 4.85 | step: 43.97
- 74%|███████▍  | 4307/5800 [11:59:11<2:53:16,  6.96s/it]                                                        {'loss': 0.0068, 'grad_norm': 2.3685543537139893, 'learning_rate': 6.557188280993791e-06, 'epoch': 37.13}
- 74%|███████▍  | 4307/5800 [11:59:11<2:53:16,  6.96s/it]score1 tensor([[0.3789],
-        [0.5312],
-        [0.4336],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5312, 0.4492, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:48,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 20:45:48,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.25 | bwd_microstep: 4565.87 | bwd_inner_microstep: 4560.66 | bwd_allreduce_microstep: 5.09 | step_microstep: 44.36
-[2025-01-25 20:45:48,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.21 | bwd: 4565.89 | bwd_inner: 4560.66 | bwd_allreduce: 5.15 | step: 44.37
- 74%|███████▍  | 4308/5800 [11:59:18<2:52:14,  6.93s/it]                                                        {'loss': 0.0098, 'grad_norm': 2.1193346977233887, 'learning_rate': 6.548921236912646e-06, 'epoch': 37.14}
- 74%|███████▍  | 4308/5800 [11:59:18<2:52:14,  6.93s/it]score1 tensor([[0.4883],
-        [0.5703],
-        [0.4160],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5820, 0.4141, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:45:55,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 20:45:55,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.80 | bwd_microstep: 4615.74 | bwd_inner_microstep: 4610.40 | bwd_allreduce_microstep: 5.22 | step_microstep: 44.17
-[2025-01-25 20:45:55,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.77 | bwd: 4615.76 | bwd_inner: 4610.40 | bwd_allreduce: 5.29 | step: 44.18
- 74%|███████▍  | 4309/5800 [11:59:25<2:51:56,  6.92s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.7187594175338745, 'learning_rate': 6.5406583871112585e-06, 'epoch': 37.15}
- 74%|███████▍  | 4309/5800 [11:59:25<2:51:56,  6.92s/it]score1 tensor([[0.5391],
-        [0.3770],
-        [0.5352],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.3730, 0.5273, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:01,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 20:46:01,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.68 | bwd_microstep: 4613.82 | bwd_inner_microstep: 4609.04 | bwd_allreduce_microstep: 4.69 | step_microstep: 47.54
-[2025-01-25 20:46:01,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.63 | bwd: 4613.84 | bwd_inner: 4609.04 | bwd_allreduce: 4.73 | step: 47.54
- 74%|███████▍  | 4310/5800 [11:59:31<2:51:38,  6.91s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.63999605178833, 'learning_rate': 6.532399734166132e-06, 'epoch': 37.16}
- 74%|███████▍  | 4310/5800 [11:59:31<2:51:38,  6.91s/it]score1 tensor([[0.5234],
-        [0.4492],
-        [0.4609],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4453, 0.4648, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:08,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 20:46:08,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.97 | bwd_microstep: 4620.86 | bwd_inner_microstep: 4615.80 | bwd_allreduce_microstep: 4.97 | step_microstep: 45.90
-[2025-01-25 20:46:08,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.93 | bwd: 4620.89 | bwd_inner: 4615.80 | bwd_allreduce: 5.02 | step: 45.90
- 74%|███████▍  | 4311/5800 [11:59:38<2:51:33,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.4199604988098145, 'learning_rate': 6.5241452806524515e-06, 'epoch': 37.16}
- 74%|███████▍  | 4311/5800 [11:59:38<2:51:33,  6.91s/it]score1 tensor([[0.5469],
-        [0.4395],
-        [0.5586],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4434, 0.5586, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:15,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 20:46:15,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.67 | bwd_microstep: 4574.90 | bwd_inner_microstep: 4570.01 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.81
-[2025-01-25 20:46:15,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.62 | bwd: 4574.92 | bwd_inner: 4570.01 | bwd_allreduce: 4.85 | step: 44.82
- 74%|███████▍  | 4312/5800 [11:59:45<2:51:01,  6.90s/it]                                                        {'loss': 0.0034, 'grad_norm': 1.9943455457687378, 'learning_rate': 6.515895029144092e-06, 'epoch': 37.17}
- 74%|███████▍  | 4312/5800 [11:59:45<2:51:01,  6.90s/it]score1 tensor([[0.4355],
-        [0.4219],
-        [0.5977],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.4180, 0.5898, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:22,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 20:46:22,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.86 | bwd_microstep: 4565.66 | bwd_inner_microstep: 4560.48 | bwd_allreduce_microstep: 5.09 | step_microstep: 46.84
-[2025-01-25 20:46:22,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.83 | bwd: 4565.68 | bwd_inner: 4560.48 | bwd_allreduce: 5.13 | step: 46.85
- 74%|███████▍  | 4313/5800 [11:59:52<2:50:33,  6.88s/it]                                                        {'loss': 0.0039, 'grad_norm': 6.103335380554199, 'learning_rate': 6.507648982213639e-06, 'epoch': 37.18}
- 74%|███████▍  | 4313/5800 [11:59:52<2:50:33,  6.88s/it]score1 tensor([[0.4082],
-        [0.5195],
-        [0.5859],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.5039, 0.5664, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:29,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 20:46:29,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.90 | bwd_microstep: 4597.75 | bwd_inner_microstep: 4592.69 | bwd_allreduce_microstep: 4.95 | step_microstep: 44.56
-[2025-01-25 20:46:29,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.85 | bwd: 4597.78 | bwd_inner: 4592.69 | bwd_allreduce: 5.00 | step: 44.56
- 74%|███████▍  | 4314/5800 [11:59:59<2:50:25,  6.88s/it]                                                        {'loss': 0.0107, 'grad_norm': 6.080193042755127, 'learning_rate': 6.499407142432339e-06, 'epoch': 37.19}
- 74%|███████▍  | 4314/5800 [11:59:59<2:50:25,  6.88s/it]score1 tensor([[0.5117],
-        [0.6172],
-        [0.5078],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.6172, 0.5156, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:36,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 20:46:36,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.12 | bwd_microstep: 4569.42 | bwd_inner_microstep: 4564.33 | bwd_allreduce_microstep: 4.99 | step_microstep: 43.64
-[2025-01-25 20:46:36,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.08 | bwd: 4569.45 | bwd_inner: 4564.33 | bwd_allreduce: 5.04 | step: 43.64
- 74%|███████▍  | 4315/5800 [12:00:06<2:50:07,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 5.7447614669799805, 'learning_rate': 6.491169512370154e-06, 'epoch': 37.2}
- 74%|███████▍  | 4315/5800 [12:00:06<2:50:07,  6.87s/it]score1 tensor([[0.4551],
-        [0.5312],
-        [0.3867],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.5195, 0.3691, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:43,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 20:46:43,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4614.13 | bwd_inner_microstep: 4609.40 | bwd_allreduce_microstep: 4.65 | step_microstep: 49.06
-[2025-01-25 20:46:43,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4614.15 | bwd_inner: 4609.40 | bwd_allreduce: 4.68 | step: 49.07
- 74%|███████▍  | 4316/5800 [12:00:13<2:50:15,  6.88s/it]                                                        {'loss': 0.0142, 'grad_norm': 0.6376991868019104, 'learning_rate': 6.482936094595713e-06, 'epoch': 37.21}
- 74%|███████▍  | 4316/5800 [12:00:13<2:50:15,  6.88s/it]score1 tensor([[0.4863],
-        [0.4355],
-        [0.4355],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4316, 0.4258, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:50,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.97 | optimizer_step: 4.37
-[2025-01-25 20:46:50,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.64 | bwd_microstep: 4619.10 | bwd_inner_microstep: 4614.19 | bwd_allreduce_microstep: 4.84 | step_microstep: 46.50
-[2025-01-25 20:46:50,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4619.12 | bwd_inner: 4614.19 | bwd_allreduce: 4.88 | step: 46.50
- 74%|███████▍  | 4317/5800 [12:00:20<2:50:17,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 7.810204029083252, 'learning_rate': 6.474706891676334e-06, 'epoch': 37.22}
- 74%|███████▍  | 4317/5800 [12:00:20<2:50:17,  6.89s/it]score1 tensor([[0.5586],
-        [0.5156],
-        [0.5508],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.5156, 0.5508, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:46:56,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 20:46:56,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.53 | bwd_microstep: 4500.55 | bwd_inner_microstep: 4495.23 | bwd_allreduce_microstep: 5.24 | step_microstep: 43.98
-[2025-01-25 20:46:56,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.46 | bwd: 4500.58 | bwd_inner: 4495.23 | bwd_allreduce: 5.28 | step: 43.98
- 74%|███████▍  | 4318/5800 [12:00:26<2:49:22,  6.86s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.0424773693084717, 'learning_rate': 6.466481906178037e-06, 'epoch': 37.22}
- 74%|███████▍  | 4318/5800 [12:00:26<2:49:22,  6.86s/it]score1 tensor([[0.4668],
-        [0.3906],
-        [0.6211],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.3730, 0.6172, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:03,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 20:47:03,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.28 | bwd_microstep: 4621.70 | bwd_inner_microstep: 4616.37 | bwd_allreduce_microstep: 5.21 | step_microstep: 43.70
-[2025-01-25 20:47:03,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.25 | bwd: 4621.72 | bwd_inner: 4616.37 | bwd_allreduce: 5.27 | step: 43.71
- 74%|███████▍  | 4319/5800 [12:00:33<2:49:37,  6.87s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.9959909915924072, 'learning_rate': 6.458261140665501e-06, 'epoch': 37.23}
- 74%|███████▍  | 4319/5800 [12:00:33<2:49:37,  6.87s/it]score1 tensor([[0.5117],
-        [0.5352],
-        [0.4844],
-        [0.3887]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5391, 0.4844, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:10,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 20:47:10,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.65 | bwd_microstep: 4598.21 | bwd_inner_microstep: 4593.21 | bwd_allreduce_microstep: 4.91 | step_microstep: 48.93
-[2025-01-25 20:47:10,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.61 | bwd: 4598.23 | bwd_inner: 4593.21 | bwd_allreduce: 4.95 | step: 48.94
- 74%|███████▍  | 4320/5800 [12:00:40<2:49:34,  6.87s/it]                                                        {'loss': 0.0039, 'grad_norm': 1.7275711297988892, 'learning_rate': 6.450044597702118e-06, 'epoch': 37.24}
- 74%|███████▍  | 4320/5800 [12:00:40<2:49:34,  6.87s/it]score1 tensor([[0.3477],
-        [0.5938],
-        [0.4512],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3555, 0.5781, 0.4570, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:17,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 20:47:17,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.01 | bwd_microstep: 4576.73 | bwd_inner_microstep: 4571.87 | bwd_allreduce_microstep: 4.77 | step_microstep: 43.77
-[2025-01-25 20:47:17,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.97 | bwd: 4576.75 | bwd_inner: 4571.87 | bwd_allreduce: 4.81 | step: 43.79
- 74%|███████▍  | 4321/5800 [12:00:47<2:49:21,  6.87s/it]                                                        {'loss': 0.0073, 'grad_norm': 1.4376922845840454, 'learning_rate': 6.4418322798499355e-06, 'epoch': 37.25}
- 74%|███████▍  | 4321/5800 [12:00:47<2:49:21,  6.87s/it]score1 tensor([[0.3711],
-        [0.5547],
-        [0.4746],
-        [0.3984]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5547, 0.4785, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:24,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 20:47:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.80 | bwd_microstep: 4579.27 | bwd_inner_microstep: 4574.08 | bwd_allreduce_microstep: 5.10 | step_microstep: 46.63
-[2025-01-25 20:47:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.76 | bwd: 4579.30 | bwd_inner: 4574.08 | bwd_allreduce: 5.14 | step: 46.64
- 75%|███████▍  | 4322/5800 [12:00:54<2:49:12,  6.87s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.9205156564712524, 'learning_rate': 6.433624189669709e-06, 'epoch': 37.26}
- 75%|███████▍  | 4322/5800 [12:00:54<2:49:12,  6.87s/it]score1 tensor([[0.5352],
-        [0.4531],
-        [0.3730],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4629, 0.3262, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0190, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:31,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 20:47:31,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.25 | bwd_microstep: 4614.30 | bwd_inner_microstep: 4609.43 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.40
-[2025-01-25 20:47:31,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.22 | bwd: 4614.32 | bwd_inner: 4609.43 | bwd_allreduce: 4.81 | step: 42.41
- 75%|███████▍  | 4323/5800 [12:01:01<2:49:15,  6.88s/it]                                                        {'loss': 0.019, 'grad_norm': 3.9484963417053223, 'learning_rate': 6.425420329720857e-06, 'epoch': 37.27}
- 75%|███████▍  | 4323/5800 [12:01:01<2:49:15,  6.88s/it]score1 tensor([[0.5391],
-        [0.4512],
-        [0.6406],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4648, 0.6641, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:38,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 20:47:38,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.80 | bwd_microstep: 4618.30 | bwd_inner_microstep: 4613.21 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.97
-[2025-01-25 20:47:38,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.77 | bwd: 4618.32 | bwd_inner: 4613.21 | bwd_allreduce: 5.05 | step: 43.98
- 75%|███████▍  | 4324/5800 [12:01:08<2:49:23,  6.89s/it]                                                        {'loss': 0.0127, 'grad_norm': 8.298087120056152, 'learning_rate': 6.41722070256148e-06, 'epoch': 37.28}
- 75%|███████▍  | 4324/5800 [12:01:08<2:49:23,  6.89s/it]score1 tensor([[0.4551],
-        [0.4297],
-        [0.5312],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4238, 0.5391, 0.3711], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:45,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 20:47:45,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.58 | bwd_microstep: 4612.24 | bwd_inner_microstep: 4607.13 | bwd_allreduce_microstep: 5.01 | step_microstep: 44.75
-[2025-01-25 20:47:45,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.52 | bwd: 4612.27 | bwd_inner: 4607.13 | bwd_allreduce: 5.07 | step: 44.76
- 75%|███████▍  | 4325/5800 [12:01:15<2:49:21,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.659400224685669, 'learning_rate': 6.409025310748376e-06, 'epoch': 37.28}
- 75%|███████▍  | 4325/5800 [12:01:15<2:49:21,  6.89s/it]score1 tensor([[0.5586],
-        [0.4629],
-        [0.3105],
-        [0.6328]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4688, 0.3086, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:52,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 20:47:52,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.29 | bwd_microstep: 4575.46 | bwd_inner_microstep: 4570.50 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.75
-[2025-01-25 20:47:52,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.25 | bwd: 4575.48 | bwd_inner: 4570.50 | bwd_allreduce: 4.92 | step: 43.75
- 75%|███████▍  | 4326/5800 [12:01:21<2:49:02,  6.88s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.8149617910385132, 'learning_rate': 6.400834156836997e-06, 'epoch': 37.29}
- 75%|███████▍  | 4326/5800 [12:01:21<2:49:02,  6.88s/it]score1 tensor([[0.4570],
-        [0.3965],
-        [0.6055],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590, 0.4043, 0.5938, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:47:58,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 20:47:58,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.43 | bwd_microstep: 4624.98 | bwd_inner_microstep: 4619.62 | bwd_allreduce_microstep: 5.25 | step_microstep: 47.05
-[2025-01-25 20:47:58,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.39 | bwd: 4625.01 | bwd_inner: 4619.62 | bwd_allreduce: 5.31 | step: 47.06
- 75%|███████▍  | 4327/5800 [12:01:28<2:49:08,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.3390514850616455, 'learning_rate': 6.3926472433815e-06, 'epoch': 37.3}
- 75%|███████▍  | 4327/5800 [12:01:28<2:49:08,  6.89s/it]score1 tensor([[0.4043],
-        [0.5430],
-        [0.4648],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.5469, 0.4629, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:05,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 20:48:05,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.76 | bwd_microstep: 4621.59 | bwd_inner_microstep: 4616.05 | bwd_allreduce_microstep: 5.44 | step_microstep: 45.23
-[2025-01-25 20:48:05,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.73 | bwd: 4621.61 | bwd_inner: 4616.05 | bwd_allreduce: 5.49 | step: 45.23
- 75%|███████▍  | 4328/5800 [12:01:35<2:49:08,  6.89s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.5340800285339355, 'learning_rate': 6.384464572934694e-06, 'epoch': 37.31}
- 75%|███████▍  | 4328/5800 [12:01:35<2:49:08,  6.89s/it]score1 tensor([[0.6133],
-        [0.5820],
-        [0.5664],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5898, 0.5664, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:12,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 20:48:12,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.20 | bwd_microstep: 4576.59 | bwd_inner_microstep: 4571.26 | bwd_allreduce_microstep: 5.22 | step_microstep: 47.05
-[2025-01-25 20:48:12,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4576.61 | bwd_inner: 4571.26 | bwd_allreduce: 5.28 | step: 47.06
- 75%|███████▍  | 4329/5800 [12:01:42<2:48:51,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 6.384038925170898, 'learning_rate': 6.376286148048092e-06, 'epoch': 37.32}
- 75%|███████▍  | 4329/5800 [12:01:42<2:48:51,  6.89s/it]score1 tensor([[0.4980],
-        [0.4980],
-        [0.6523],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4980, 0.6523, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:19,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 20:48:19,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.34 | bwd_microstep: 4533.11 | bwd_inner_microstep: 4528.51 | bwd_allreduce_microstep: 4.51 | step_microstep: 42.19
-[2025-01-25 20:48:19,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.29 | bwd: 4533.13 | bwd_inner: 4528.51 | bwd_allreduce: 4.55 | step: 42.20
- 75%|█��█████▍  | 4330/5800 [12:01:49<2:48:08,  6.86s/it]                                                        {'loss': 0.0034, 'grad_norm': 0.31980976462364197, 'learning_rate': 6.36811197127186e-06, 'epoch': 37.33}
- 75%|███████▍  | 4330/5800 [12:01:49<2:48:08,  6.86s/it]score1 tensor([[0.5469],
-        [0.5703],
-        [0.4023],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5625, 0.3887, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:26,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 20:48:26,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.73 | bwd_microstep: 4572.44 | bwd_inner_microstep: 4566.52 | bwd_allreduce_microstep: 5.81 | step_microstep: 46.74
-[2025-01-25 20:48:26,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.69 | bwd: 4572.47 | bwd_inner: 4566.52 | bwd_allreduce: 5.87 | step: 46.74
- 75%|███████▍  | 4331/5800 [12:01:56<2:47:59,  6.86s/it]                                                        {'loss': 0.0063, 'grad_norm': 6.170427322387695, 'learning_rate': 6.359942045154843e-06, 'epoch': 37.34}
- 75%|███████▍  | 4331/5800 [12:01:56<2:47:59,  6.86s/it]score1 tensor([[0.5586],
-        [0.6250],
-        [0.5273],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6172, 0.5156, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:33,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 20:48:33,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.94 | bwd_microstep: 4614.37 | bwd_inner_microstep: 4609.48 | bwd_allreduce_microstep: 4.81 | step_microstep: 43.60
-[2025-01-25 20:48:33,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.90 | bwd: 4614.39 | bwd_inner: 4609.48 | bwd_allreduce: 4.85 | step: 43.61
- 75%|███████▍  | 4332/5800 [12:02:03<2:48:08,  6.87s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.240263938903809, 'learning_rate': 6.351776372244578e-06, 'epoch': 37.34}
- 75%|███████▍  | 4332/5800 [12:02:03<2:48:08,  6.87s/it]score1 tensor([[0.5273],
-        [0.6484],
-        [0.4375],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6484, 0.4121, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:40,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 20:48:40,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.40 | bwd_microstep: 4567.06 | bwd_inner_microstep: 4562.51 | bwd_allreduce_microstep: 4.48 | step_microstep: 47.36
-[2025-01-25 20:48:40,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.35 | bwd: 4567.08 | bwd_inner: 4562.50 | bwd_allreduce: 4.52 | step: 47.38
- 75%|███████▍  | 4333/5800 [12:02:10<2:47:53,  6.87s/it]                                                        {'loss': 0.0083, 'grad_norm': 1.8120274543762207, 'learning_rate': 6.343614955087252e-06, 'epoch': 37.35}
- 75%|███████▍  | 4333/5800 [12:02:10<2:47:53,  6.87s/it]score1 tensor([[0.4414],
-        [0.4668],
-        [0.5273],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4707, 0.5352, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:47,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.94 | optimizer_step: 4.36
-[2025-01-25 20:48:47,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.20 | bwd_microstep: 4615.41 | bwd_inner_microstep: 4610.26 | bwd_allreduce_microstep: 5.05 | step_microstep: 49.50
-[2025-01-25 20:48:47,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.15 | bwd: 4615.43 | bwd_inner: 4610.26 | bwd_allreduce: 5.10 | step: 49.51
- 75%|███████▍  | 4334/5800 [12:02:16<2:48:01,  6.88s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.38783764839172363, 'learning_rate': 6.335457796227749e-06, 'epoch': 37.36}
- 75%|███████▍  | 4334/5800 [12:02:16<2:48:01,  6.88s/it]score1 tensor([[0.5156],
-        [0.4336],
-        [0.4082],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4336, 0.4160, 0.4395], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:48:53,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 20:48:53,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.50 | bwd_microstep: 4566.44 | bwd_inner_microstep: 4561.23 | bwd_allreduce_microstep: 5.09 | step_microstep: 50.30
-[2025-01-25 20:48:53,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.45 | bwd: 4566.46 | bwd_inner: 4561.23 | bwd_allreduce: 5.14 | step: 50.31
- 75%|███████▍  | 4335/5800 [12:02:23<2:47:45,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 5.691107749938965, 'learning_rate': 6.327304898209603e-06, 'epoch': 37.37}
- 75%|███████▍  | 4335/5800 [12:02:23<2:47:45,  6.87s/it]score1 tensor([[0.5273],
-        [0.5586],
-        [0.4863],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5547, 0.5039, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:00,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 20:49:00,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4622.63 | bwd_inner_microstep: 4617.67 | bwd_allreduce_microstep: 4.87 | step_microstep: 49.32
-[2025-01-25 20:49:00,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4622.66 | bwd_inner: 4617.67 | bwd_allreduce: 4.92 | step: 49.34
- 75%|███████▍  | 4336/5800 [12:02:30<2:47:55,  6.88s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.044354438781738, 'learning_rate': 6.319156263575026e-06, 'epoch': 37.38}
- 75%|███████▍  | 4336/5800 [12:02:30<2:47:55,  6.88s/it]score1 tensor([[0.6328],
-        [0.5703],
-        [0.6758],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5664, 0.6719, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:07,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 20:49:07,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.63 | bwd_microstep: 4639.44 | bwd_inner_microstep: 4633.65 | bwd_allreduce_microstep: 5.68 | step_microstep: 45.08
-[2025-01-25 20:49:07,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4639.47 | bwd_inner: 4633.65 | bwd_allreduce: 5.74 | step: 45.04
- 75%|███████▍  | 4337/5800 [12:02:37<2:48:04,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 8.777215957641602, 'learning_rate': 6.311011894864918e-06, 'epoch': 37.39}
- 75%|███████▍  | 4337/5800 [12:02:37<2:48:04,  6.89s/it]score1 tensor([[0.5430],
-        [0.6836],
-        [0.4688],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6875, 0.4668, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:14,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 20:49:14,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.03 | bwd_microstep: 4582.68 | bwd_inner_microstep: 4577.49 | bwd_allreduce_microstep: 5.08 | step_microstep: 45.69
-[2025-01-25 20:49:14,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.99 | bwd: 4582.70 | bwd_inner: 4577.49 | bwd_allreduce: 5.14 | step: 45.69
- 75%|███████▍  | 4338/5800 [12:02:44<2:47:47,  6.89s/it]                                                        {'loss': 0.002, 'grad_norm': 2.1503236293792725, 'learning_rate': 6.302871794618817e-06, 'epoch': 37.4}
- 75%|███████▍  | 4338/5800 [12:02:44<2:47:47,  6.89s/it]score1 tensor([[0.4570],
-        [0.5039],
-        [0.5664],
-        [0.1465]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4941, 0.5391, 0.1787], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0178, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:21,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 20:49:21,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.62 | bwd_microstep: 4637.59 | bwd_inner_microstep: 4632.18 | bwd_allreduce_microstep: 5.29 | step_microstep: 46.62
-[2025-01-25 20:49:21,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.59 | bwd: 4637.62 | bwd_inner: 4632.18 | bwd_allreduce: 5.35 | step: 46.63
- 75%|███████▍  | 4339/5800 [12:02:51<2:47:56,  6.90s/it]                                                        {'loss': 0.0178, 'grad_norm': 5.258248805999756, 'learning_rate': 6.294735965374963e-06, 'epoch': 37.41}
- 75%|███████▍  | 4339/5800 [12:02:51<2:47:56,  6.90s/it]score1 tensor([[0.6602],
-        [0.5508],
-        [0.4043],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.5625, 0.4043, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:28,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 20:49:28,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.49 | bwd_microstep: 4585.86 | bwd_inner_microstep: 4580.49 | bwd_allreduce_microstep: 5.29 | step_microstep: 47.84
-[2025-01-25 20:49:28,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.44 | bwd: 4585.88 | bwd_inner: 4580.49 | bwd_allreduce: 5.33 | step: 47.84
- 75%|███████▍  | 4340/5800 [12:02:58<2:47:40,  6.89s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.887022852897644, 'learning_rate': 6.286604409670236e-06, 'epoch': 37.41}
- 75%|███████▍  | 4340/5800 [12:02:58<2:47:40,  6.89s/it]score1 tensor([[0.4590],
-        [0.4941],
-        [0.6094],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.4883, 0.6094, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:35,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 20:49:35,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.10 | bwd_microstep: 4593.74 | bwd_inner_microstep: 4588.46 | bwd_allreduce_microstep: 5.16 | step_microstep: 44.74
-[2025-01-25 20:49:35,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.06 | bwd: 4593.76 | bwd_inner: 4588.46 | bwd_allreduce: 5.22 | step: 44.74
- 75%|███████▍  | 4341/5800 [12:03:05<2:47:27,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 1.8027228116989136, 'learning_rate': 6.2784771300402035e-06, 'epoch': 37.42}
- 75%|███████▍  | 4341/5800 [12:03:05<2:47:27,  6.89s/it]score1 tensor([[0.5664],
-        [0.6250],
-        [0.5508],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.6211, 0.5352, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:42,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 20:49:42,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.43 | bwd_microstep: 4633.40 | bwd_inner_microstep: 4628.15 | bwd_allreduce_microstep: 5.14 | step_microstep: 45.60
-[2025-01-25 20:49:42,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.38 | bwd: 4633.43 | bwd_inner: 4628.15 | bwd_allreduce: 5.20 | step: 45.60
- 75%|███████▍  | 4342/5800 [12:03:12<2:47:35,  6.90s/it]                                                        {'loss': 0.0083, 'grad_norm': 8.60790729522705, 'learning_rate': 6.2703541290190964e-06, 'epoch': 37.43}
- 75%|███████▍  | 4342/5800 [12:03:12<2:47:35,  6.90s/it]score1 tensor([[0.4004],
-        [0.5547],
-        [0.6094],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5547, 0.6094, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:49,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 20:49:49,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.16 | bwd_microstep: 4545.69 | bwd_inner_microstep: 4540.80 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.01
-[2025-01-25 20:49:49,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.12 | bwd: 4545.72 | bwd_inner: 4540.80 | bwd_allreduce: 4.85 | step: 46.02
- 75%|███████▍  | 4343/5800 [12:03:18<2:46:59,  6.88s/it]                                                        {'loss': 0.0024, 'grad_norm': 3.779538154602051, 'learning_rate': 6.262235409139794e-06, 'epoch': 37.44}
- 75%|███████▍  | 4343/5800 [12:03:18<2:46:59,  6.88s/it]score1 tensor([[0.6406],
-        [0.4961],
-        [0.7031],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4980, 0.6875, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:49:55,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 20:49:55,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.40 | bwd_microstep: 4638.95 | bwd_inner_microstep: 4633.69 | bwd_allreduce_microstep: 5.16 | step_microstep: 44.62
-[2025-01-25 20:49:55,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.35 | bwd: 4638.97 | bwd_inner: 4633.69 | bwd_allreduce: 5.22 | step: 44.63
- 75%|███████▍  | 4344/5800 [12:03:25<2:47:13,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.227662563323975, 'learning_rate': 6.25412097293387e-06, 'epoch': 37.45}
- 75%|███████▍  | 4344/5800 [12:03:25<2:47:13,  6.89s/it]score1 tensor([[0.5312],
-        [0.4805],
-        [0.3281],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4805, 0.3223, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:02,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 20:50:02,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.86 | bwd_microstep: 4545.09 | bwd_inner_microstep: 4540.00 | bwd_allreduce_microstep: 5.01 | step_microstep: 47.48
-[2025-01-25 20:50:02,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.81 | bwd: 4545.12 | bwd_inner: 4540.00 | bwd_allreduce: 5.05 | step: 47.50
- 75%|███████▍  | 4345/5800 [12:03:32<2:46:42,  6.87s/it]                                                        {'loss': 0.0029, 'grad_norm': 0.46830087900161743, 'learning_rate': 6.246010822931532e-06, 'epoch': 37.46}
- 75%|███████▍  | 4345/5800 [12:03:32<2:46:42,  6.87s/it]score1 tensor([[0.4824],
-        [0.4746],
-        [0.4395],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4844, 0.4512, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:09,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 20:50:09,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.83 | bwd_microstep: 4638.65 | bwd_inner_microstep: 4633.41 | bwd_allreduce_microstep: 5.13 | step_microstep: 45.50
-[2025-01-25 20:50:09,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.80 | bwd: 4638.68 | bwd_inner: 4633.41 | bwd_allreduce: 5.19 | step: 45.51
- 75%|███████▍  | 4346/5800 [12:03:39<2:46:58,  6.89s/it]                                                        {'loss': 0.0142, 'grad_norm': 3.820844888687134, 'learning_rate': 6.237904961661678e-06, 'epoch': 37.47}
- 75%|███████▍  | 4346/5800 [12:03:39<2:46:58,  6.89s/it]score1 tensor([[0.6211],
-        [0.6406],
-        [0.4453],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6406, 0.4648, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:16,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.37
-[2025-01-25 20:50:16,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.61 | bwd_microstep: 4590.51 | bwd_inner_microstep: 4583.72 | bwd_allreduce_microstep: 6.67 | step_microstep: 53.42
-[2025-01-25 20:50:16,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.57 | bwd: 4590.54 | bwd_inner: 4583.72 | bwd_allreduce: 6.74 | step: 53.43
- 75%|███████▍  | 4347/5800 [12:03:46<2:46:51,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.8504838943481445, 'learning_rate': 6.22980339165185e-06, 'epoch': 37.47}
- 75%|███████▍  | 4347/5800 [12:03:46<2:46:51,  6.89s/it]score1 tensor([[0.5234],
-        [0.4883],
-        [0.4414],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4941, 0.4434, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:23,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 20:50:23,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.18 | bwd_microstep: 4635.07 | bwd_inner_microstep: 4630.01 | bwd_allreduce_microstep: 4.98 | step_microstep: 46.15
-[2025-01-25 20:50:23,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.15 | bwd: 4635.09 | bwd_inner: 4630.01 | bwd_allreduce: 5.02 | step: 46.16
- 75%|███████▍  | 4348/5800 [12:03:53<2:46:58,  6.90s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.4497283399105072, 'learning_rate': 6.221706115428252e-06, 'epoch': 37.48}
- 75%|███████▍  | 4348/5800 [12:03:53<2:46:58,  6.90s/it]score1 tensor([[0.4727],
-        [0.4512],
-        [0.4062],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4492, 0.4004, 0.4355], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:30,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 20:50:30,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4634.58 | bwd_inner_microstep: 4629.56 | bwd_allreduce_microstep: 4.93 | step_microstep: 46.48
-[2025-01-25 20:50:30,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4634.60 | bwd_inner: 4629.56 | bwd_allreduce: 4.98 | step: 46.48
- 75%|███████▍  | 4349/5800 [12:04:00<2:47:02,  6.91s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.804840326309204, 'learning_rate': 6.213613135515766e-06, 'epoch': 37.49}
- 75%|███████▍  | 4349/5800 [12:04:00<2:47:02,  6.91s/it]score1 tensor([[0.5273],
-        [0.5352],
-        [0.4043],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5195, 0.4121, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:37,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 20:50:37,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.76 | bwd_microstep: 4635.23 | bwd_inner_microstep: 4630.10 | bwd_allreduce_microstep: 5.02 | step_microstep: 45.81
-[2025-01-25 20:50:37,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.72 | bwd: 4635.25 | bwd_inner: 4630.10 | bwd_allreduce: 5.07 | step: 45.82
- 75%|███████▌  | 4350/5800 [12:04:07<2:47:02,  6.91s/it]                                                        {'loss': 0.0098, 'grad_norm': 4.621777534484863, 'learning_rate': 6.2055244544379145e-06, 'epoch': 37.5}
- 75%|███████▌  | 4350/5800 [12:04:07<2:47:02,  6.91s/it]score1 tensor([[0.5391],
-        [0.4922],
-        [0.5312],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4863, 0.5391, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:44,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 20:50:44,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.91 | bwd_microstep: 4636.82 | bwd_inner_microstep: 4631.58 | bwd_allreduce_microstep: 5.17 | step_microstep: 43.74
-[2025-01-25 20:50:44,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.87 | bwd: 4636.85 | bwd_inner: 4631.58 | bwd_allreduce: 5.20 | step: 43.75
- 75%|███████▌  | 4351/5800 [12:04:14<2:47:00,  6.92s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.9733572006225586, 'learning_rate': 6.197440074716899e-06, 'epoch': 37.51}
- 75%|███████▌  | 4351/5800 [12:04:14<2:47:00,  6.92s/it]score1 tensor([[0.4629],
-        [0.5586],
-        [0.5547],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5508, 0.5469, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:51,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 20:50:51,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.33 | bwd_microstep: 4640.68 | bwd_inner_microstep: 4635.29 | bwd_allreduce_microstep: 5.30 | step_microstep: 45.81
-[2025-01-25 20:50:51,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.30 | bwd: 4640.70 | bwd_inner: 4635.29 | bwd_allreduce: 5.34 | step: 45.81
- 75%|███████▌  | 4352/5800 [12:04:21<2:46:59,  6.92s/it]                                                        {'loss': 0.0068, 'grad_norm': 4.529484748840332, 'learning_rate': 6.1893599988735564e-06, 'epoch': 37.52}
- 75%|███████▌  | 4352/5800 [12:04:21<2:46:59,  6.92s/it]score1 tensor([[0.7070],
-        [0.4453],
-        [0.3984],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.7070, 0.4473, 0.4043, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:50:58,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 20:50:58,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.09 | bwd_microstep: 4580.44 | bwd_inner_microstep: 4574.89 | bwd_allreduce_microstep: 5.38 | step_microstep: 46.51
-[2025-01-25 20:50:58,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.05 | bwd: 4580.47 | bwd_inner: 4574.88 | bwd_allreduce: 5.44 | step: 46.52
- 75%|███████▌  | 4353/5800 [12:04:28<2:46:30,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.6658046245574951, 'learning_rate': 6.181284229427409e-06, 'epoch': 37.53}
- 75%|███████▌  | 4353/5800 [12:04:28<2:46:30,  6.90s/it]score1 tensor([[0.6367],
-        [0.5508],
-        [0.4434],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.5664, 0.4473, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:04,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 20:51:04,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.27 | bwd_microstep: 4590.29 | bwd_inner_microstep: 4584.84 | bwd_allreduce_microstep: 5.34 | step_microstep: 49.18
-[2025-01-25 20:51:04,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.23 | bwd: 4590.32 | bwd_inner: 4584.84 | bwd_allreduce: 5.39 | step: 49.18
- 75%|███████▌  | 4354/5800 [12:04:34<2:46:15,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.0051136016845703, 'learning_rate': 6.173212768896615e-06, 'epoch': 37.53}
- 75%|███████▌  | 4354/5800 [12:04:34<2:46:15,  6.90s/it]score1 tensor([[0.5742],
-        [0.3711],
-        [0.6836],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.3613, 0.6836, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:11,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 20:51:11,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.10 | bwd_microstep: 4586.10 | bwd_inner_microstep: 4577.07 | bwd_allreduce_microstep: 8.90 | step_microstep: 48.41
-[2025-01-25 20:51:11,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.06 | bwd: 4586.13 | bwd_inner: 4577.07 | bwd_allreduce: 8.98 | step: 48.42
- 75%|███████▌  | 4355/5800 [12:04:41<2:45:59,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.479073405265808, 'learning_rate': 6.165145619797987e-06, 'epoch': 37.54}
- 75%|███████▌  | 4355/5800 [12:04:41<2:45:59,  6.89s/it]score1 tensor([[0.4648],
-        [0.4238],
-        [0.5312],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4180, 0.5352, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:18,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.36
-[2025-01-25 20:51:18,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.03 | bwd_microstep: 4634.39 | bwd_inner_microstep: 4629.01 | bwd_allreduce_microstep: 5.24 | step_microstep: 49.22
-[2025-01-25 20:51:18,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.00 | bwd: 4634.41 | bwd_inner: 4629.01 | bwd_allreduce: 5.31 | step: 49.22
- 75%|███████▌  | 4356/5800 [12:04:48<2:46:05,  6.90s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.553500175476074, 'learning_rate': 6.15708278464702e-06, 'epoch': 37.55}
- 75%|███████▌  | 4356/5800 [12:04:48<2:46:05,  6.90s/it]score1 tensor([[0.4531],
-        [0.5625],
-        [0.6328],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5742, 0.6250, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:25,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 20:51:25,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4640.72 | bwd_inner_microstep: 4635.10 | bwd_allreduce_microstep: 5.52 | step_microstep: 45.80
-[2025-01-25 20:51:25,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.59 | bwd: 4640.74 | bwd_inner: 4635.10 | bwd_allreduce: 5.57 | step: 45.81
- 75%|███████▌  | 4357/5800 [12:04:55<2:46:09,  6.91s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.38619574904441833, 'learning_rate': 6.149024265957826e-06, 'epoch': 37.56}
- 75%|███████▌  | 4357/5800 [12:04:55<2:46:09,  6.91s/it]score1 tensor([[0.5039],
-        [0.5117],
-        [0.4824],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5312, 0.4941, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:32,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.36
-[2025-01-25 20:51:32,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.85 | bwd_microstep: 4636.08 | bwd_inner_microstep: 4631.24 | bwd_allreduce_microstep: 4.75 | step_microstep: 44.24
-[2025-01-25 20:51:32,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.80 | bwd: 4636.11 | bwd_inner: 4631.24 | bwd_allreduce: 4.80 | step: 44.24
- 75%|███████▌  | 4358/5800 [12:05:02<2:46:04,  6.91s/it]                                                        {'loss': 0.0117, 'grad_norm': 0.4474973976612091, 'learning_rate': 6.14097006624321e-06, 'epoch': 37.57}
- 75%|███████▌  | 4358/5800 [12:05:02<2:46:04,  6.91s/it]score1 tensor([[0.5977],
-        [0.4277],
-        [0.4512],
-        [0.3945]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4316, 0.4473, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:39,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.29 | optimizer_step: 4.37
-[2025-01-25 20:51:39,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.81 | bwd_microstep: 4633.02 | bwd_inner_microstep: 4627.59 | bwd_allreduce_microstep: 5.35 | step_microstep: 41.70
-[2025-01-25 20:51:39,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.78 | bwd: 4633.05 | bwd_inner: 4627.59 | bwd_allreduce: 5.39 | step: 41.71
- 75%|███████▌  | 4359/5800 [12:05:09<2:45:57,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.919682264328003, 'learning_rate': 6.132920188014595e-06, 'epoch': 37.58}
- 75%|███████▌  | 4359/5800 [12:05:09<2:45:57,  6.91s/it]score1 tensor([[0.5703],
-        [0.5938],
-        [0.4980],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.6055, 0.5000, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:46,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 20:51:46,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.60 | bwd_microstep: 4643.09 | bwd_inner_microstep: 4638.15 | bwd_allreduce_microstep: 4.84 | step_microstep: 47.51
-[2025-01-25 20:51:46,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.54 | bwd: 4643.12 | bwd_inner: 4638.15 | bwd_allreduce: 4.89 | step: 47.53
- 75%|███████▌  | 4360/5800 [12:05:16<2:45:57,  6.92s/it]                                                        {'loss': 0.0073, 'grad_norm': 8.419897079467773, 'learning_rate': 6.124874633782088e-06, 'epoch': 37.59}
- 75%|███████▌  | 4360/5800 [12:05:16<2:45:57,  6.92s/it]score1 tensor([[0.4023],
-        [0.4258],
-        [0.6289],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4297, 0.6289, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:51:53,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.27 | optimizer_step: 4.37
-[2025-01-25 20:51:53,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.82 | bwd_microstep: 4583.05 | bwd_inner_microstep: 4577.82 | bwd_allreduce_microstep: 5.13 | step_microstep: 45.09
-[2025-01-25 20:51:53,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.79 | bwd: 4583.07 | bwd_inner: 4577.82 | bwd_allreduce: 5.18 | step: 45.09
- 75%|███████▌  | 4361/5800 [12:05:23<2:45:30,  6.90s/it]                                                        {'loss': 0.0024, 'grad_norm': 1.9490580558776855, 'learning_rate': 6.1168334060544234e-06, 'epoch': 37.59}
- 75%|███████▌  | 4361/5800 [12:05:23<2:45:30,  6.90s/it]score1 tensor([[0.4688],
-        [0.6445],
-        [0.4492],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.6484, 0.4492, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:00,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.63 | optimizer_step: 4.37
-[2025-01-25 20:52:00,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.12 | bwd_microstep: 4547.10 | bwd_inner_microstep: 4542.35 | bwd_allreduce_microstep: 4.66 | step_microstep: 45.31
-[2025-01-25 20:52:00,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.09 | bwd: 4547.12 | bwd_inner: 4542.35 | bwd_allreduce: 4.71 | step: 45.32
- 75%|███████▌  | 4362/5800 [12:05:30<2:44:52,  6.88s/it]                                                        {'loss': 0.002, 'grad_norm': 4.280580997467041, 'learning_rate': 6.10879650733899e-06, 'epoch': 37.6}
- 75%|███████▌  | 4362/5800 [12:05:30<2:44:52,  6.88s/it]score1 tensor([[0.6055],
-        [0.5000],
-        [0.5547],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4941, 0.5625, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:06,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 20:52:06,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.20 | bwd_microstep: 4583.10 | bwd_inner_microstep: 4578.09 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.24
-[2025-01-25 20:52:06,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.17 | bwd: 4583.12 | bwd_inner: 4578.09 | bwd_allreduce: 4.95 | step: 45.25
- 75%|███████▌  | 4363/5800 [12:05:36<2:44:40,  6.88s/it]                                                        {'loss': 0.0044, 'grad_norm': 1.9178593158721924, 'learning_rate': 6.100763940141845e-06, 'epoch': 37.61}
- 75%|███████▌  | 4363/5800 [12:05:36<2:44:40,  6.88s/it]score1 tensor([[0.4473],
-        [0.4473],
-        [0.4121],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4570, 0.4141, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:13,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.26 | optimizer_step: 4.37
-[2025-01-25 20:52:13,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.66 | bwd_microstep: 4633.07 | bwd_inner_microstep: 4628.27 | bwd_allreduce_microstep: 4.72 | step_microstep: 46.28
-[2025-01-25 20:52:13,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.62 | bwd: 4633.09 | bwd_inner: 4628.27 | bwd_allreduce: 4.76 | step: 46.28
- 75%|███████▌  | 4364/5800 [12:05:43<2:44:48,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 0.5386146903038025, 'learning_rate': 6.0927357069676715e-06, 'epoch': 37.62}
- 75%|███████▌  | 4364/5800 [12:05:43<2:44:48,  6.89s/it]score1 tensor([[0.6484],
-        [0.4707],
-        [0.4746],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4785, 0.4785, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:20,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.35 | optimizer_step: 4.36
-[2025-01-25 20:52:20,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.58 | bwd_microstep: 4630.30 | bwd_inner_microstep: 4625.06 | bwd_allreduce_microstep: 5.12 | step_microstep: 40.68
-[2025-01-25 20:52:20,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.54 | bwd: 4630.33 | bwd_inner: 4625.06 | bwd_allreduce: 5.17 | step: 40.69
- 75%|███████▌  | 4365/5800 [12:05:50<2:44:44,  6.89s/it]                                                        {'loss': 0.0146, 'grad_norm': 3.8005783557891846, 'learning_rate': 6.084711810319821e-06, 'epoch': 37.63}
- 75%|███████▌  | 4365/5800 [12:05:50<2:44:44,  6.89s/it]score1 tensor([[0.6211],
-        [0.4219],
-        [0.5078],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4258, 0.5195, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:27,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 20:52:27,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.60 | bwd_microstep: 4635.67 | bwd_inner_microstep: 4630.89 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.98
-[2025-01-25 20:52:27,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.56 | bwd: 4635.69 | bwd_inner: 4630.89 | bwd_allreduce: 4.74 | step: 42.98
- 75%|███████▌  | 4366/5800 [12:05:57<2:44:46,  6.89s/it]                                                        {'loss': 0.0098, 'grad_norm': 1.0322186946868896, 'learning_rate': 6.07669225270028e-06, 'epoch': 37.64}
- 75%|███████▌  | 4366/5800 [12:05:57<2:44:46,  6.89s/it]score1 tensor([[0.3711],
-        [0.4395],
-        [0.5547],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4277, 0.5508, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:34,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 20:52:34,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4636.98 | bwd_inner_microstep: 4632.04 | bwd_allreduce_microstep: 4.86 | step_microstep: 44.21
-[2025-01-25 20:52:34,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.25 | bwd: 4637.01 | bwd_inner: 4632.04 | bwd_allreduce: 4.90 | step: 44.22
- 75%|███████▌  | 4367/5800 [12:06:04<2:44:51,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 4.194864749908447, 'learning_rate': 6.06867703660968e-06, 'epoch': 37.65}
- 75%|███████▌  | 4367/5800 [12:06:04<2:44:51,  6.90s/it]score1 tensor([[0.4180],
-        [0.6719],
-        [0.4863],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.6562, 0.4863, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:41,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 20:52:41,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.14 | bwd_microstep: 4545.86 | bwd_inner_microstep: 4540.94 | bwd_allreduce_microstep: 4.81 | step_microstep: 45.69
-[2025-01-25 20:52:41,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.09 | bwd: 4545.88 | bwd_inner: 4540.95 | bwd_allreduce: 4.86 | step: 45.69
- 75%|███████▌  | 4368/5800 [12:06:11<2:44:11,  6.88s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.8361756801605225, 'learning_rate': 6.060666164547315e-06, 'epoch': 37.66}
- 75%|███████▌  | 4368/5800 [12:06:11<2:44:11,  6.88s/it]score1 tensor([[0.4434],
-        [0.5430],
-        [0.4473],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.5273, 0.4492, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:48,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 20:52:48,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.63 | bwd_microstep: 4636.01 | bwd_inner_microstep: 4630.76 | bwd_allreduce_microstep: 5.13 | step_microstep: 46.22
-[2025-01-25 20:52:48,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.61 | bwd: 4636.04 | bwd_inner: 4630.76 | bwd_allreduce: 5.19 | step: 46.22
- 75%|███████▌  | 4369/5800 [12:06:18<2:44:18,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 4.211725234985352, 'learning_rate': 6.052659639011102e-06, 'epoch': 37.66}
- 75%|███████▌  | 4369/5800 [12:06:18<2:44:18,  6.89s/it]score1 tensor([[0.4160],
-        [0.4492],
-        [0.5352],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4570, 0.5273, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 20:52:55,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 20:52:55,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.68 | bwd_microstep: 4628.88 | bwd_inner_microstep: 4624.14 | bwd_allreduce_microstep: 4.66 | step_microstep: 41.06
-[2025-01-25 20:52:55,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.63 | bwd: 4628.91 | bwd_inner: 4624.14 | bwd_allreduce: 4.70 | step: 41.07
- 75%|███████▌  | 4370/5800 [12:06:25<2:44:22,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.6089096069335938, 'learning_rate': 6.044657462497628e-06, 'epoch': 37.67}
- 75%|███████▌  | 4370/5800 [12:06:25<2:44:22,  6.90s/it]evaluate!
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6445]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4980]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1270, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4102]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0566, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1035, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1895, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0684, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6445]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6055]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1348, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1484, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0449, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6367]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4297]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1055, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1680, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4648]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5156]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0742, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5742]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4473]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0977, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6016]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5977]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1523, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0996, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4355]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4941]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4336]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0508, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.666574588207346
-PLCC_score: 0.6709370985991026
-KRCC_score: 0.48612559939272676
-SRCC_level: 0.666574588207346
-PLCC_level: 0.6709370985991026
-KRCC_level: 0.48612559939272676
-score1 tensor([[0.6875],
-        [0.5391],
-        [0.3477],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6953, 0.5469, 0.3457, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:02:51,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 21:02:51,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.19 | bwd_microstep: 4599.96 | bwd_inner_microstep: 4594.80 | bwd_allreduce_microstep: 5.06 | step_microstep: 45.57
-[2025-01-25 21:02:51,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.14 | bwd: 4599.98 | bwd_inner: 4594.80 | bwd_allreduce: 5.11 | step: 45.58
- 75%|███████▌  | 4371/5800 [12:16:21<72:56:42, 183.77s/it]                                                          {'loss': 0.0049, 'grad_norm': 4.730427265167236, 'learning_rate': 6.036659637502096e-06, 'epoch': 37.68}
- 75%|███████▌  | 4371/5800 [12:16:21<72:56:42, 183.77s/it]score1 tensor([[0.5156],
-        [0.4062],
-        [0.4785],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.3945, 0.4824, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:02:58,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:02:58,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.82 | bwd_microstep: 4567.83 | bwd_inner_microstep: 4562.44 | bwd_allreduce_microstep: 5.29 | step_microstep: 46.97
-[2025-01-25 21:02:58,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.78 | bwd: 4567.86 | bwd_inner: 4562.44 | bwd_allreduce: 5.34 | step: 46.98
- 75%|███████▌  | 4372/5800 [12:16:28<51:50:20, 130.69s/it]                                                          {'loss': 0.0054, 'grad_norm': 3.8095006942749023, 'learning_rate': 6.02866616651838e-06, 'epoch': 37.69}
- 75%|███████▌  | 4372/5800 [12:16:28<51:50:20, 130.69s/it]score1 tensor([[0.5234],
-        [0.3535],
-        [0.5156],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.3477, 0.5273, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:05,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:03:05,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.34 | bwd_microstep: 4575.28 | bwd_inner_microstep: 4570.50 | bwd_allreduce_microstep: 4.71 | step_microstep: 42.62
-[2025-01-25 21:03:05,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.31 | bwd: 4575.30 | bwd_inner: 4570.50 | bwd_allreduce: 4.75 | step: 42.63
- 75%|███████▌  | 4373/5800 [12:16:35<37:04:24, 93.53s/it]                                                          {'loss': 0.0122, 'grad_norm': 0.6395776271820068, 'learning_rate': 6.02067705203897e-06, 'epoch': 37.7}
- 75%|███████▌  | 4373/5800 [12:16:35<37:04:24, 93.53s/it]score1 tensor([[0.5508],
-        [0.5469],
-        [0.5156],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5391, 0.5117, 0.3672], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:12,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 21:03:12,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.79 | bwd_microstep: 4535.08 | bwd_inner_microstep: 4530.19 | bwd_allreduce_microstep: 4.77 | step_microstep: 41.72
-[2025-01-25 21:03:12,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.73 | bwd: 4535.10 | bwd_inner: 4530.19 | bwd_allreduce: 4.83 | step: 41.73
- 75%|███████▌  | 4374/5800 [12:16:42<26:44:23, 67.51s/it]                                                         {'loss': 0.0049, 'grad_norm': 5.8710150718688965, 'learning_rate': 6.012692296555027e-06, 'epoch': 37.71}
- 75%|███████▌  | 4374/5800 [12:16:42<26:44:23, 67.51s/it]score1 tensor([[0.4434],
-        [0.5117],
-        [0.3516],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4980, 0.3457, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:19,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:03:19,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.61 | bwd_microstep: 4549.85 | bwd_inner_microstep: 4544.89 | bwd_allreduce_microstep: 4.84 | step_microstep: 45.61
-[2025-01-25 21:03:19,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.57 | bwd: 4549.87 | bwd_inner: 4544.89 | bwd_allreduce: 4.90 | step: 45.62
- 75%|███████▌  | 4375/5800 [12:16:48<19:30:47, 49.30s/it]                                                         {'loss': 0.0054, 'grad_norm': 1.8480650186538696, 'learning_rate': 6.004711902556317e-06, 'epoch': 37.72}
- 75%|███████▌  | 4375/5800 [12:16:48<19:30:47, 49.30s/it]score1 tensor([[0.6445],
-        [0.5742],
-        [0.6523],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5625, 0.6562, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:25,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 21:03:25,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.62 | bwd_microstep: 4544.45 | bwd_inner_microstep: 4539.22 | bwd_allreduce_microstep: 5.12 | step_microstep: 47.73
-[2025-01-25 21:03:25,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.58 | bwd: 4544.46 | bwd_inner: 4539.22 | bwd_allreduce: 5.17 | step: 47.74
- 75%|███████▌  | 4376/5800 [12:16:55<14:27:28, 36.55s/it]                                                         {'loss': 0.0073, 'grad_norm': 1.8675121068954468, 'learning_rate': 5.996735872531283e-06, 'epoch': 37.72}
- 75%|███████▌  | 4376/5800 [12:16:55<14:27:28, 36.55s/it]score1 tensor([[0.5469],
-        [0.4863],
-        [0.4121],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4863, 0.4082, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:32,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:03:32,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.34 | bwd_microstep: 4513.77 | bwd_inner_microstep: 4508.94 | bwd_allreduce_microstep: 4.73 | step_microstep: 43.24
-[2025-01-25 21:03:32,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.30 | bwd: 4513.79 | bwd_inner: 4508.94 | bwd_allreduce: 4.78 | step: 43.24
- 75%|███████▌  | 4377/5800 [12:17:02<10:55:00, 27.62s/it]                                                         {'loss': 0.002, 'grad_norm': 3.9189980030059814, 'learning_rate': 5.988764208966977e-06, 'epoch': 37.73}
- 75%|███████▌  | 4377/5800 [12:17:02<10:55:00, 27.62s/it]score1 tensor([[0.3047],
-        [0.4922],
-        [0.5625],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.4824, 0.5625, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:39,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 21:03:39,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.42 | bwd_microstep: 4557.58 | bwd_inner_microstep: 4552.85 | bwd_allreduce_microstep: 4.63 | step_microstep: 49.50
-[2025-01-25 21:03:39,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.38 | bwd: 4557.60 | bwd_inner: 4552.85 | bwd_allreduce: 4.68 | step: 49.51
- 75%|███████▌  | 4378/5800 [12:17:09<8:26:42, 21.38s/it]                                                         {'loss': 0.0049, 'grad_norm': 2.5884175300598145, 'learning_rate': 5.980796914349111e-06, 'epoch': 37.74}
- 75%|███████▌  | 4378/5800 [12:17:09<8:26:42, 21.38s/it]score1 tensor([[0.5352],
-        [0.4297],
-        [0.3535],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4395, 0.2812, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0254, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:46,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 21:03:46,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.71 | bwd_microstep: 4605.00 | bwd_inner_microstep: 4599.87 | bwd_allreduce_microstep: 5.04 | step_microstep: 43.67
-[2025-01-25 21:03:46,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.68 | bwd: 4605.02 | bwd_inner: 4599.87 | bwd_allreduce: 5.08 | step: 43.68
- 76%|███████▌  | 4379/5800 [12:17:16<6:43:15, 17.03s/it]                                                        {'loss': 0.0254, 'grad_norm': 0.41748201847076416, 'learning_rate': 5.972833991162017e-06, 'epoch': 37.75}
- 76%|███████▌  | 4379/5800 [12:17:16<6:43:15, 17.03s/it]score1 tensor([[0.5000],
-        [0.5508],
-        [0.5352],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5664, 0.5430, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:53,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:03:53,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2151.00 | bwd_microstep: 4559.29 | bwd_inner_microstep: 4554.35 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.67
-[2025-01-25 21:03:53,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.97 | bwd: 4559.31 | bwd_inner: 4554.35 | bwd_allreduce: 4.89 | step: 43.68
- 76%|███████▌  | 4380/5800 [12:17:23<5:30:33, 13.97s/it]                                                        {'loss': 0.0068, 'grad_norm': 6.277139663696289, 'learning_rate': 5.96487544188868e-06, 'epoch': 37.76}
- 76%|███████▌  | 4380/5800 [12:17:23<5:30:33, 13.97s/it]score1 tensor([[0.4492],
-        [0.5586],
-        [0.4434],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5664, 0.4473, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:03:59,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:03:59,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.93 | bwd_microstep: 4618.63 | bwd_inner_microstep: 4612.98 | bwd_allreduce_microstep: 5.55 | step_microstep: 44.25
-[2025-01-25 21:03:59,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.89 | bwd: 4618.65 | bwd_inner: 4612.98 | bwd_allreduce: 5.60 | step: 44.26
- 76%|███████▌  | 4381/5800 [12:17:29<4:40:06, 11.84s/it]                                                        {'loss': 0.0073, 'grad_norm': 8.16917896270752, 'learning_rate': 5.956921269010707e-06, 'epoch': 37.77}
- 76%|███████▌  | 4381/5800 [12:17:29<4:40:06, 11.84s/it]score1 tensor([[0.3398],
-        [0.4551],
-        [0.5508],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3516, 0.4590, 0.5625, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:06,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:04:06,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.85 | bwd_microstep: 4562.07 | bwd_inner_microstep: 4557.30 | bwd_allreduce_microstep: 4.66 | step_microstep: 44.04
-[2025-01-25 21:04:06,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.81 | bwd: 4562.09 | bwd_inner: 4557.30 | bwd_allreduce: 4.71 | step: 44.05
- 76%|███████▌  | 4382/5800 [12:17:36<4:04:22, 10.34s/it]                                                        {'loss': 0.0068, 'grad_norm': 5.6369242668151855, 'learning_rate': 5.948971475008359e-06, 'epoch': 37.78}
- 76%|███████▌  | 4382/5800 [12:17:36<4:04:22, 10.34s/it]score1 tensor([[0.5273],
-        [0.4883],
-        [0.4590],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4961, 0.4668, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:13,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:04:13,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.99 | bwd_microstep: 4611.46 | bwd_inner_microstep: 4606.83 | bwd_allreduce_microstep: 4.50 | step_microstep: 43.16
-[2025-01-25 21:04:13,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.96 | bwd: 4611.48 | bwd_inner: 4606.83 | bwd_allreduce: 4.55 | step: 43.16
- 76%|███████▌  | 4383/5800 [12:17:43<3:39:41,  9.30s/it]                                                        {'loss': 0.0068, 'grad_norm': 7.884087562561035, 'learning_rate': 5.941026062360502e-06, 'epoch': 37.78}
- 76%|███████▌  | 4383/5800 [12:17:43<3:39:41,  9.30s/it]score1 tensor([[0.5664],
-        [0.4883],
-        [0.6641],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4961, 0.6797, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:20,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.47 | optimizer_step: 4.37
-[2025-01-25 21:04:20,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.66 | bwd_microstep: 4616.24 | bwd_inner_microstep: 4611.37 | bwd_allreduce_microstep: 4.75 | step_microstep: 49.69
-[2025-01-25 21:04:20,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.63 | bwd: 4616.26 | bwd_inner: 4611.37 | bwd_allreduce: 4.81 | step: 49.69
- 76%|███████▌  | 4384/5800 [12:17:50<3:22:27,  8.58s/it]                                                        {'loss': 0.0098, 'grad_norm': 8.440515518188477, 'learning_rate': 5.9330850335446695e-06, 'epoch': 37.79}
- 76%|███████▌  | 4384/5800 [12:17:50<3:22:27,  8.58s/it]score1 tensor([[0.3809],
-        [0.5859],
-        [0.5078],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3809, 0.6016, 0.5078, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:27,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 21:04:27,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.79 | bwd_microstep: 4525.83 | bwd_inner_microstep: 4520.97 | bwd_allreduce_microstep: 4.73 | step_microstep: 41.93
-[2025-01-25 21:04:27,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.75 | bwd: 4525.85 | bwd_inner: 4520.98 | bwd_allreduce: 4.80 | step: 41.95
- 76%|███████▌  | 4385/5800 [12:17:57<3:09:40,  8.04s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.9896047115325928, 'learning_rate': 5.925148391037008e-06, 'epoch': 37.8}
- 76%|███████▌  | 4385/5800 [12:17:57<3:09:40,  8.04s/it]score1 tensor([[0.3906],
-        [0.4062],
-        [0.4824],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3867, 0.4023, 0.4922, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:34,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:04:34,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.74 | bwd_microstep: 4612.43 | bwd_inner_microstep: 4607.29 | bwd_allreduce_microstep: 5.01 | step_microstep: 47.80
-[2025-01-25 21:04:34,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.69 | bwd: 4612.45 | bwd_inner: 4607.28 | bwd_allreduce: 5.08 | step: 47.81
- 76%|███████▌  | 4386/5800 [12:18:04<3:01:25,  7.70s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.8571906089782715, 'learning_rate': 5.917216137312292e-06, 'epoch': 37.81}
- 76%|███████▌  | 4386/5800 [12:18:04<3:01:25,  7.70s/it]score1 tensor([[0.6133],
-        [0.5508],
-        [0.5820],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5469, 0.5898, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:41,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:04:41,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.55 | bwd_microstep: 4574.37 | bwd_inner_microstep: 4569.28 | bwd_allreduce_microstep: 5.00 | step_microstep: 46.44
-[2025-01-25 21:04:41,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.51 | bwd: 4574.40 | bwd_inner: 4569.28 | bwd_allreduce: 5.05 | step: 46.45
- 76%|███████▌  | 4387/5800 [12:18:11<2:55:23,  7.45s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.215244770050049, 'learning_rate': 5.9092882748439454e-06, 'epoch': 37.82}
- 76%|███████▌  | 4387/5800 [12:18:11<2:55:23,  7.45s/it]score1 tensor([[0.5273],
-        [0.5703],
-        [0.3848],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5625, 0.3945, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:48,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 21:04:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.66 | bwd_microstep: 4618.23 | bwd_inner_microstep: 4613.57 | bwd_allreduce_microstep: 4.57 | step_microstep: 44.67
-[2025-01-25 21:04:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.61 | bwd: 4618.25 | bwd_inner: 4613.57 | bwd_allreduce: 4.62 | step: 44.68
- 76%|███████▌  | 4388/5800 [12:18:18<2:51:20,  7.28s/it]                                                        {'loss': 0.0093, 'grad_norm': 3.8504092693328857, 'learning_rate': 5.901364806104007e-06, 'epoch': 37.83}
- 76%|███████▌  | 4388/5800 [12:18:18<2:51:20,  7.28s/it]score1 tensor([[0.6602],
-        [0.4512],
-        [0.4844],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4551, 0.4980, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:04:54,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:04:54,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.57 | bwd_microstep: 4611.17 | bwd_inner_microstep: 4606.17 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.28
-[2025-01-25 21:04:54,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.53 | bwd: 4611.20 | bwd_inner: 4606.17 | bwd_allreduce: 4.95 | step: 44.29
- 76%|███████▌  | 4389/5800 [12:18:24<2:48:26,  7.16s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.6056408286094666, 'learning_rate': 5.893445733563157e-06, 'epoch': 37.84}
- 76%|███████▌  | 4389/5800 [12:18:24<2:48:26,  7.16s/it]score1 tensor([[0.5859],
-        [0.4590],
-        [0.3438],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4512, 0.3398, 0.6367], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:01,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.37
-[2025-01-25 21:05:01,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.60 | bwd_microstep: 4614.51 | bwd_inner_microstep: 4608.45 | bwd_allreduce_microstep: 5.93 | step_microstep: 49.40
-[2025-01-25 21:05:01,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.57 | bwd: 4614.53 | bwd_inner: 4608.44 | bwd_allreduce: 6.00 | step: 49.40
- 76%|███████▌  | 4390/5800 [12:18:31<2:46:24,  7.08s/it]                                                        {'loss': 0.0098, 'grad_norm': 1.31183922290802, 'learning_rate': 5.885531059690694e-06, 'epoch': 37.84}
- 76%|███████▌  | 4390/5800 [12:18:31<2:46:24,  7.08s/it]score1 tensor([[0.4629],
-        [0.3789],
-        [0.4473],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.3750, 0.4473, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:08,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 21:05:08,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.34 | bwd_microstep: 4568.39 | bwd_inner_microstep: 4563.17 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.61
-[2025-01-25 21:05:08,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.30 | bwd: 4568.41 | bwd_inner: 4563.17 | bwd_allreduce: 5.17 | step: 43.62
- 76%|███████▌  | 4391/5800 [12:18:38<2:44:38,  7.01s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.7661375999450684, 'learning_rate': 5.877620786954559e-06, 'epoch': 37.85}
- 76%|███████▌  | 4391/5800 [12:18:38<2:44:38,  7.01s/it]score1 tensor([[0.4395],
-        [0.5898],
-        [0.6055],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.5781, 0.6055, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:15,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 21:05:15,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.01 | bwd_microstep: 4575.68 | bwd_inner_microstep: 4570.66 | bwd_allreduce_microstep: 4.92 | step_microstep: 46.10
-[2025-01-25 21:05:15,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.97 | bwd: 4575.70 | bwd_inner: 4570.66 | bwd_allreduce: 4.97 | step: 46.11
- 76%|███████▌  | 4392/5800 [12:18:45<2:43:23,  6.96s/it]                                                        {'loss': 0.0063, 'grad_norm': 6.3065080642700195, 'learning_rate': 5.869714917821307e-06, 'epoch': 37.86}
- 76%|███████▌  | 4392/5800 [12:18:45<2:43:23,  6.96s/it]score1 tensor([[0.3945],
-        [0.3730],
-        [0.4512],
-        [0.6406]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.3750, 0.4590, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:22,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 21:05:22,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.54 | bwd_microstep: 4616.39 | bwd_inner_microstep: 4611.73 | bwd_allreduce_microstep: 4.57 | step_microstep: 43.38
-[2025-01-25 21:05:22,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.50 | bwd: 4616.42 | bwd_inner: 4611.73 | bwd_allreduce: 4.62 | step: 43.38
- 76%|███████▌  | 4393/5800 [12:18:52<2:42:46,  6.94s/it]                                                        {'loss': 0.0039, 'grad_norm': 4.191023349761963, 'learning_rate': 5.86181345475612e-06, 'epoch': 37.87}
- 76%|███████▌  | 4393/5800 [12:18:52<2:42:46,  6.94s/it]score1 tensor([[0.4180],
-        [0.4512],
-        [0.4414],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4453, 0.4277, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:29,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 21:05:29,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.87 | bwd_microstep: 4613.45 | bwd_inner_microstep: 4608.44 | bwd_allreduce_microstep: 4.92 | step_microstep: 45.12
-[2025-01-25 21:05:29,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.82 | bwd: 4613.47 | bwd_inner: 4608.44 | bwd_allreduce: 4.96 | step: 45.12
- 76%|███████▌  | 4394/5800 [12:18:59<2:42:19,  6.93s/it]                                                        {'loss': 0.0112, 'grad_norm': 7.482447624206543, 'learning_rate': 5.853916400222823e-06, 'epoch': 37.88}
- 76%|███████▌  | 4394/5800 [12:18:59<2:42:19,  6.93s/it]score1 tensor([[0.5586],
-        [0.3672],
-        [0.4805],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.3652, 0.4766, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:36,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 21:05:36,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.07 | bwd_microstep: 4622.34 | bwd_inner_microstep: 4617.34 | bwd_allreduce_microstep: 4.91 | step_microstep: 44.76
-[2025-01-25 21:05:36,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.03 | bwd: 4622.36 | bwd_inner: 4617.34 | bwd_allreduce: 4.95 | step: 44.76
- 76%|███████▌  | 4395/5800 [12:19:06<2:42:03,  6.92s/it]                                                        {'loss': 0.0044, 'grad_norm': 8.000894546508789, 'learning_rate': 5.846023756683841e-06, 'epoch': 37.89}
- 76%|███████▌  | 4395/5800 [12:19:06<2:42:03,  6.92s/it]score1 tensor([[0.5938],
-        [0.5703],
-        [0.6055],
-        [0.6055]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.5703, 0.6055, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:43,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 21:05:43,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.25 | bwd_microstep: 4533.14 | bwd_inner_microstep: 4528.20 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.27
-[2025-01-25 21:05:43,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.22 | bwd: 4533.16 | bwd_inner: 4528.20 | bwd_allreduce: 4.88 | step: 43.27
- 76%|███████▌  | 4396/5800 [12:19:12<2:41:09,  6.89s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.506809711456299, 'learning_rate': 5.838135526600253e-06, 'epoch': 37.9}
- 76%|███████▌  | 4396/5800 [12:19:12<2:41:09,  6.89s/it]score1 tensor([[0.5469],
-        [0.4141],
-        [0.4766],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4121, 0.4961, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:49,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.26 | optimizer_step: 4.37
-[2025-01-25 21:05:49,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4613.13 | bwd_inner_microstep: 4609.03 | bwd_allreduce_microstep: 4.03 | step_microstep: 40.55
-[2025-01-25 21:05:49,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.42 | bwd: 4613.15 | bwd_inner: 4609.03 | bwd_allreduce: 4.06 | step: 40.56
- 76%|███████▌  | 4397/5800 [12:19:19<2:41:05,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 0.5065122246742249, 'learning_rate': 5.830251712431734e-06, 'epoch': 37.91}
- 76%|███████▌  | 4397/5800 [12:19:19<2:41:05,  6.89s/it]score1 tensor([[0.6250],
-        [0.5352],
-        [0.5391],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5234, 0.5508, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:05:56,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.88 | optimizer_step: 4.36
-[2025-01-25 21:05:56,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.83 | bwd_microstep: 4537.03 | bwd_inner_microstep: 4532.23 | bwd_allreduce_microstep: 4.72 | step_microstep: 46.40
-[2025-01-25 21:05:56,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.80 | bwd: 4537.06 | bwd_inner: 4532.23 | bwd_allreduce: 4.75 | step: 46.41
- 76%|███████▌  | 4398/5800 [12:19:26<2:40:33,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.2234552502632141, 'learning_rate': 5.822372316636593e-06, 'epoch': 37.91}
- 76%|███████▌  | 4398/5800 [12:19:26<2:40:33,  6.87s/it]score1 tensor([[0.6758],
-        [0.5078],
-        [0.4160],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6797, 0.5078, 0.4219, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:03,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 21:06:03,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.19 | bwd_microstep: 4565.73 | bwd_inner_microstep: 4560.80 | bwd_allreduce_microstep: 4.84 | step_microstep: 43.47
-[2025-01-25 21:06:03,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.16 | bwd: 4565.75 | bwd_inner: 4560.80 | bwd_allreduce: 4.89 | step: 43.48
- 76%|███████▌  | 4399/5800 [12:19:33<2:40:12,  6.86s/it]                                                        {'loss': 0.0044, 'grad_norm': 13.34214973449707, 'learning_rate': 5.814497341671775e-06, 'epoch': 37.92}
- 76%|███████▌  | 4399/5800 [12:19:33<2:40:12,  6.86s/it]score1 tensor([[0.4961],
-        [0.3730],
-        [0.6523],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.3750, 0.6641, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:10,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:06:10,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4623.99 | bwd_inner_microstep: 4618.96 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.08
-[2025-01-25 21:06:10,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.61 | bwd: 4624.02 | bwd_inner: 4618.96 | bwd_allreduce: 4.97 | step: 45.09
- 76%|███████▌  | 4400/5800 [12:19:40<2:40:23,  6.87s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.271470069885254, 'learning_rate': 5.806626789992818e-06, 'epoch': 37.93}
- 76%|███████▌  | 4400/5800 [12:19:40<2:40:23,  6.87s/it]score1 tensor([[0.4492],
-        [0.5078],
-        [0.5430],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5000, 0.5508, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:17,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:06:17,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.80 | bwd_microstep: 4621.99 | bwd_inner_microstep: 4617.27 | bwd_allreduce_microstep: 4.65 | step_microstep: 45.64
-[2025-01-25 21:06:17,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.74 | bwd: 4622.02 | bwd_inner: 4617.27 | bwd_allreduce: 4.69 | step: 45.64
- 76%|███████▌  | 4401/5800 [12:19:47<2:40:27,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.8415157794952393, 'learning_rate': 5.798760664053911e-06, 'epoch': 37.94}
- 76%|███████▌  | 4401/5800 [12:19:47<2:40:27,  6.88s/it]score1 tensor([[0.4805],
-        [0.5664],
-        [0.4766],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.6055, 0.4668, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0142, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:24,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 21:06:24,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.58 | bwd_microstep: 4613.80 | bwd_inner_microstep: 4609.12 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.81
-[2025-01-25 21:06:24,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.55 | bwd: 4613.83 | bwd_inner: 4609.12 | bwd_allreduce: 4.64 | step: 42.82
- 76%|███████▌  | 4402/5800 [12:19:54<2:40:22,  6.88s/it]                                                        {'loss': 0.0142, 'grad_norm': 0.34722691774368286, 'learning_rate': 5.790898966307834e-06, 'epoch': 37.95}
- 76%|███████▌  | 4402/5800 [12:19:54<2:40:22,  6.88s/it]score1 tensor([[0.5273],
-        [0.4746],
-        [0.5234],
-        [0.4102]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4785, 0.5039, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:31,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 21:06:31,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.59 | bwd_microstep: 4617.35 | bwd_inner_microstep: 4612.38 | bwd_allreduce_microstep: 4.87 | step_microstep: 46.56
-[2025-01-25 21:06:31,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.54 | bwd: 4617.37 | bwd_inner: 4612.38 | bwd_allreduce: 4.92 | step: 46.57
- 76%|███████▌  | 4403/5800 [12:20:01<2:40:24,  6.89s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.6152459979057312, 'learning_rate': 5.7830416992060155e-06, 'epoch': 37.96}
- 76%|███████▌  | 4403/5800 [12:20:01<2:40:24,  6.89s/it]score1 tensor([[0.5273],
-        [0.6719],
-        [0.5742],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.6602, 0.5820, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:38,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:06:38,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.49 | bwd_microstep: 4577.15 | bwd_inner_microstep: 4566.40 | bwd_allreduce_microstep: 10.65 | step_microstep: 46.51
-[2025-01-25 21:06:38,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.45 | bwd: 4577.17 | bwd_inner: 4566.40 | bwd_allreduce: 10.70 | step: 46.51
- 76%|███████▌  | 4404/5800 [12:20:08<2:40:04,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.2982735633850098, 'learning_rate': 5.775188865198483e-06, 'epoch': 37.97}
- 76%|███████▌  | 4404/5800 [12:20:08<2:40:04,  6.88s/it]score1 tensor([[0.4707],
-        [0.5898],
-        [0.6914],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5977, 0.6875, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:44,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 21:06:44,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.90 | bwd_microstep: 4624.00 | bwd_inner_microstep: 4618.97 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.32
-[2025-01-25 21:06:44,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.87 | bwd: 4624.02 | bwd_inner: 4618.97 | bwd_allreduce: 4.97 | step: 45.33
- 76%|███████▌  | 4405/5800 [12:20:14<2:40:09,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.5145308971405029, 'learning_rate': 5.767340466733875e-06, 'epoch': 37.97}
- 76%|███████▌  | 4405/5800 [12:20:14<2:40:09,  6.89s/it]score1 tensor([[0.5469],
-        [0.4043],
-        [0.4941],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4160, 0.4941, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:51,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 21:06:51,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.62 | bwd_microstep: 4578.11 | bwd_inner_microstep: 4573.47 | bwd_allreduce_microstep: 4.55 | step_microstep: 44.13
-[2025-01-25 21:06:51,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.57 | bwd: 4578.13 | bwd_inner: 4573.47 | bwd_allreduce: 4.60 | step: 44.14
- 76%|███████▌  | 4406/5800 [12:20:21<2:39:49,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.520212173461914, 'learning_rate': 5.759496506259476e-06, 'epoch': 37.98}
- 76%|███████▌  | 4406/5800 [12:20:21<2:39:49,  6.88s/it]score1 tensor([[0.4434],
-        [0.4512],
-        [0.4883],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.4609, 0.4902, 0.4414], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:06:58,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:06:58,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.99 | bwd_microstep: 4626.34 | bwd_inner_microstep: 4621.14 | bwd_allreduce_microstep: 5.09 | step_microstep: 48.79
-[2025-01-25 21:06:58,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.95 | bwd: 4626.36 | bwd_inner: 4621.14 | bwd_allreduce: 5.15 | step: 48.80
- 76%|███████▌  | 4407/5800 [12:20:28<2:39:52,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 7.562690258026123, 'learning_rate': 5.751656986221154e-06, 'epoch': 37.99}
- 76%|███████▌  | 4407/5800 [12:20:28<2:39:52,  6.89s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:07:03,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 21:07:03,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 572.83 | bwd_microstep: 1220.36 | bwd_inner_microstep: 1215.83 | bwd_allreduce_microstep: 4.43 | step_microstep: 47.22
-[2025-01-25 21:07:03,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 572.80 | bwd: 1220.39 | bwd_inner: 1215.83 | bwd_allreduce: 4.48 | step: 47.23
- 76%|███████▌  | 4408/5800 [12:20:33<2:27:56,  6.38s/it]                                                        {'loss': 0.002, 'grad_norm': 7.7957763671875, 'learning_rate': 5.7438219090634205e-06, 'epoch': 38.0}
- 76%|███████▌  | 4408/5800 [12:20:33<2:27:56,  6.38s/it][2025-01-25 21:07:08,340] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:07:18,182] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:07:28,611] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:07:38,627] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.6094],
-        [0.5156],
-        [0.4648],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5234, 0.4648, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:00,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 21:08:00,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.52 | bwd_microstep: 4536.96 | bwd_inner_microstep: 4532.16 | bwd_allreduce_microstep: 4.69 | step_microstep: 44.78
-[2025-01-25 21:08:00,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.48 | bwd: 4536.98 | bwd_inner: 4532.16 | bwd_allreduce: 4.75 | step: 44.78
- 76%|███████▌  | 4409/5800 [12:21:30<8:16:16, 21.41s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.165624618530273, 'learning_rate': 5.735991277229373e-06, 'epoch': 38.01}
- 76%|███████▌  | 4409/5800 [12:21:30<8:16:16, 21.41s/it]score1 tensor([[0.5625],
-        [0.5469],
-        [0.4492],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.5508, 0.4512, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:07,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 21:08:07,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2130.83 | bwd_microstep: 4584.39 | bwd_inner_microstep: 4579.82 | bwd_allreduce_microstep: 4.49 | step_microstep: 42.42
-[2025-01-25 21:08:07,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2130.80 | bwd: 4584.41 | bwd_inner: 4579.82 | bwd_allreduce: 4.53 | step: 42.42
- 76%|███████▌  | 4410/5800 [12:21:37<6:34:37, 17.03s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.00757360458374, 'learning_rate': 5.728165093160758e-06, 'epoch': 38.02}
- 76%|███████▌  | 4410/5800 [12:21:37<6:34:37, 17.03s/it]score1 tensor([[0.5469],
-        [0.5039],
-        [0.4512],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5156, 0.4551, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:14,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 21:08:14,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.31 | bwd_microstep: 4591.44 | bwd_inner_microstep: 4586.73 | bwd_allreduce_microstep: 4.60 | step_microstep: 44.01
-[2025-01-25 21:08:14,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.28 | bwd: 4591.47 | bwd_inner: 4586.73 | bwd_allreduce: 4.65 | step: 44.02
- 76%|███████▌  | 4411/5800 [12:21:44<5:23:41, 13.98s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.5113339424133301, 'learning_rate': 5.720343359297895e-06, 'epoch': 38.03}
- 76%|███████▌  | 4411/5800 [12:21:44<5:23:41, 13.98s/it]score1 tensor([[0.5312],
-        [0.6836],
-        [0.6562],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6797, 0.6562, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:20,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:08:20,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2148.11 | bwd_microstep: 4505.96 | bwd_inner_microstep: 4501.45 | bwd_allreduce_microstep: 4.42 | step_microstep: 43.12
-[2025-01-25 21:08:20,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2148.08 | bwd: 4505.98 | bwd_inner: 4501.45 | bwd_allreduce: 4.46 | step: 43.13
- 76%|███████▌  | 4412/5800 [12:21:50<4:33:23, 11.82s/it]                                                        {'loss': 0.002, 'grad_norm': 0.6093418002128601, 'learning_rate': 5.712526078079754e-06, 'epoch': 38.03}
- 76%|███████▌  | 4412/5800 [12:21:50<4:33:23, 11.82s/it]score1 tensor([[0.5000],
-        [0.6953],
-        [0.4414],
-        [0.6914]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.7070, 0.4336, 0.7031], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:27,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 21:08:27,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.03 | bwd_microstep: 4601.68 | bwd_inner_microstep: 4597.04 | bwd_allreduce_microstep: 4.55 | step_microstep: 43.51
-[2025-01-25 21:08:27,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.99 | bwd: 4601.70 | bwd_inner: 4597.04 | bwd_allreduce: 4.59 | step: 43.52
- 76%|███████▌  | 4413/5800 [12:21:57<3:58:51, 10.33s/it]                                                        {'loss': 0.0093, 'grad_norm': 1.4646799564361572, 'learning_rate': 5.704713251943885e-06, 'epoch': 38.04}
- 76%|███████▌  | 4413/5800 [12:21:57<3:58:51, 10.33s/it]score1 tensor([[0.4902],
-        [0.5352],
-        [0.3691],
-        [0.6211]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5391, 0.3750, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:34,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:08:34,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.10 | bwd_microstep: 4611.60 | bwd_inner_microstep: 4606.99 | bwd_allreduce_microstep: 4.53 | step_microstep: 43.46
-[2025-01-25 21:08:34,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.06 | bwd: 4611.62 | bwd_inner: 4606.99 | bwd_allreduce: 4.57 | step: 43.48
- 76%|███████▌  | 4414/5800 [12:22:04<3:34:45,  9.30s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.7349172830581665, 'learning_rate': 5.696904883326475e-06, 'epoch': 38.05}
- 76%|███████▌  | 4414/5800 [12:22:04<3:34:45,  9.30s/it]score1 tensor([[0.5508],
-        [0.5039],
-        [0.5430],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.4941, 0.5430, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:41,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 21:08:41,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.80 | bwd_microstep: 4560.11 | bwd_inner_microstep: 4555.07 | bwd_allreduce_microstep: 4.94 | step_microstep: 43.67
-[2025-01-25 21:08:41,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.76 | bwd: 4560.14 | bwd_inner: 4555.07 | bwd_allreduce: 5.00 | step: 43.68
- 76%|███████▌  | 4415/5800 [12:22:11<3:17:30,  8.56s/it]                                                        {'loss': 0.0073, 'grad_norm': 2.297354221343994, 'learning_rate': 5.689100974662296e-06, 'epoch': 38.06}
- 76%|███████▌  | 4415/5800 [12:22:11<3:17:30,  8.56s/it]score1 tensor([[0.4258],
-        [0.4512],
-        [0.6875],
-        [0.3555]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4434, 0.6875, 0.3438], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:48,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.36
-[2025-01-25 21:08:48,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.13 | bwd_microstep: 4557.04 | bwd_inner_microstep: 4552.15 | bwd_allreduce_microstep: 4.79 | step_microstep: 43.03
-[2025-01-25 21:08:48,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.09 | bwd: 4557.06 | bwd_inner: 4552.15 | bwd_allreduce: 4.84 | step: 43.04
- 76%|███████▌  | 4416/5800 [12:22:18<3:05:22,  8.04s/it]                                                        {'loss': 0.0059, 'grad_norm': 5.390426158905029, 'learning_rate': 5.681301528384755e-06, 'epoch': 38.07}
- 76%|███████▌  | 4416/5800 [12:22:18<3:05:22,  8.04s/it]score1 tensor([[0.5234],
-        [0.5859],
-        [0.4648],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.5781, 0.4629, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:08:55,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.87 | optimizer_step: 4.36
-[2025-01-25 21:08:55,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.62 | bwd_microstep: 4613.40 | bwd_inner_microstep: 4608.16 | bwd_allreduce_microstep: 5.16 | step_microstep: 43.31
-[2025-01-25 21:08:55,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.58 | bwd: 4613.42 | bwd_inner: 4608.16 | bwd_allreduce: 5.20 | step: 43.32
- 76%|███████▌  | 4417/5800 [12:22:25<2:57:16,  7.69s/it]                                                        {'loss': 0.0088, 'grad_norm': 8.105965614318848, 'learning_rate': 5.673506546925842e-06, 'epoch': 38.08}
- 76%|███████▌  | 4417/5800 [12:22:25<2:57:16,  7.69s/it]score1 tensor([[0.4863],
-        [0.5391],
-        [0.4844],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.5430, 0.4746, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:01,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.36
-[2025-01-25 21:09:01,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.91 | bwd_microstep: 4605.56 | bwd_inner_microstep: 4600.88 | bwd_allreduce_microstep: 4.60 | step_microstep: 46.27
-[2025-01-25 21:09:01,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.87 | bwd: 4605.58 | bwd_inner: 4600.88 | bwd_allreduce: 4.63 | step: 46.28
- 76%|███████▌  | 4418/5800 [12:22:31<2:51:32,  7.45s/it]                                                        {'loss': 0.0103, 'grad_norm': 3.8761298656463623, 'learning_rate': 5.6657160327161794e-06, 'epoch': 38.09}
- 76%|███████▌  | 4418/5800 [12:22:31<2:51:32,  7.45s/it]score1 tensor([[0.5508],
-        [0.6094],
-        [0.5859],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.6094, 0.5898, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:08,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:09:08,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.17 | bwd_microstep: 4566.83 | bwd_inner_microstep: 4561.30 | bwd_allreduce_microstep: 5.44 | step_microstep: 44.28
-[2025-01-25 21:09:08,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.12 | bwd: 4566.85 | bwd_inner: 4561.30 | bwd_allreduce: 5.49 | step: 44.29
- 76%|███████▌  | 4419/5800 [12:22:38<2:47:14,  7.27s/it]                                                        {'loss': 0.0049, 'grad_norm': 2.331801414489746, 'learning_rate': 5.6579299881849755e-06, 'epoch': 38.09}
- 76%|███████▌  | 4419/5800 [12:22:38<2:47:14,  7.27s/it]score1 tensor([[0.6328],
-        [0.4492],
-        [0.5391],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4414, 0.5391, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:15,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.37
-[2025-01-25 21:09:15,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.18 | bwd_microstep: 4576.83 | bwd_inner_microstep: 4572.00 | bwd_allreduce_microstep: 4.71 | step_microstep: 43.87
-[2025-01-25 21:09:15,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.14 | bwd: 4576.86 | bwd_inner: 4572.00 | bwd_allreduce: 4.77 | step: 43.88
- 76%|███████▌  | 4420/5800 [12:22:45<2:44:16,  7.14s/it]                                                        {'loss': 0.0049, 'grad_norm': 2.9238040447235107, 'learning_rate': 5.650148415760062e-06, 'epoch': 38.1}
- 76%|███████▌  | 4420/5800 [12:22:45<2:44:16,  7.14s/it]score1 tensor([[0.6367],
-        [0.5117],
-        [0.5742],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4961, 0.5664, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:22,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:09:22,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.77 | bwd_microstep: 4613.51 | bwd_inner_microstep: 4608.44 | bwd_allreduce_microstep: 4.99 | step_microstep: 47.06
-[2025-01-25 21:09:22,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.74 | bwd: 4613.53 | bwd_inner: 4608.44 | bwd_allreduce: 5.03 | step: 47.06
- 76%|███████▌  | 4421/5800 [12:22:52<2:42:25,  7.07s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.9662232398986816, 'learning_rate': 5.6423713178678585e-06, 'epoch': 38.11}
- 76%|███████▌  | 4421/5800 [12:22:52<2:42:25,  7.07s/it]score1 tensor([[0.4941],
-        [0.4570],
-        [0.4766],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4805, 0.4492, 0.4707, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:29,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 21:09:29,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.50 | bwd_microstep: 4616.18 | bwd_inner_microstep: 4610.88 | bwd_allreduce_microstep: 5.20 | step_microstep: 46.06
-[2025-01-25 21:09:29,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.46 | bwd: 4616.20 | bwd_inner: 4610.88 | bwd_allreduce: 5.25 | step: 46.07
- 76%|███████▌  | 4422/5800 [12:22:59<2:41:08,  7.02s/it]                                                        {'loss': 0.0073, 'grad_norm': 7.661129951477051, 'learning_rate': 5.634598696933411e-06, 'epoch': 38.12}
- 76%|███████▌  | 4422/5800 [12:22:59<2:41:08,  7.02s/it]score1 tensor([[0.4648],
-        [0.5547],
-        [0.5039],
-        [0.3340]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.5469, 0.4980, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:36,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:09:36,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.75 | bwd_microstep: 4616.44 | bwd_inner_microstep: 4611.62 | bwd_allreduce_microstep: 4.70 | step_microstep: 44.00
-[2025-01-25 21:09:36,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.70 | bwd: 4616.47 | bwd_inner: 4611.62 | bwd_allreduce: 4.77 | step: 44.01
- 76%|███████▋  | 4423/5800 [12:23:06<2:40:11,  6.98s/it]                                                        {'loss': 0.0083, 'grad_norm': 4.446447849273682, 'learning_rate': 5.626830555380352e-06, 'epoch': 38.13}
- 76%|███████▋  | 4423/5800 [12:23:06<2:40:11,  6.98s/it]score1 tensor([[0.5352],
-        [0.4824],
-        [0.4590],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4648, 0.4531, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:43,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 21:09:43,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.69 | bwd_microstep: 4621.38 | bwd_inner_microstep: 4615.80 | bwd_allreduce_microstep: 5.45 | step_microstep: 43.88
-[2025-01-25 21:09:43,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.66 | bwd: 4621.41 | bwd_inner: 4615.80 | bwd_allreduce: 5.52 | step: 43.89
- 76%|███████▋  | 4424/5800 [12:23:13<2:39:32,  6.96s/it]                                                        {'loss': 0.0117, 'grad_norm': 0.7067342400550842, 'learning_rate': 5.619066895630913e-06, 'epoch': 38.14}
- 76%|███████▋  | 4424/5800 [12:23:13<2:39:32,  6.96s/it]score1 tensor([[0.6055],
-        [0.4199],
-        [0.4512],
-        [0.4160]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4180, 0.4473, 0.4062], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:50,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 21:09:50,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.84 | bwd_microstep: 4626.72 | bwd_inner_microstep: 4621.98 | bwd_allreduce_microstep: 4.64 | step_microstep: 43.63
-[2025-01-25 21:09:50,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.81 | bwd: 4626.74 | bwd_inner: 4621.98 | bwd_allreduce: 4.70 | step: 43.64
- 76%|███████▋  | 4425/5800 [12:23:20<2:39:01,  6.94s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.303388833999634, 'learning_rate': 5.611307720105956e-06, 'epoch': 38.15}
- 76%|███████▋  | 4425/5800 [12:23:20<2:39:01,  6.94s/it]score1 tensor([[0.4180],
-        [0.5078],
-        [0.6445],
-        [0.5078]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5039, 0.6445, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:09:56,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:09:56,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.51 | bwd_microstep: 4542.71 | bwd_inner_microstep: 4538.05 | bwd_allreduce_microstep: 4.56 | step_microstep: 41.76
-[2025-01-25 21:09:56,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.47 | bwd: 4542.73 | bwd_inner: 4538.05 | bwd_allreduce: 4.61 | step: 41.76
- 76%|███████▋  | 4426/5800 [12:23:26<2:38:03,  6.90s/it]                                                        {'loss': 0.002, 'grad_norm': 4.062989711761475, 'learning_rate': 5.6035530312249105e-06, 'epoch': 38.16}
- 76%|███████▋  | 4426/5800 [12:23:26<2:38:03,  6.90s/it]score1 tensor([[0.4883],
-        [0.4902],
-        [0.4062],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5039, 0.4023, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:03,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:10:03,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.51 | bwd_microstep: 4575.94 | bwd_inner_microstep: 4571.23 | bwd_allreduce_microstep: 4.62 | step_microstep: 41.62
-[2025-01-25 21:10:03,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.47 | bwd: 4575.96 | bwd_inner: 4571.23 | bwd_allreduce: 4.66 | step: 41.63
- 76%|███████▋  | 4427/5800 [12:23:33<2:37:33,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.1937663555145264, 'learning_rate': 5.595802831405837e-06, 'epoch': 38.16}
- 76%|███████▋  | 4427/5800 [12:23:33<2:37:33,  6.89s/it]score1 tensor([[0.5664],
-        [0.5547],
-        [0.5703],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5547, 0.5508, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:10,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:10:10,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4568.89 | bwd_inner_microstep: 4564.23 | bwd_allreduce_microstep: 4.57 | step_microstep: 42.32
-[2025-01-25 21:10:10,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4568.91 | bwd_inner: 4564.23 | bwd_allreduce: 4.62 | step: 42.33
- 76%|███████▋  | 4428/5800 [12:23:40<2:37:07,  6.87s/it]                                                        {'loss': 0.0068, 'grad_norm': 6.346003532409668, 'learning_rate': 5.5880571230653735e-06, 'epoch': 38.17}
- 76%|███████▋  | 4428/5800 [12:23:40<2:37:07,  6.87s/it]score1 tensor([[0.4902],
-        [0.3379],
-        [0.4688],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.3477, 0.4727, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:17,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 21:10:17,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.34 | bwd_microstep: 4622.73 | bwd_inner_microstep: 4618.08 | bwd_allreduce_microstep: 4.57 | step_microstep: 42.12
-[2025-01-25 21:10:17,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.31 | bwd: 4622.75 | bwd_inner: 4618.08 | bwd_allreduce: 4.61 | step: 42.12
- 76%|███████▋  | 4429/5800 [12:23:47<2:37:09,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 3.4481163024902344, 'learning_rate': 5.580315908618761e-06, 'epoch': 38.18}
- 76%|███████▋  | 4429/5800 [12:23:47<2:37:09,  6.88s/it]score1 tensor([[0.4629],
-        [0.4668],
-        [0.4668],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.4668, 0.4785, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:24,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:10:24,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.08 | bwd_microstep: 4566.37 | bwd_inner_microstep: 4561.43 | bwd_allreduce_microstep: 4.83 | step_microstep: 46.60
-[2025-01-25 21:10:24,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.04 | bwd: 4566.39 | bwd_inner: 4561.43 | bwd_allreduce: 4.88 | step: 46.60
- 76%|███████▋  | 4430/5800 [12:23:54<2:36:51,  6.87s/it]                                                        {'loss': 0.0049, 'grad_norm': 5.632482051849365, 'learning_rate': 5.5725791904798585e-06, 'epoch': 38.19}
- 76%|███████▋  | 4430/5800 [12:23:54<2:36:51,  6.87s/it]score1 tensor([[0.3535],
-        [0.3984],
-        [0.6562],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.4141, 0.6875, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:31,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.36
-[2025-01-25 21:10:31,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.75 | bwd_microstep: 4628.91 | bwd_inner_microstep: 4624.16 | bwd_allreduce_microstep: 4.66 | step_microstep: 42.23
-[2025-01-25 21:10:31,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.73 | bwd: 4628.94 | bwd_inner: 4624.16 | bwd_allreduce: 4.71 | step: 42.24
- 76%|███████▋  | 4431/5800 [12:24:01<2:36:57,  6.88s/it]                                                        {'loss': 0.0176, 'grad_norm': 3.5339696407318115, 'learning_rate': 5.564846971061097e-06, 'epoch': 38.2}
- 76%|███████▋  | 4431/5800 [12:24:01<2:36:57,  6.88s/it]score1 tensor([[0.3594],
-        [0.3535],
-        [0.4863],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.3652, 0.4941, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:38,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:10:38,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.20 | bwd_microstep: 4623.55 | bwd_inner_microstep: 4618.62 | bwd_allreduce_microstep: 4.83 | step_microstep: 43.42
-[2025-01-25 21:10:38,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.17 | bwd: 4623.58 | bwd_inner: 4618.62 | bwd_allreduce: 4.88 | step: 43.43
- 76%|███████▋  | 4432/5800 [12:24:08<2:37:01,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 7.096024990081787, 'learning_rate': 5.557119252773529e-06, 'epoch': 38.21}
- 76%|███████▋  | 4432/5800 [12:24:08<2:37:01,  6.89s/it]score1 tensor([[0.5273],
-        [0.5781],
-        [0.6328],
-        [0.3320]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.5781, 0.6250, 0.3555], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:45,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:10:45,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.66 | bwd_microstep: 4577.35 | bwd_inner_microstep: 4572.52 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.12
-[2025-01-25 21:10:45,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.62 | bwd: 4577.38 | bwd_inner: 4572.52 | bwd_allreduce: 4.79 | step: 42.13
- 76%|███████▋  | 4433/5800 [12:24:15<2:36:38,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.4935215711593628, 'learning_rate': 5.549396038026773e-06, 'epoch': 38.22}
- 76%|███████▋  | 4433/5800 [12:24:15<2:36:38,  6.88s/it]score1 tensor([[0.5000],
-        [0.5234],
-        [0.3477],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5195, 0.3613, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:51,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.97 | optimizer_step: 4.36
-[2025-01-25 21:10:51,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.68 | bwd_microstep: 4627.21 | bwd_inner_microstep: 4622.24 | bwd_allreduce_microstep: 4.86 | step_microstep: 42.28
-[2025-01-25 21:10:51,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.64 | bwd: 4627.23 | bwd_inner: 4622.24 | bwd_allreduce: 4.92 | step: 42.30
- 76%|███████▋  | 4434/5800 [12:24:21<2:36:43,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.5316221714019775, 'learning_rate': 5.541677329229083e-06, 'epoch': 38.22}
- 76%|███████▋  | 4434/5800 [12:24:21<2:36:43,  6.88s/it]score1 tensor([[0.5547],
-        [0.4707],
-        [0.5156],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4707, 0.5234, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:10:58,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 21:10:58,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.59 | bwd_microstep: 4569.51 | bwd_inner_microstep: 4564.51 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.76
-[2025-01-25 21:10:58,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.56 | bwd: 4569.54 | bwd_inner: 4564.51 | bwd_allreduce: 4.95 | step: 44.77
- 76%|███████▋  | 4435/5800 [12:24:28<2:36:20,  6.87s/it]                                                        {'loss': 0.0034, 'grad_norm': 1.658992052078247, 'learning_rate': 5.533963128787272e-06, 'epoch': 38.23}
- 76%|███████▋  | 4435/5800 [12:24:28<2:36:20,  6.87s/it]score1 tensor([[0.6094],
-        [0.5312],
-        [0.4375],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.5273, 0.4414, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:05,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 21:11:05,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.35 | bwd_microstep: 4580.72 | bwd_inner_microstep: 4575.79 | bwd_allreduce_microstep: 4.85 | step_microstep: 43.57
-[2025-01-25 21:11:05,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.32 | bwd: 4580.74 | bwd_inner: 4575.78 | bwd_allreduce: 4.89 | step: 43.58
- 76%|███████▋  | 4436/5800 [12:24:35<2:36:06,  6.87s/it]                                                        {'loss': 0.0029, 'grad_norm': 2.122765302658081, 'learning_rate': 5.526253439106761e-06, 'epoch': 38.24}
- 76%|███████▋  | 4436/5800 [12:24:35<2:36:06,  6.87s/it]score1 tensor([[0.5781],
-        [0.3887],
-        [0.6719],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.3945, 0.6562, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:12,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:11:12,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.17 | bwd_microstep: 4623.54 | bwd_inner_microstep: 4618.18 | bwd_allreduce_microstep: 5.25 | step_microstep: 46.29
-[2025-01-25 21:11:12,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.11 | bwd: 4623.56 | bwd_inner: 4618.18 | bwd_allreduce: 5.31 | step: 46.30
- 76%|███████▋  | 4437/5800 [12:24:42<2:36:14,  6.88s/it]                                                        {'loss': 0.0117, 'grad_norm': 4.916951656341553, 'learning_rate': 5.518548262591574e-06, 'epoch': 38.25}
- 76%|███████▋  | 4437/5800 [12:24:42<2:36:14,  6.88s/it]score1 tensor([[0.5039],
-        [0.4805],
-        [0.4023],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4785, 0.4062, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:19,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 21:11:19,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.07 | bwd_microstep: 4626.72 | bwd_inner_microstep: 4622.26 | bwd_allreduce_microstep: 4.35 | step_microstep: 42.87
-[2025-01-25 21:11:19,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.02 | bwd: 4626.74 | bwd_inner: 4622.26 | bwd_allreduce: 4.41 | step: 42.89
- 77%|███████▋  | 4438/5800 [12:24:49<2:36:15,  6.88s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.5244438648223877, 'learning_rate': 5.510847601644309e-06, 'epoch': 38.26}
- 77%|███████▋  | 4438/5800 [12:24:49<2:36:15,  6.88s/it]score1 tensor([[0.4355],
-        [0.5664],
-        [0.5664],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5547, 0.5508, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:26,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:11:26,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.72 | bwd_microstep: 4576.64 | bwd_inner_microstep: 4572.19 | bwd_allreduce_microstep: 4.37 | step_microstep: 44.11
-[2025-01-25 21:11:26,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.69 | bwd: 4576.66 | bwd_inner: 4572.19 | bwd_allreduce: 4.40 | step: 44.11
- 77%|███████▋  | 4439/5800 [12:24:56<2:35:57,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.161041259765625, 'learning_rate': 5.503151458666176e-06, 'epoch': 38.27}
- 77%|███████▋  | 4439/5800 [12:24:56<2:35:57,  6.88s/it]score1 tensor([[0.5039],
-        [0.4707],
-        [0.4785],
-        [0.4062]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4609, 0.4805, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:33,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:11:33,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.37 | bwd_microstep: 4625.07 | bwd_inner_microstep: 4620.16 | bwd_allreduce_microstep: 4.81 | step_microstep: 41.65
-[2025-01-25 21:11:33,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.33 | bwd: 4625.09 | bwd_inner: 4620.16 | bwd_allreduce: 4.86 | step: 41.66
- 77%|███████▋  | 4440/5800 [12:25:03<2:36:00,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.78572940826416, 'learning_rate': 5.495459836056953e-06, 'epoch': 38.28}
- 77%|███████▋  | 4440/5800 [12:25:03<2:36:00,  6.88s/it]score1 tensor([[0.5508],
-        [0.4941],
-        [0.4355],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4863, 0.4199, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:40,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 21:11:40,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.09 | bwd_microstep: 4616.70 | bwd_inner_microstep: 4611.33 | bwd_allreduce_microstep: 5.25 | step_microstep: 45.81
-[2025-01-25 21:11:40,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.05 | bwd: 4616.73 | bwd_inner: 4611.33 | bwd_allreduce: 5.31 | step: 45.81
- 77%|███████▋  | 4441/5800 [12:25:10<2:35:56,  6.89s/it]                                                        {'loss': 0.0117, 'grad_norm': 8.227928161621094, 'learning_rate': 5.487772736215036e-06, 'epoch': 38.28}
- 77%|███████▋  | 4441/5800 [12:25:10<2:35:56,  6.89s/it]score1 tensor([[0.5039],
-        [0.5000],
-        [0.6172],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4961, 0.6133, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:47,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 21:11:47,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.65 | bwd_microstep: 4623.67 | bwd_inner_microstep: 4618.17 | bwd_allreduce_microstep: 5.37 | step_microstep: 47.45
-[2025-01-25 21:11:47,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.60 | bwd: 4623.69 | bwd_inner: 4618.17 | bwd_allreduce: 5.44 | step: 47.46
- 77%|███████▋  | 4442/5800 [12:25:17<2:36:01,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 8.361059188842773, 'learning_rate': 5.480090161537388e-06, 'epoch': 38.29}
- 77%|███████▋  | 4442/5800 [12:25:17<2:36:01,  6.89s/it]score1 tensor([[0.5781],
-        [0.6211],
-        [0.5508],
-        [0.3496]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.6172, 0.5391, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:11:53,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:11:53,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.05 | bwd_microstep: 4621.32 | bwd_inner_microstep: 4616.56 | bwd_allreduce_microstep: 4.66 | step_microstep: 43.72
-[2025-01-25 21:11:53,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.99 | bwd: 4621.34 | bwd_inner: 4616.56 | bwd_allreduce: 4.72 | step: 43.73
- 77%|███████▋  | 4443/5800 [12:25:23<2:35:57,  6.90s/it]                                                        {'loss': 0.0117, 'grad_norm': 3.892815351486206, 'learning_rate': 5.472412114419565e-06, 'epoch': 38.3}
- 77%|███████▋  | 4443/5800 [12:25:23<2:35:57,  6.90s/it]score1 tensor([[0.4688],
-        [0.4238],
-        [0.5664],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.4219, 0.5820, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:00,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 21:12:00,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.18 | bwd_microstep: 4565.49 | bwd_inner_microstep: 4560.03 | bwd_allreduce_microstep: 5.34 | step_microstep: 49.03
-[2025-01-25 21:12:00,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.15 | bwd: 4565.51 | bwd_inner: 4560.03 | bwd_allreduce: 5.40 | step: 49.04
- 77%|███████▋  | 4444/5800 [12:25:30<2:35:32,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.458404064178467, 'learning_rate': 5.464738597255727e-06, 'epoch': 38.31}
- 77%|███████▋  | 4444/5800 [12:25:30<2:35:32,  6.88s/it]score1 tensor([[0.5547],
-        [0.4668],
-        [0.4746],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4668, 0.4844, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:07,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 21:12:07,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.40 | bwd_microstep: 4536.83 | bwd_inner_microstep: 4532.05 | bwd_allreduce_microstep: 4.70 | step_microstep: 46.74
-[2025-01-25 21:12:07,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.37 | bwd: 4536.85 | bwd_inner: 4532.05 | bwd_allreduce: 4.74 | step: 46.75
- 77%|███████▋  | 4445/5800 [12:25:37<2:35:00,  6.86s/it]                                                        {'loss': 0.0034, 'grad_norm': 0.38741010427474976, 'learning_rate': 5.457069612438597e-06, 'epoch': 38.32}
- 77%|███████▋  | 4445/5800 [12:25:37<2:35:00,  6.86s/it]score1 tensor([[0.4531],
-        [0.4453],
-        [0.4082],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.4473, 0.4160, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:14,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 21:12:14,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.11 | bwd_microstep: 4627.21 | bwd_inner_microstep: 4622.58 | bwd_allreduce_microstep: 4.54 | step_microstep: 41.35
-[2025-01-25 21:12:14,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.08 | bwd: 4627.23 | bwd_inner: 4622.58 | bwd_allreduce: 4.58 | step: 41.36
- 77%|███████▋  | 4446/5800 [12:25:44<2:35:09,  6.88s/it]                                                        {'loss': 0.0039, 'grad_norm': 3.506923198699951, 'learning_rate': 5.449405162359507e-06, 'epoch': 38.33}
- 77%|███████▋  | 4446/5800 [12:25:44<2:35:09,  6.88s/it]score1 tensor([[0.4414],
-        [0.5312],
-        [0.4902],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5352, 0.4844, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:21,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 21:12:21,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.90 | bwd_microstep: 4583.64 | bwd_inner_microstep: 4578.35 | bwd_allreduce_microstep: 5.18 | step_microstep: 45.57
-[2025-01-25 21:12:21,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.86 | bwd: 4583.66 | bwd_inner: 4578.35 | bwd_allreduce: 5.23 | step: 45.58
- 77%|███████▋  | 4447/5800 [12:25:51<2:34:57,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.8138391971588135, 'learning_rate': 5.441745249408359e-06, 'epoch': 38.34}
- 77%|███████▋  | 4447/5800 [12:25:51<2:34:57,  6.87s/it]score1 tensor([[0.4121],
-        [0.4609],
-        [0.5703],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.4648, 0.5742, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:28,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 21:12:28,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.33 | bwd_microstep: 4575.23 | bwd_inner_microstep: 4570.65 | bwd_allreduce_microstep: 4.49 | step_microstep: 41.40
-[2025-01-25 21:12:28,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.30 | bwd: 4575.25 | bwd_inner: 4570.65 | bwd_allreduce: 4.53 | step: 41.41
- 77%|███████▋  | 4448/5800 [12:25:58<2:34:47,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 2.2873806953430176, 'learning_rate': 5.434089875973649e-06, 'epoch': 38.34}
- 77%|███████▋  | 4448/5800 [12:25:58<2:34:47,  6.87s/it]score1 tensor([[0.5547],
-        [0.5391],
-        [0.1689],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.5586, 0.1787, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:35,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.08 | optimizer_step: 4.37
-[2025-01-25 21:12:35,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.88 | bwd_microstep: 4567.41 | bwd_inner_microstep: 4562.27 | bwd_allreduce_microstep: 5.02 | step_microstep: 44.52
-[2025-01-25 21:12:35,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.84 | bwd: 4567.43 | bwd_inner: 4562.27 | bwd_allreduce: 5.08 | step: 44.53
- 77%|███████▋  | 4449/5800 [12:26:05<2:34:36,  6.87s/it]                                                        {'loss': 0.0137, 'grad_norm': 1.3058116436004639, 'learning_rate': 5.426439044442462e-06, 'epoch': 38.35}
- 77%|███████▋  | 4449/5800 [12:26:05<2:34:36,  6.87s/it]score1 tensor([[0.4043],
-        [0.5547],
-        [0.5352],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.5703, 0.5391, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:41,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:12:42,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.56 | bwd_microstep: 4625.07 | bwd_inner_microstep: 4620.49 | bwd_allreduce_microstep: 4.45 | step_microstep: 42.57
-[2025-01-25 21:12:42,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.51 | bwd: 4625.09 | bwd_inner: 4620.49 | bwd_allreduce: 4.52 | step: 42.57
- 77%|███████▋  | 4450/5800 [12:26:11<2:34:44,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.161705017089844, 'learning_rate': 5.418792757200448e-06, 'epoch': 38.36}
- 77%|███████▋  | 4450/5800 [12:26:11<2:34:44,  6.88s/it]score1 tensor([[0.5352],
-        [0.5625],
-        [0.5742],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5547, 0.5664, 0.4316], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:48,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.98 | optimizer_step: 4.36
-[2025-01-25 21:12:48,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.29 | bwd_microstep: 4625.72 | bwd_inner_microstep: 4620.52 | bwd_allreduce_microstep: 5.10 | step_microstep: 44.86
-[2025-01-25 21:12:48,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.26 | bwd: 4625.74 | bwd_inner: 4620.52 | bwd_allreduce: 5.15 | step: 44.87
- 77%|███████▋  | 4451/5800 [12:26:18<2:34:48,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.139087200164795, 'learning_rate': 5.411151016631864e-06, 'epoch': 38.37}
- 77%|███████▋  | 4451/5800 [12:26:18<2:34:48,  6.89s/it]score1 tensor([[0.4199],
-        [0.5977],
-        [0.4590],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.5938, 0.4492, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:12:55,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 21:12:55,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.14 | bwd_microstep: 4612.00 | bwd_inner_microstep: 4607.95 | bwd_allreduce_microstep: 3.98 | step_microstep: 43.82
-[2025-01-25 21:12:55,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.11 | bwd: 4612.03 | bwd_inner: 4607.95 | bwd_allreduce: 4.02 | step: 43.83
- 77%|███████▋  | 4452/5800 [12:26:25<2:34:38,  6.88s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.43117228150367737, 'learning_rate': 5.403513825119526e-06, 'epoch': 38.38}
- 77%|███████▋  | 4452/5800 [12:26:25<2:34:38,  6.88s/it]score1 tensor([[0.6094],
-        [0.3652],
-        [0.4570],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3672, 0.4473, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:02,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:13:02,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.07 | bwd_microstep: 4616.51 | bwd_inner_microstep: 4611.81 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.72
-[2025-01-25 21:13:02,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.05 | bwd: 4616.53 | bwd_inner: 4611.81 | bwd_allreduce: 4.65 | step: 42.74
- 77%|███████▋  | 4453/5800 [12:26:32<2:34:33,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 4.8375372886657715, 'learning_rate': 5.395881185044856e-06, 'epoch': 38.39}
- 77%|███████▋  | 4453/5800 [12:26:32<2:34:33,  6.88s/it]score1 tensor([[0.5195],
-        [0.4648],
-        [0.4453],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4551, 0.4316, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:09,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 21:13:09,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.69 | bwd_microstep: 4618.05 | bwd_inner_microstep: 4613.38 | bwd_allreduce_microstep: 4.60 | step_microstep: 42.59
-[2025-01-25 21:13:09,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.66 | bwd: 4618.08 | bwd_inner: 4613.38 | bwd_allreduce: 4.64 | step: 42.60
- 77%|███████▋  | 4454/5800 [12:26:39<2:34:27,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.896486759185791, 'learning_rate': 5.388253098787834e-06, 'epoch': 38.4}
- 77%|███████▋  | 4454/5800 [12:26:39<2:34:27,  6.89s/it]score1 tensor([[0.6836],
-        [0.5156],
-        [0.4512],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5195, 0.4492, 0.6562], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:16,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 21:13:16,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.24 | bwd_microstep: 4629.49 | bwd_inner_microstep: 4624.87 | bwd_allreduce_microstep: 4.53 | step_microstep: 41.65
-[2025-01-25 21:13:16,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.20 | bwd: 4629.51 | bwd_inner: 4624.87 | bwd_allreduce: 4.58 | step: 41.67
- 77%|███████▋  | 4455/5800 [12:26:46<2:34:29,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 5.011258125305176, 'learning_rate': 5.380629568727025e-06, 'epoch': 38.41}
- 77%|███████▋  | 4455/5800 [12:26:46<2:34:29,  6.89s/it]score1 tensor([[0.5156],
-        [0.5117],
-        [0.5586],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5195, 0.5664, 0.3691], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:23,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:13:23,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.90 | bwd_microstep: 4567.42 | bwd_inner_microstep: 4562.73 | bwd_allreduce_microstep: 4.60 | step_microstep: 45.46
-[2025-01-25 21:13:23,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.85 | bwd: 4567.45 | bwd_inner: 4562.73 | bwd_allreduce: 4.64 | step: 45.47
- 77%|███████▋  | 4456/5800 [12:26:53<2:34:04,  6.88s/it]                                                        {'loss': 0.0044, 'grad_norm': 5.835119247436523, 'learning_rate': 5.373010597239592e-06, 'epoch': 38.41}
- 77%|███████▋  | 4456/5800 [12:26:53<2:34:04,  6.88s/it]score1 tensor([[0.5859],
-        [0.5156],
-        [0.4453],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5156, 0.4434, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0015, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:30,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:13:30,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.14 | bwd_microstep: 4534.66 | bwd_inner_microstep: 4530.07 | bwd_allreduce_microstep: 4.51 | step_microstep: 42.93
-[2025-01-25 21:13:30,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.11 | bwd: 4534.68 | bwd_inner: 4530.07 | bwd_allreduce: 4.55 | step: 42.94
- 77%|███████▋  | 4457/5800 [12:27:00<2:33:28,  6.86s/it]                                                        {'loss': 0.0015, 'grad_norm': 0.5086485147476196, 'learning_rate': 5.365396186701251e-06, 'epoch': 38.42}
- 77%|███████▋  | 4457/5800 [12:27:00<2:33:28,  6.86s/it]score1 tensor([[0.4629],
-        [0.5352],
-        [0.3555],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5469, 0.3516, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:37,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:13:37,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.54 | bwd_microstep: 4619.15 | bwd_inner_microstep: 4614.68 | bwd_allreduce_microstep: 4.37 | step_microstep: 41.91
-[2025-01-25 21:13:37,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.51 | bwd: 4619.18 | bwd_inner: 4614.68 | bwd_allreduce: 4.42 | step: 41.91
- 77%|███████▋  | 4458/5800 [12:27:06<2:33:33,  6.87s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.223464488983154, 'learning_rate': 5.357786339486315e-06, 'epoch': 38.43}
- 77%|███████▋  | 4458/5800 [12:27:06<2:33:33,  6.87s/it]score1 tensor([[0.5742],
-        [0.6055],
-        [0.4355],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.6055, 0.4199, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:43,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:13:43,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.25 | bwd_microstep: 4576.14 | bwd_inner_microstep: 4571.60 | bwd_allreduce_microstep: 4.41 | step_microstep: 41.58
-[2025-01-25 21:13:43,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.21 | bwd: 4576.16 | bwd_inner: 4571.60 | bwd_allreduce: 4.48 | step: 41.59
- 77%|███████▋  | 4459/5800 [12:27:13<2:33:21,  6.86s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.9809831380844116, 'learning_rate': 5.3501810579676625e-06, 'epoch': 38.44}
- 77%|███████▋  | 4459/5800 [12:27:13<2:33:21,  6.86s/it]score1 tensor([[0.5039],
-        [0.6211],
-        [0.5117],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6289, 0.5039, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:50,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 21:13:50,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.40 | bwd_microstep: 4575.53 | bwd_inner_microstep: 4570.69 | bwd_allreduce_microstep: 4.74 | step_microstep: 43.47
-[2025-01-25 21:13:50,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.38 | bwd: 4575.55 | bwd_inner: 4570.69 | bwd_allreduce: 4.79 | step: 43.48
- 77%|███████▋  | 4460/5800 [12:27:20<2:33:09,  6.86s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.585688352584839, 'learning_rate': 5.342580344516748e-06, 'epoch': 38.45}
- 77%|���██████▋  | 4460/5800 [12:27:20<2:33:09,  6.86s/it]score1 tensor([[0.5938],
-        [0.4766],
-        [0.4082],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4766, 0.4004, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:13:57,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:13:57,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.36 | bwd_microstep: 4578.22 | bwd_inner_microstep: 4573.83 | bwd_allreduce_microstep: 4.33 | step_microstep: 45.28
-[2025-01-25 21:13:57,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.33 | bwd: 4578.24 | bwd_inner: 4573.83 | bwd_allreduce: 4.36 | step: 45.30
- 77%|███████▋  | 4461/5800 [12:27:27<2:33:00,  6.86s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.741060733795166, 'learning_rate': 5.334984201503615e-06, 'epoch': 38.46}
- 77%|███████▋  | 4461/5800 [12:27:27<2:33:00,  6.86s/it]score1 tensor([[0.5820],
-        [0.3398],
-        [0.4941],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.3340, 0.5000, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:04,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 21:14:04,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.07 | bwd_microstep: 4580.76 | bwd_inner_microstep: 4575.77 | bwd_allreduce_microstep: 4.88 | step_microstep: 45.33
-[2025-01-25 21:14:04,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.04 | bwd: 4580.78 | bwd_inner: 4575.77 | bwd_allreduce: 4.93 | step: 45.33
- 77%|███████▋  | 4462/5800 [12:27:34<2:32:55,  6.86s/it]                                                        {'loss': 0.0039, 'grad_norm': 1.8454347848892212, 'learning_rate': 5.327392631296864e-06, 'epoch': 38.47}
- 77%|███████▋  | 4462/5800 [12:27:34<2:32:55,  6.86s/it]score1 tensor([[0.5078],
-        [0.5898],
-        [0.5508],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5781, 0.5469, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:11,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 21:14:11,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.64 | bwd_microstep: 4568.45 | bwd_inner_microstep: 4563.76 | bwd_allreduce_microstep: 4.58 | step_microstep: 44.55
-[2025-01-25 21:14:11,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.61 | bwd: 4568.47 | bwd_inner: 4563.76 | bwd_allreduce: 4.63 | step: 44.56
- 77%|███████▋  | 4463/5800 [12:27:41<2:32:42,  6.85s/it]                                                        {'loss': 0.0049, 'grad_norm': 2.4561355113983154, 'learning_rate': 5.3198056362636864e-06, 'epoch': 38.47}
- 77%|███████▋  | 4463/5800 [12:27:41<2:32:42,  6.85s/it]score1 tensor([[0.5273],
-        [0.5234],
-        [0.3945],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.5117, 0.3926, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:18,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 21:14:18,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.73 | bwd_microstep: 4613.47 | bwd_inner_microstep: 4608.92 | bwd_allreduce_microstep: 4.45 | step_microstep: 41.83
-[2025-01-25 21:14:18,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.68 | bwd: 4613.50 | bwd_inner: 4608.92 | bwd_allreduce: 4.50 | step: 41.83
- 77%|███████▋  | 4464/5800 [12:27:48<2:32:47,  6.86s/it]                                                        {'loss': 0.0103, 'grad_norm': 7.830155372619629, 'learning_rate': 5.31222321876983e-06, 'epoch': 38.48}
- 77%|███████▋  | 4464/5800 [12:27:48<2:32:47,  6.86s/it]score1 tensor([[0.4570],
-        [0.6484],
-        [0.4473],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.6641, 0.4453, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:25,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 21:14:25,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.49 | bwd_microstep: 4613.30 | bwd_inner_microstep: 4608.23 | bwd_allreduce_microstep: 4.95 | step_microstep: 41.89
-[2025-01-25 21:14:25,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.45 | bwd: 4613.32 | bwd_inner: 4608.23 | bwd_allreduce: 5.01 | step: 41.89
- 77%|███████▋  | 4465/5800 [12:27:54<2:32:49,  6.87s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.496706485748291, 'learning_rate': 5.304645381179636e-06, 'epoch': 38.49}
- 77%|███████▋  | 4465/5800 [12:27:55<2:32:49,  6.87s/it]score1 tensor([[0.5430],
-        [0.4844],
-        [0.5547],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4805, 0.5625, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:31,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 21:14:31,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.44 | bwd_microstep: 4582.51 | bwd_inner_microstep: 4577.32 | bwd_allreduce_microstep: 5.11 | step_microstep: 40.96
-[2025-01-25 21:14:31,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.40 | bwd: 4582.53 | bwd_inner: 4577.32 | bwd_allreduce: 5.15 | step: 40.96
- 77%|███████▋  | 4466/5800 [12:28:01<2:32:37,  6.86s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.306537389755249, 'learning_rate': 5.297072125855998e-06, 'epoch': 38.5}
- 77%|███████▋  | 4466/5800 [12:28:01<2:32:37,  6.86s/it]score1 tensor([[0.5508],
-        [0.4180],
-        [0.5117],
-        [0.6172]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4277, 0.5117, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:38,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.52 | optimizer_step: 4.37
-[2025-01-25 21:14:38,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.85 | bwd_microstep: 4574.72 | bwd_inner_microstep: 4569.76 | bwd_allreduce_microstep: 4.87 | step_microstep: 46.06
-[2025-01-25 21:14:38,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.81 | bwd: 4574.75 | bwd_inner: 4569.76 | bwd_allreduce: 4.92 | step: 46.06
- 77%|███████▋  | 4467/5800 [12:28:08<2:32:24,  6.86s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.677351474761963, 'learning_rate': 5.289503455160381e-06, 'epoch': 38.51}
- 77%|███████▋  | 4467/5800 [12:28:08<2:32:24,  6.86s/it]score1 tensor([[0.6406],
-        [0.4121],
-        [0.4277],
-        [0.3711]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4023, 0.4121, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:45,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:14:45,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.97 | bwd_microstep: 4618.83 | bwd_inner_microstep: 4614.12 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.44
-[2025-01-25 21:14:45,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.94 | bwd: 4618.85 | bwd_inner: 4614.12 | bwd_allreduce: 4.66 | step: 42.44
- 77%|███████▋  | 4468/5800 [12:28:15<2:32:33,  6.87s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.71847003698349, 'learning_rate': 5.281939371452844e-06, 'epoch': 38.52}
- 77%|███████▋  | 4468/5800 [12:28:15<2:32:33,  6.87s/it]score1 tensor([[0.4355],
-        [0.4863],
-        [0.5430],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4883, 0.5391, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:52,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:14:52,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.68 | bwd_microstep: 4620.66 | bwd_inner_microstep: 4616.05 | bwd_allreduce_microstep: 4.52 | step_microstep: 42.87
-[2025-01-25 21:14:52,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.63 | bwd: 4620.68 | bwd_inner: 4616.05 | bwd_allreduce: 4.56 | step: 42.88
- 77%|███████▋  | 4469/5800 [12:28:22<2:32:36,  6.88s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.6222617030143738, 'learning_rate': 5.2743798770919865e-06, 'epoch': 38.53}
- 77%|███████▋  | 4469/5800 [12:28:22<2:32:36,  6.88s/it]score1 tensor([[0.5234],
-        [0.5469],
-        [0.5117],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5547, 0.5195, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:14:59,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 21:14:59,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.74 | bwd_microstep: 4620.72 | bwd_inner_microstep: 4616.07 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.69
-[2025-01-25 21:14:59,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.71 | bwd: 4620.74 | bwd_inner: 4616.07 | bwd_allreduce: 4.60 | step: 42.70
- 77%|███████▋  | 4470/5800 [12:28:29<2:32:34,  6.88s/it]                                                        {'loss': 0.0098, 'grad_norm': 0.5075669884681702, 'learning_rate': 5.266824974434998e-06, 'epoch': 38.53}
- 77%|███████▋  | 4470/5800 [12:28:29<2:32:34,  6.88s/it]score1 tensor([[0.4434],
-        [0.4922],
-        [0.6719],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4453, 0.4980, 0.6797, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:06,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 21:15:06,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.80 | bwd_microstep: 4618.78 | bwd_inner_microstep: 4614.13 | bwd_allreduce_microstep: 4.56 | step_microstep: 42.13
-[2025-01-25 21:15:06,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.76 | bwd: 4618.80 | bwd_inner: 4614.13 | bwd_allreduce: 4.60 | step: 42.14
- 77%|███████▋  | 4471/5800 [12:28:36<2:32:33,  6.89s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.536463260650635, 'learning_rate': 5.259274665837624e-06, 'epoch': 38.54}
- 77%|███████▋  | 4471/5800 [12:28:36<2:32:33,  6.89s/it]score1 tensor([[0.4180],
-        [0.4199],
-        [0.5117],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4160, 0.5000, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:13,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 21:15:13,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.97 | bwd_microstep: 4623.16 | bwd_inner_microstep: 4618.62 | bwd_allreduce_microstep: 4.46 | step_microstep: 41.96
-[2025-01-25 21:15:13,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4623.18 | bwd_inner: 4618.62 | bwd_allreduce: 4.49 | step: 41.98
- 77%|███████▋  | 4472/5800 [12:28:43<2:32:31,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 3.4045259952545166, 'learning_rate': 5.251728953654185e-06, 'epoch': 38.55}
- 77%|███████▋  | 4472/5800 [12:28:43<2:32:31,  6.89s/it]score1 tensor([[0.4512],
-        [0.5391],
-        [0.4375],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5469, 0.4297, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:20,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 21:15:20,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.63 | bwd_microstep: 4627.67 | bwd_inner_microstep: 4622.75 | bwd_allreduce_microstep: 4.75 | step_microstep: 42.95
-[2025-01-25 21:15:20,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.59 | bwd: 4627.66 | bwd_inner: 4622.73 | bwd_allreduce: 4.81 | step: 42.96
- 77%|███████▋  | 4473/5800 [12:28:50<2:32:28,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.47213083505630493, 'learning_rate': 5.244187840237564e-06, 'epoch': 38.56}
- 77%|███████▋  | 4473/5800 [12:28:50<2:32:28,  6.89s/it]score1 tensor([[0.4199],
-        [0.5234],
-        [0.4902],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5273, 0.4844, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:27,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:15:27,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.94 | bwd_microstep: 4624.11 | bwd_inner_microstep: 4619.19 | bwd_allreduce_microstep: 4.81 | step_microstep: 41.98
-[2025-01-25 21:15:27,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.89 | bwd: 4624.13 | bwd_inner: 4619.19 | bwd_allreduce: 4.87 | step: 41.98
- 77%|███████▋  | 4474/5800 [12:28:56<2:32:22,  6.89s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.7998852729797363, 'learning_rate': 5.2366513279392066e-06, 'epoch': 38.57}
- 77%|███████▋  | 4474/5800 [12:28:56<2:32:22,  6.89s/it]score1 tensor([[0.5859],
-        [0.4648],
-        [0.6211],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4648, 0.6094, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0146, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:33,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 21:15:33,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.75 | bwd_microstep: 4566.23 | bwd_inner_microstep: 4561.48 | bwd_allreduce_microstep: 4.64 | step_microstep: 46.29
-[2025-01-25 21:15:33,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.71 | bwd: 4566.26 | bwd_inner: 4561.48 | bwd_allreduce: 4.70 | step: 46.29
- 77%|███████▋  | 4475/5800 [12:29:03<2:31:55,  6.88s/it]                                                        {'loss': 0.0146, 'grad_norm': 1.8044536113739014, 'learning_rate': 5.229119419109134e-06, 'epoch': 38.58}
- 77%|███████▋  | 4475/5800 [12:29:03<2:31:55,  6.88s/it]score1 tensor([[0.6641],
-        [0.5469],
-        [0.5586],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6602, 0.5508, 0.5625, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:40,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:15:40,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.71 | bwd_microstep: 4613.88 | bwd_inner_microstep: 4609.36 | bwd_allreduce_microstep: 4.43 | step_microstep: 42.66
-[2025-01-25 21:15:40,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.67 | bwd: 4613.90 | bwd_inner: 4609.36 | bwd_allreduce: 4.47 | step: 42.67
- 77%|███████▋  | 4476/5800 [12:29:10<2:31:50,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.064233303070068, 'learning_rate': 5.22159211609592e-06, 'epoch': 38.59}
- 77%|███████▋  | 4476/5800 [12:29:10<2:31:50,  6.88s/it]score1 tensor([[0.5156],
-        [0.4453],
-        [0.6367],
-        [0.4375]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4512, 0.6367, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:47,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:15:47,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.35 | bwd_microstep: 4578.24 | bwd_inner_microstep: 4573.77 | bwd_allreduce_microstep: 4.39 | step_microstep: 46.30
-[2025-01-25 21:15:47,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.31 | bwd: 4578.26 | bwd_inner: 4573.77 | bwd_allreduce: 4.43 | step: 46.31
- 77%|███████▋  | 4477/5800 [12:29:17<2:31:31,  6.87s/it]                                                        {'loss': 0.0063, 'grad_norm': 5.753952980041504, 'learning_rate': 5.214069421246715e-06, 'epoch': 38.59}
- 77%|███████▋  | 4477/5800 [12:29:17<2:31:31,  6.87s/it]score1 tensor([[0.4961],
-        [0.4512],
-        [0.6172],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4570, 0.6172, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:15:54,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 21:15:54,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.40 | bwd_microstep: 4579.53 | bwd_inner_microstep: 4574.70 | bwd_allreduce_microstep: 4.71 | step_microstep: 44.50
-[2025-01-25 21:15:54,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.36 | bwd: 4579.55 | bwd_inner: 4574.70 | bwd_allreduce: 4.76 | step: 44.53
- 77%|███████▋  | 4478/5800 [12:29:24<2:31:18,  6.87s/it]                                                        {'loss': 0.0029, 'grad_norm': 1.9139151573181152, 'learning_rate': 5.206551336907224e-06, 'epoch': 38.6}
- 77%|███████▋  | 4478/5800 [12:29:24<2:31:18,  6.87s/it]score1 tensor([[0.4941],
-        [0.6602],
-        [0.5742],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.6719, 0.5625, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:01,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:16:01,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.77 | bwd_microstep: 4615.32 | bwd_inner_microstep: 4610.50 | bwd_allreduce_microstep: 4.70 | step_microstep: 42.78
-[2025-01-25 21:16:01,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.74 | bwd: 4615.34 | bwd_inner: 4610.50 | bwd_allreduce: 4.76 | step: 42.79
- 77%|███████▋  | 4479/5800 [12:29:31<2:31:22,  6.88s/it]                                                        {'loss': 0.0137, 'grad_norm': 4.1496992111206055, 'learning_rate': 5.19903786542171e-06, 'epoch': 38.61}
- 77%|███████▋  | 4479/5800 [12:29:31<2:31:22,  6.88s/it]score1 tensor([[0.5469],
-        [0.4883],
-        [0.6406],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4980, 0.6367, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:08,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.93 | optimizer_step: 4.36
-[2025-01-25 21:16:08,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.87 | bwd_microstep: 4625.38 | bwd_inner_microstep: 4620.13 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.70
-[2025-01-25 21:16:08,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.84 | bwd: 4625.40 | bwd_inner: 4620.13 | bwd_allreduce: 5.18 | step: 43.71
- 77%|███████▋  | 4480/5800 [12:29:38<2:31:28,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 4.608960151672363, 'learning_rate': 5.191529009133007e-06, 'epoch': 38.62}
- 77%|███████▋  | 4480/5800 [12:29:38<2:31:28,  6.89s/it]score1 tensor([[0.4531],
-        [0.3613],
-        [0.5664],
-        [0.3359]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.3652, 0.5586, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:15,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:16:15,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.19 | bwd_microstep: 4564.44 | bwd_inner_microstep: 4558.85 | bwd_allreduce_microstep: 5.49 | step_microstep: 50.55
-[2025-01-25 21:16:15,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.16 | bwd: 4564.46 | bwd_inner: 4558.85 | bwd_allreduce: 5.54 | step: 50.57
- 77%|███████▋  | 4481/5800 [12:29:45<2:31:11,  6.88s/it]                                                        {'loss': 0.0044, 'grad_norm': 1.3600493669509888, 'learning_rate': 5.184024770382516e-06, 'epoch': 38.63}
- 77%|███████▋  | 4481/5800 [12:29:45<2:31:11,  6.88s/it]score1 tensor([[0.4199],
-        [0.5430],
-        [0.4414],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4355, 0.5312, 0.4473, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:22,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 21:16:22,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.05 | bwd_microstep: 4621.93 | bwd_inner_microstep: 4617.03 | bwd_allreduce_microstep: 4.80 | step_microstep: 44.24
-[2025-01-25 21:16:22,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.01 | bwd: 4621.96 | bwd_inner: 4617.03 | bwd_allreduce: 4.85 | step: 44.24
- 77%|���██████▋  | 4482/5800 [12:29:51<2:31:15,  6.89s/it]                                                        {'loss': 0.0093, 'grad_norm': 3.780637741088867, 'learning_rate': 5.1765251515101745e-06, 'epoch': 38.64}
- 77%|███████▋  | 4482/5800 [12:29:51<2:31:15,  6.89s/it]score1 tensor([[0.5117],
-        [0.3301],
-        [0.4668],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.3398, 0.4668, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:28,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 21:16:28,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.91 | bwd_microstep: 4576.47 | bwd_inner_microstep: 4571.37 | bwd_allreduce_microstep: 4.98 | step_microstep: 47.30
-[2025-01-25 21:16:28,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.88 | bwd: 4576.50 | bwd_inner: 4571.37 | bwd_allreduce: 5.05 | step: 47.31
- 77%|███████▋  | 4483/5800 [12:29:58<2:30:55,  6.88s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.445326805114746, 'learning_rate': 5.169030154854506e-06, 'epoch': 38.65}
- 77%|███████▋  | 4483/5800 [12:29:58<2:30:55,  6.88s/it]score1 tensor([[0.4551],
-        [0.3867],
-        [0.4297],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.3867, 0.4258, 0.4688], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:35,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 21:16:35,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.03 | bwd_microstep: 4541.02 | bwd_inner_microstep: 4536.04 | bwd_allreduce_microstep: 4.90 | step_microstep: 47.77
-[2025-01-25 21:16:35,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.00 | bwd: 4541.04 | bwd_inner: 4536.04 | bwd_allreduce: 4.94 | step: 47.78
- 77%|███████▋  | 4484/5800 [12:30:05<2:30:23,  6.86s/it]                                                        {'loss': 0.002, 'grad_norm': 3.737173080444336, 'learning_rate': 5.1615397827525694e-06, 'epoch': 38.66}
- 77%|███████▋  | 4484/5800 [12:30:05<2:30:23,  6.86s/it]score1 tensor([[0.6367],
-        [0.6641],
-        [0.6562],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6484, 0.6445, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:42,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:16:42,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.96 | bwd_microstep: 4624.66 | bwd_inner_microstep: 4620.07 | bwd_allreduce_microstep: 4.51 | step_microstep: 45.17
-[2025-01-25 21:16:42,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.93 | bwd: 4624.68 | bwd_inner: 4620.07 | bwd_allreduce: 4.55 | step: 45.17
- 77%|███████▋  | 4485/5800 [12:30:12<2:30:34,  6.87s/it]                                                        {'loss': 0.0117, 'grad_norm': 5.555271148681641, 'learning_rate': 5.154054037540004e-06, 'epoch': 38.66}
- 77%|███████▋  | 4485/5800 [12:30:12<2:30:34,  6.87s/it]score1 tensor([[0.5234],
-        [0.4395],
-        [0.4668],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4297, 0.4688, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:49,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 21:16:49,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.17 | bwd_microstep: 4598.36 | bwd_inner_microstep: 4593.39 | bwd_allreduce_microstep: 4.89 | step_microstep: 42.70
-[2025-01-25 21:16:49,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.12 | bwd: 4598.38 | bwd_inner: 4593.39 | bwd_allreduce: 4.93 | step: 42.71
- 77%|███████▋  | 4486/5800 [12:30:19<2:30:26,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.001026153564453, 'learning_rate': 5.1465729215509826e-06, 'epoch': 38.67}
- 77%|███████▋  | 4486/5800 [12:30:19<2:30:26,  6.87s/it]score1 tensor([[0.4863],
-        [0.5156],
-        [0.3516],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5156, 0.2812, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0186, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:16:56,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:16:56,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.80 | bwd_microstep: 4572.75 | bwd_inner_microstep: 4564.18 | bwd_allreduce_microstep: 8.49 | step_microstep: 43.56
-[2025-01-25 21:16:56,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.77 | bwd: 4572.78 | bwd_inner: 4564.18 | bwd_allreduce: 8.53 | step: 43.57
- 77%|███████▋  | 4487/5800 [12:30:26<2:30:13,  6.87s/it]                                                        {'loss': 0.0186, 'grad_norm': 2.1244208812713623, 'learning_rate': 5.13909643711826e-06, 'epoch': 38.68}
- 77%|███████▋  | 4487/5800 [12:30:26<2:30:13,  6.87s/it]score1 tensor([[0.3809],
-        [0.5078],
-        [0.3730],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.5078, 0.3750, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0015, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:03,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.45 | optimizer_step: 4.37
-[2025-01-25 21:17:03,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.30 | bwd_microstep: 4576.83 | bwd_inner_microstep: 4571.61 | bwd_allreduce_microstep: 5.12 | step_microstep: 43.43
-[2025-01-25 21:17:03,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.26 | bwd: 4576.85 | bwd_inner: 4571.61 | bwd_allreduce: 5.17 | step: 43.44
- 77%|███████▋  | 4488/5800 [12:30:33<2:30:00,  6.86s/it]                                                        {'loss': 0.0015, 'grad_norm': 1.9275572299957275, 'learning_rate': 5.131624586573123e-06, 'epoch': 38.69}
- 77%|███████▋  | 4488/5800 [12:30:33<2:30:00,  6.86s/it]score1 tensor([[0.3711],
-        [0.5234],
-        [0.5898],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5195, 0.5938, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:10,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:17:10,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.34 | bwd_microstep: 4621.31 | bwd_inner_microstep: 4616.61 | bwd_allreduce_microstep: 4.59 | step_microstep: 43.20
-[2025-01-25 21:17:10,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.31 | bwd: 4621.35 | bwd_inner: 4616.61 | bwd_allreduce: 4.64 | step: 43.20
- 77%|███████▋  | 4489/5800 [12:30:40<2:30:08,  6.87s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.3768371343612671, 'learning_rate': 5.124157372245433e-06, 'epoch': 38.7}
- 77%|███████▋  | 4489/5800 [12:30:40<2:30:08,  6.87s/it]score1 tensor([[0.3770],
-        [0.5391],
-        [0.4531],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3711, 0.5273, 0.4609, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:16,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:17:16,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.63 | bwd_microstep: 4588.20 | bwd_inner_microstep: 4583.46 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.54
-[2025-01-25 21:17:16,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.59 | bwd: 4588.23 | bwd_inner: 4583.45 | bwd_allreduce: 4.71 | step: 42.54
- 77%|███████▋  | 4490/5800 [12:30:46<2:30:00,  6.87s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.9421977996826172, 'learning_rate': 5.116694796463593e-06, 'epoch': 38.71}
- 77%|███████▋  | 4490/5800 [12:30:46<2:30:00,  6.87s/it]score1 tensor([[0.4434],
-        [0.4961],
-        [0.5352],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.4922, 0.5430, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:23,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:17:23,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.64 | bwd_microstep: 4643.13 | bwd_inner_microstep: 4638.49 | bwd_allreduce_microstep: 4.56 | step_microstep: 46.25
-[2025-01-25 21:17:23,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.60 | bwd: 4643.16 | bwd_inner: 4638.49 | bwd_allreduce: 4.60 | step: 46.27
- 77%|███████▋  | 4491/5800 [12:30:53<2:30:15,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.47982358932495117, 'learning_rate': 5.109236861554558e-06, 'epoch': 38.72}
- 77%|███████▋  | 4491/5800 [12:30:53<2:30:15,  6.89s/it]score1 tensor([[0.5703],
-        [0.5391],
-        [0.5039],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5352, 0.5195, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:30,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.46 | optimizer_step: 4.37
-[2025-01-25 21:17:30,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.42 | bwd_microstep: 4634.46 | bwd_inner_microstep: 4629.68 | bwd_allreduce_microstep: 4.67 | step_microstep: 42.47
-[2025-01-25 21:17:30,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.39 | bwd: 4634.49 | bwd_inner: 4629.68 | bwd_allreduce: 4.72 | step: 42.48
- 77%|███████▋  | 4492/5800 [12:31:00<2:30:14,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.5867334604263306, 'learning_rate': 5.101783569843852e-06, 'epoch': 38.72}
- 77%|███████▋  | 4492/5800 [12:31:00<2:30:14,  6.89s/it]score1 tensor([[0.5977],
-        [0.4473],
-        [0.4219],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4395, 0.4336, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:37,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:17:37,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.50 | bwd_microstep: 4637.26 | bwd_inner_microstep: 4632.48 | bwd_allreduce_microstep: 4.68 | step_microstep: 43.77
-[2025-01-25 21:17:37,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.46 | bwd: 4637.28 | bwd_inner: 4632.48 | bwd_allreduce: 4.73 | step: 43.77
- 77%|███████▋  | 4493/5800 [12:31:07<2:30:16,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.989622116088867, 'learning_rate': 5.094334923655531e-06, 'epoch': 38.73}
- 77%|███████▋  | 4493/5800 [12:31:07<2:30:16,  6.90s/it]score1 tensor([[0.3086],
-        [0.4336],
-        [0.6562],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.4297, 0.6484, 0.4258], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:44,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 21:17:44,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.65 | bwd_microstep: 4579.56 | bwd_inner_microstep: 4575.28 | bwd_allreduce_microstep: 4.20 | step_microstep: 45.84
-[2025-01-25 21:17:44,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.61 | bwd: 4579.58 | bwd_inner: 4575.28 | bwd_allreduce: 4.23 | step: 45.85
- 77%|███████▋  | 4494/5800 [12:31:14<2:29:55,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 2.476630926132202, 'learning_rate': 5.08689092531222e-06, 'epoch': 38.74}
- 77%|███████▋  | 4494/5800 [12:31:14<2:29:55,  6.89s/it]score1 tensor([[0.6875],
-        [0.6094],
-        [0.5039],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6836, 0.6211, 0.5000, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:51,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.49 | optimizer_step: 4.37
-[2025-01-25 21:17:51,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.56 | bwd_microstep: 4638.26 | bwd_inner_microstep: 4633.90 | bwd_allreduce_microstep: 4.24 | step_microstep: 43.92
-[2025-01-25 21:17:51,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.53 | bwd: 4638.29 | bwd_inner: 4633.90 | bwd_allreduce: 4.31 | step: 43.93
- 78%|███████▊  | 4495/5800 [12:31:21<2:29:55,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.5473212003707886, 'learning_rate': 5.079451577135079e-06, 'epoch': 38.75}
- 78%|███████▊  | 4495/5800 [12:31:21<2:29:55,  6.89s/it]score1 tensor([[0.5977],
-        [0.5117],
-        [0.5938],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4961, 0.6094, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:17:58,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 21:17:58,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.60 | bwd_microstep: 4633.40 | bwd_inner_microstep: 4628.79 | bwd_allreduce_microstep: 4.52 | step_microstep: 42.57
-[2025-01-25 21:17:58,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.57 | bwd: 4633.43 | bwd_inner: 4628.79 | bwd_allreduce: 4.57 | step: 42.58
- 78%|███████▊  | 4496/5800 [12:31:28<2:29:53,  6.90s/it]                                                        {'loss': 0.0112, 'grad_norm': 4.3686418533325195, 'learning_rate': 5.0720168814438356e-06, 'epoch': 38.76}
- 78%|███████▊  | 4496/5800 [12:31:28<2:29:53,  6.90s/it]score1 tensor([[0.5508],
-        [0.4336],
-        [0.4082],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4395, 0.4141, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:05,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 21:18:05,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.25 | bwd_microstep: 4633.68 | bwd_inner_microstep: 4628.55 | bwd_allreduce_microstep: 5.01 | step_microstep: 43.90
-[2025-01-25 21:18:05,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.22 | bwd: 4633.70 | bwd_inner: 4628.55 | bwd_allreduce: 5.06 | step: 43.91
- 78%|███████▊  | 4497/5800 [12:31:35<2:29:55,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 7.983582019805908, 'learning_rate': 5.064586840556751e-06, 'epoch': 38.77}
- 78%|███████▊  | 4497/5800 [12:31:35<2:29:55,  6.90s/it]score1 tensor([[0.4922],
-        [0.5000],
-        [0.5391],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.5039, 0.5391, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:12,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 21:18:12,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.65 | bwd_microstep: 4583.72 | bwd_inner_microstep: 4579.35 | bwd_allreduce_microstep: 4.30 | step_microstep: 42.51
-[2025-01-25 21:18:12,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.62 | bwd: 4583.74 | bwd_inner: 4579.35 | bwd_allreduce: 4.33 | step: 42.53
- 78%|███████▊  | 4498/5800 [12:31:42<2:29:31,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 6.112082481384277, 'learning_rate': 5.057161456790638e-06, 'epoch': 38.78}
- 78%|███████▊  | 4498/5800 [12:31:42<2:29:31,  6.89s/it]score1 tensor([[0.6094],
-        [0.4766],
-        [0.5352],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4570, 0.5352, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:18,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:18:18,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.67 | bwd_microstep: 4590.26 | bwd_inner_microstep: 4585.75 | bwd_allreduce_microstep: 4.41 | step_microstep: 43.09
-[2025-01-25 21:18:18,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.63 | bwd: 4590.28 | bwd_inner: 4585.75 | bwd_allreduce: 4.46 | step: 43.10
- 78%|███████▊  | 4499/5800 [12:31:48<2:29:22,  6.89s/it]                                                        {'loss': 0.0098, 'grad_norm': 1.6889079809188843, 'learning_rate': 5.0497407324608665e-06, 'epoch': 38.78}
- 78%|███████▊  | 4499/5800 [12:31:48<2:29:22,  6.89s/it]score1 tensor([[0.6289],
-        [0.5312],
-        [0.4453],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6328, 0.5312, 0.4395, 0.4922], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:25,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 21:18:25,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2169.67 | bwd_microstep: 4592.07 | bwd_inner_microstep: 4587.12 | bwd_allreduce_microstep: 4.86 | step_microstep: 44.47
-[2025-01-25 21:18:25,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2169.64 | bwd: 4592.09 | bwd_inner: 4587.12 | bwd_allreduce: 4.91 | step: 44.50
- 78%|███████▊  | 4500/5800 [12:31:55<2:29:09,  6.88s/it]                                                        {'loss': 0.0044, 'grad_norm': 2.4961695671081543, 'learning_rate': 5.04232466988134e-06, 'epoch': 38.79}
- 78%|███████▊  | 4500/5800 [12:31:55<2:29:09,  6.88s/it]score1 tensor([[0.6406],
-        [0.4883],
-        [0.4082],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4785, 0.4004, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:32,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 21:18:32,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.82 | bwd_microstep: 4639.68 | bwd_inner_microstep: 4634.17 | bwd_allreduce_microstep: 5.38 | step_microstep: 45.61
-[2025-01-25 21:18:32,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.79 | bwd: 4639.71 | bwd_inner: 4634.17 | bwd_allreduce: 5.46 | step: 45.62
- 78%|███████▊  | 4501/5800 [12:32:02<2:29:18,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.49857497215271, 'learning_rate': 5.034913271364525e-06, 'epoch': 38.8}
- 78%|███████▊  | 4501/5800 [12:32:02<2:29:18,  6.90s/it]score1 tensor([[0.4492],
-        [0.6133],
-        [0.4902],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.6250, 0.4902, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:39,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 5.42 | optimizer_step: 4.36
-[2025-01-25 21:18:39,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.39 | bwd_microstep: 4592.25 | bwd_inner_microstep: 4587.38 | bwd_allreduce_microstep: 4.77 | step_microstep: 45.87
-[2025-01-25 21:18:39,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.34 | bwd: 4592.28 | bwd_inner: 4587.38 | bwd_allreduce: 4.82 | step: 45.89
- 78%|███████▊  | 4502/5800 [12:32:09<2:29:02,  6.89s/it]                                                        {'loss': 0.0039, 'grad_norm': 1.6924527883529663, 'learning_rate': 5.027506539221414e-06, 'epoch': 38.81}
- 78%|███████▊  | 4502/5800 [12:32:09<2:29:02,  6.89s/it]score1 tensor([[0.4668],
-        [0.4102],
-        [0.5625],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.4004, 0.5664, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:46,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:18:46,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.24 | bwd_microstep: 4639.49 | bwd_inner_microstep: 4635.06 | bwd_allreduce_microstep: 4.33 | step_microstep: 44.00
-[2025-01-25 21:18:46,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.22 | bwd: 4639.51 | bwd_inner: 4635.06 | bwd_allreduce: 4.39 | step: 44.01
- 78%|███████▊  | 4503/5800 [12:32:16<2:29:07,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.0836433172225952, 'learning_rate': 5.020104475761565e-06, 'epoch': 38.82}
- 78%|███████▊  | 4503/5800 [12:32:16<2:29:07,  6.90s/it]score1 tensor([[0.6367],
-        [0.5195],
-        [0.4785],
-        [0.4121]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.5273, 0.4766, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:18:53,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 21:18:53,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.74 | bwd_microstep: 4640.54 | bwd_inner_microstep: 4636.06 | bwd_allreduce_microstep: 4.40 | step_microstep: 42.19
-[2025-01-25 21:18:53,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.71 | bwd: 4640.56 | bwd_inner: 4636.06 | bwd_allreduce: 4.44 | step: 42.20
- 78%|███████▊  | 4504/5800 [12:32:23<2:29:08,  6.90s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.6490595936775208, 'learning_rate': 5.012707083293062e-06, 'epoch': 38.83}
- 78%|███████▊  | 4504/5800 [12:32:23<2:29:08,  6.90s/it]score1 tensor([[0.4922],
-        [0.4609],
-        [0.5586],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.4648, 0.5508, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:00,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:19:00,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.56 | bwd_microstep: 4636.25 | bwd_inner_microstep: 4631.46 | bwd_allreduce_microstep: 4.68 | step_microstep: 42.20
-[2025-01-25 21:19:00,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.52 | bwd: 4636.27 | bwd_inner: 4631.46 | bwd_allreduce: 4.73 | step: 42.21
- 78%|███████▊  | 4505/5800 [12:32:30<2:29:03,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.7581090927124023, 'learning_rate': 5.00531436412254e-06, 'epoch': 38.84}
- 78%|███████▊  | 4505/5800 [12:32:30<2:29:03,  6.91s/it]score1 tensor([[0.6328],
-        [0.4512],
-        [0.5000],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4629, 0.4922, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:07,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 21:19:07,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.02 | bwd_microstep: 4641.34 | bwd_inner_microstep: 4636.62 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.65
-[2025-01-25 21:19:07,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.97 | bwd: 4641.36 | bwd_inner: 4636.62 | bwd_allreduce: 4.67 | step: 42.66
- 78%|███████▊  | 4506/5800 [12:32:37<2:29:01,  6.91s/it]                                                        {'loss': 0.0068, 'grad_norm': 4.3901286125183105, 'learning_rate': 4.997926320555184e-06, 'epoch': 38.84}
- 78%|███████▊  | 4506/5800 [12:32:37<2:29:01,  6.91s/it]score1 tensor([[0.4883],
-        [0.6055],
-        [0.3867],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.6133, 0.3887, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:14,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:19:14,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.09 | bwd_microstep: 4639.82 | bwd_inner_microstep: 4635.40 | bwd_allreduce_microstep: 4.34 | step_microstep: 46.67
-[2025-01-25 21:19:14,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.06 | bwd: 4639.84 | bwd_inner: 4635.40 | bwd_allreduce: 4.38 | step: 46.68
- 78%|███████▊  | 4507/5800 [12:32:44<2:28:59,  6.91s/it]                                                        {'loss': 0.0034, 'grad_norm': 0.38404974341392517, 'learning_rate': 4.990542954894704e-06, 'epoch': 38.85}
- 78%|███████▊  | 4507/5800 [12:32:44<2:28:59,  6.91s/it]score1 tensor([[0.4727],
-        [0.3691],
-        [0.4102],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.3750, 0.4004, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:21,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 21:19:21,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.76 | bwd_microstep: 4634.06 | bwd_inner_microstep: 4629.22 | bwd_allreduce_microstep: 4.75 | step_microstep: 47.64
-[2025-01-25 21:19:21,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.72 | bwd: 4634.09 | bwd_inner: 4629.22 | bwd_allreduce: 4.80 | step: 47.66
- 78%|███████▊  | 4508/5800 [12:32:51<2:28:53,  6.91s/it]                                                        {'loss': 0.0073, 'grad_norm': 0.33516523241996765, 'learning_rate': 4.9831642694433725e-06, 'epoch': 38.86}
- 78%|███████▊  | 4508/5800 [12:32:51<2:28:53,  6.91s/it]score1 tensor([[0.5000],
-        [0.6094],
-        [0.3242],
-        [0.3145]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6133, 0.3223, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:28,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:19:28,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.37 | bwd_microstep: 4633.96 | bwd_inner_microstep: 4629.26 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.14
-[2025-01-25 21:19:28,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.34 | bwd: 4633.98 | bwd_inner: 4629.26 | bwd_allreduce: 4.66 | step: 42.14
- 78%|███████▊  | 4509/5800 [12:32:58<2:28:47,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.0749099254608154, 'learning_rate': 4.975790266501987e-06, 'epoch': 38.87}
- 78%|███████▊  | 4509/5800 [12:32:58<2:28:47,  6.92s/it]score1 tensor([[0.4258],
-        [0.4609],
-        [0.3652],
-        [0.4531]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4590, 0.3672, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:34,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 21:19:34,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.29 | bwd_microstep: 4586.65 | bwd_inner_microstep: 4578.94 | bwd_allreduce_microstep: 7.61 | step_microstep: 43.60
-[2025-01-25 21:19:34,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.25 | bwd: 4586.68 | bwd_inner: 4578.94 | bwd_allreduce: 7.66 | step: 43.60
- 78%|███████▊  | 4510/5800 [12:33:04<2:28:22,  6.90s/it]                                                        {'loss': 0.002, 'grad_norm': 1.7373085021972656, 'learning_rate': 4.968420948369886e-06, 'epoch': 38.88}
- 78%|███████▊  | 4510/5800 [12:33:04<2:28:22,  6.90s/it]score1 tensor([[0.5703],
-        [0.4453],
-        [0.4023],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4453, 0.3984, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:41,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.36
-[2025-01-25 21:19:41,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.28 | bwd_microstep: 4588.59 | bwd_inner_microstep: 4583.98 | bwd_allreduce_microstep: 4.53 | step_microstep: 42.46
-[2025-01-25 21:19:41,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.23 | bwd: 4588.61 | bwd_inner: 4583.98 | bwd_allreduce: 4.57 | step: 42.47
- 78%|███████▊  | 4511/5800 [12:33:11<2:28:01,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 5.978063583374023, 'learning_rate': 4.961056317344956e-06, 'epoch': 38.89}
- 78%|███████▊  | 4511/5800 [12:33:11<2:28:01,  6.89s/it]score1 tensor([[0.5586],
-        [0.5391],
-        [0.4258],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5469, 0.4336, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:48,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:19:48,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.89 | bwd_microstep: 4640.28 | bwd_inner_microstep: 4635.66 | bwd_allreduce_microstep: 4.52 | step_microstep: 42.63
-[2025-01-25 21:19:48,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.87 | bwd: 4640.30 | bwd_inner: 4635.66 | bwd_allreduce: 4.57 | step: 42.64
- 78%|███████▊  | 4512/5800 [12:33:18<2:28:05,  6.90s/it]                                                        {'loss': 0.0068, 'grad_norm': 8.022354125976562, 'learning_rate': 4.9536963757236115e-06, 'epoch': 38.9}
- 78%|███████▊  | 4512/5800 [12:33:18<2:28:05,  6.90s/it]score1 tensor([[0.5977],
-        [0.4688],
-        [0.4336],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5781, 0.4688, 0.4375, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:19:55,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 21:19:55,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.85 | bwd_microstep: 4583.22 | bwd_inner_microstep: 4578.47 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.94
-[2025-01-25 21:19:55,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.82 | bwd: 4583.24 | bwd_inner: 4578.47 | bwd_allreduce: 4.69 | step: 42.95
- 78%|███████▊  | 4513/5800 [12:33:25<2:27:45,  6.89s/it]                                                        {'loss': 0.0117, 'grad_norm': 1.7266006469726562, 'learning_rate': 4.9463411258008175e-06, 'epoch': 38.91}
- 78%|███████▊  | 4513/5800 [12:33:25<2:27:45,  6.89s/it]score1 tensor([[0.5547],
-        [0.5469],
-        [0.4941],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5508, 0.4863, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:02,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 21:20:02,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.37 | bwd_microstep: 4642.32 | bwd_inner_microstep: 4637.48 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.95
-[2025-01-25 21:20:02,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.34 | bwd: 4642.34 | bwd_inner: 4637.48 | bwd_allreduce: 4.80 | step: 42.96
- 78%|███████▊  | 4514/5800 [12:33:32<2:27:49,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.3833417594432831, 'learning_rate': 4.938990569870057e-06, 'epoch': 38.91}
- 78%|███████▊  | 4514/5800 [12:33:32<2:27:49,  6.90s/it]score1 tensor([[0.5039],
-        [0.4648],
-        [0.6953],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.4746, 0.6953, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:09,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 21:20:09,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.89 | bwd_microstep: 4593.31 | bwd_inner_microstep: 4588.57 | bwd_allreduce_microstep: 4.65 | step_microstep: 42.62
-[2025-01-25 21:20:09,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.83 | bwd: 4593.34 | bwd_inner: 4588.58 | bwd_allreduce: 4.69 | step: 42.62
- 78%|███████▊  | 4515/5800 [12:33:39<2:27:35,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.2910850048065186, 'learning_rate': 4.931644710223375e-06, 'epoch': 38.92}
- 78%|███████▊  | 4515/5800 [12:33:39<2:27:35,  6.89s/it]score1 tensor([[0.5781],
-        [0.4141],
-        [0.5234],
-        [0.3730]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5820, 0.4121, 0.5352, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:16,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:20:16,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.80 | bwd_microstep: 4638.30 | bwd_inner_microstep: 4633.74 | bwd_allreduce_microstep: 4.48 | step_microstep: 45.80
-[2025-01-25 21:20:16,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.73 | bwd: 4638.33 | bwd_inner: 4633.74 | bwd_allreduce: 4.52 | step: 45.82
- 78%|███████▊  | 4516/5800 [12:33:46<2:27:39,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 4.210325241088867, 'learning_rate': 4.9243035491513255e-06, 'epoch': 38.93}
- 78%|███████▊  | 4516/5800 [12:33:46<2:27:39,  6.90s/it]score1 tensor([[0.6250],
-        [0.3984],
-        [0.5352],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.3906, 0.5273, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:23,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:20:23,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.37 | bwd_microstep: 4644.50 | bwd_inner_microstep: 4639.86 | bwd_allreduce_microstep: 4.57 | step_microstep: 48.15
-[2025-01-25 21:20:23,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.34 | bwd: 4644.53 | bwd_inner: 4639.86 | bwd_allreduce: 4.61 | step: 48.16
- 78%|███████▊  | 4517/5800 [12:33:53<2:27:43,  6.91s/it]                                                        {'loss': 0.0137, 'grad_norm': 7.993203163146973, 'learning_rate': 4.9169670889430235e-06, 'epoch': 38.94}
- 78%|███████▊  | 4517/5800 [12:33:53<2:27:43,  6.91s/it]score1 tensor([[0.5312],
-        [0.3867],
-        [0.4238],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.3945, 0.4355, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:30,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 21:20:30,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.94 | bwd_microstep: 4644.20 | bwd_inner_microstep: 4639.39 | bwd_allreduce_microstep: 4.71 | step_microstep: 46.42
-[2025-01-25 21:20:30,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.91 | bwd: 4644.22 | bwd_inner: 4639.39 | bwd_allreduce: 4.76 | step: 46.43
- 78%|███████▊  | 4518/5800 [12:34:00<2:27:44,  6.91s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.7885043621063232, 'learning_rate': 4.909635331886087e-06, 'epoch': 38.95}
- 78%|███████▊  | 4518/5800 [12:34:00<2:27:44,  6.91s/it]score1 tensor([[0.6523],
-        [0.4590],
-        [0.4824],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.4551, 0.4844, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:37,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.37
-[2025-01-25 21:20:37,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.44 | bwd_microstep: 4597.72 | bwd_inner_microstep: 4592.28 | bwd_allreduce_microstep: 5.32 | step_microstep: 51.74
-[2025-01-25 21:20:37,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.41 | bwd: 4597.75 | bwd_inner: 4592.28 | bwd_allreduce: 5.37 | step: 51.75
- 78%|███████▊  | 4519/5800 [12:34:07<2:27:28,  6.91s/it]                                                        {'loss': 0.0044, 'grad_norm': 2.342292547225952, 'learning_rate': 4.902308280266704e-06, 'epoch': 38.96}
- 78%|███████▊  | 4519/5800 [12:34:07<2:27:28,  6.91s/it]score1 tensor([[0.4902],
-        [0.5586],
-        [0.5039],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4863, 0.5742, 0.5117, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:43,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 21:20:43,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.66 | bwd_microstep: 4642.55 | bwd_inner_microstep: 4638.04 | bwd_allreduce_microstep: 4.43 | step_microstep: 43.02
-[2025-01-25 21:20:43,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.61 | bwd: 4642.57 | bwd_inner: 4638.04 | bwd_allreduce: 4.47 | step: 43.03
- 78%|███████▊  | 4520/5800 [12:34:13<2:27:26,  6.91s/it]                                                        {'loss': 0.0107, 'grad_norm': 0.441537082195282, 'learning_rate': 4.894985936369558e-06, 'epoch': 38.97}
- 78%|███████▊  | 4520/5800 [12:34:13<2:27:26,  6.91s/it]score1 tensor([[0.4102],
-        [0.4844],
-        [0.4766],
-        [0.3672]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4824, 0.4727, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:50,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 21:20:50,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.81 | bwd_microstep: 4640.19 | bwd_inner_microstep: 4635.07 | bwd_allreduce_microstep: 5.03 | step_microstep: 44.41
-[2025-01-25 21:20:50,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.77 | bwd: 4640.22 | bwd_inner: 4635.07 | bwd_allreduce: 5.08 | step: 44.42
- 78%|███████▊  | 4521/5800 [12:34:20<2:27:21,  6.91s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.045475006103516, 'learning_rate': 4.8876683024779e-06, 'epoch': 38.97}
- 78%|███████▊  | 4521/5800 [12:34:20<2:27:21,  6.91s/it]score1 tensor([[0.5508],
-        [0.6445],
-        [0.4160],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6445, 0.4141, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:20:57,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.36
-[2025-01-25 21:20:57,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.37 | bwd_microstep: 4549.82 | bwd_inner_microstep: 4544.08 | bwd_allreduce_microstep: 5.61 | step_microstep: 48.52
-[2025-01-25 21:20:57,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.33 | bwd: 4549.85 | bwd_inner: 4544.08 | bwd_allreduce: 5.68 | step: 48.53
- 78%|███████▊  | 4522/5800 [12:34:27<2:26:43,  6.89s/it]                                                        {'loss': 0.002, 'grad_norm': 3.8040595054626465, 'learning_rate': 4.880355380873478e-06, 'epoch': 38.98}
- 78%|███████▊  | 4522/5800 [12:34:27<2:26:43,  6.89s/it]score1 tensor([[0.5703],
-        [0.4590],
-        [0.6250],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4492, 0.6172, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:21:04,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 21:21:04,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.77 | bwd_microstep: 4644.22 | bwd_inner_microstep: 4639.20 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.65
-[2025-01-25 21:21:04,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.72 | bwd: 4644.25 | bwd_inner: 4639.20 | bwd_allreduce: 4.96 | step: 45.68
- 78%|███████▊  | 4523/5800 [12:34:34<2:26:52,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 4.129640102386475, 'learning_rate': 4.873047173836607e-06, 'epoch': 38.99}
- 78%|███████▊  | 4523/5800 [12:34:34<2:26:52,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.3906]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:21:10,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 21:21:10,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 574.53 | bwd_microstep: 1219.11 | bwd_inner_microstep: 1214.23 | bwd_allreduce_microstep: 4.77 | step_microstep: 44.56
-[2025-01-25 21:21:10,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 574.49 | bwd: 1219.13 | bwd_inner: 1214.23 | bwd_allreduce: 4.82 | step: 44.57
- 78%|███████▊  | 4524/5800 [12:34:40<2:21:33,  6.66s/it]                                                        {'loss': 0.0156, 'grad_norm': 7.0629563331604, 'learning_rate': 4.865743683646094e-06, 'epoch': 39.0}
- 78%|███████▊  | 4524/5800 [12:34:40<2:21:33,  6.66s/it][2025-01-25 21:21:15,328] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:21:25,545] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:21:35,858] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:21:45,994] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.5273],
-        [0.5195],
-        [0.4453],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.4941, 0.4512, 0.4980], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:21:59,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:21:59,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2134.66 | bwd_microstep: 4577.39 | bwd_inner_microstep: 4572.64 | bwd_allreduce_microstep: 4.66 | step_microstep: 44.94
-[2025-01-25 21:21:59,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2134.63 | bwd: 4577.41 | bwd_inner: 4572.64 | bwd_allreduce: 4.71 | step: 44.95
- 78%|███████▊  | 4525/5800 [12:35:29<6:52:50, 19.43s/it]                                                        {'loss': 0.0093, 'grad_norm': 3.897688150405884, 'learning_rate': 4.85844491257931e-06, 'epoch': 39.01}
- 78%|███████▊  | 4525/5800 [12:35:29<6:52:50, 19.43s/it]score1 tensor([[0.5312],
-        [0.3398],
-        [0.4238],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.3457, 0.4355, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:06,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.36
-[2025-01-25 21:22:06,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2133.87 | bwd_microstep: 4599.44 | bwd_inner_microstep: 4592.71 | bwd_allreduce_microstep: 6.54 | step_microstep: 47.39
-[2025-01-25 21:22:06,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2133.82 | bwd: 4599.46 | bwd_inner: 4592.71 | bwd_allreduce: 6.66 | step: 47.39
- 78%|███████▊  | 4526/5800 [12:35:36<5:32:29, 15.66s/it]                                                        {'loss': 0.0093, 'grad_norm': 7.608931541442871, 'learning_rate': 4.851150862912124e-06, 'epoch': 39.02}
- 78%|███████▊  | 4526/5800 [12:35:36<5:32:29, 15.66s/it]score1 tensor([[0.4453],
-        [0.5234],
-        [0.4043],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4531, 0.5352, 0.4004, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:13,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:22:13,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.98 | bwd_microstep: 4552.43 | bwd_inner_microstep: 4547.02 | bwd_allreduce_microstep: 5.32 | step_microstep: 47.20
-[2025-01-25 21:22:13,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.95 | bwd: 4552.52 | bwd_inner: 4547.02 | bwd_allreduce: 5.37 | step: 47.20
- 78%|███████▊  | 4527/5800 [12:35:43<4:35:59, 13.01s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.1786069869995117, 'learning_rate': 4.843861536918961e-06, 'epoch': 39.03}
- 78%|███████▊  | 4527/5800 [12:35:43<4:35:59, 13.01s/it]score1 tensor([[0.3965],
-        [0.6016],
-        [0.5430],
-        [0.4297]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.6094, 0.5508, 0.4238], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:20,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 21:22:20,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.41 | bwd_microstep: 4609.52 | bwd_inner_microstep: 4604.79 | bwd_allreduce_microstep: 4.62 | step_microstep: 44.77
-[2025-01-25 21:22:20,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.38 | bwd: 4609.54 | bwd_inner: 4604.79 | bwd_allreduce: 4.68 | step: 44.78
- 78%|███████▊  | 4528/5800 [12:35:50<3:56:47, 11.17s/it]                                                        {'loss': 0.0059, 'grad_norm': 4.308115482330322, 'learning_rate': 4.836576936872752e-06, 'epoch': 39.03}
- 78%|███████▊  | 4528/5800 [12:35:50<3:56:47, 11.17s/it]score1 tensor([[0.5508],
-        [0.4980],
-        [0.5469],
-        [0.6523]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5000, 0.5586, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:27,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 21:22:27,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.74 | bwd_microstep: 4617.35 | bwd_inner_microstep: 4609.98 | bwd_allreduce_microstep: 7.26 | step_microstep: 47.10
-[2025-01-25 21:22:27,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.69 | bwd: 4617.37 | bwd_inner: 4609.98 | bwd_allreduce: 7.31 | step: 47.10
- 78%|███████▊  | 4529/5800 [12:35:57<3:29:26,  9.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.6467443108558655, 'learning_rate': 4.829297065044956e-06, 'epoch': 39.04}
- 78%|███████▊  | 4529/5800 [12:35:57<3:29:26,  9.89s/it]score1 tensor([[0.5117],
-        [0.5508],
-        [0.4863],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5547, 0.4863, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:34,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 21:22:34,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.19 | bwd_microstep: 4569.52 | bwd_inner_microstep: 4564.65 | bwd_allreduce_microstep: 4.79 | step_microstep: 44.07
-[2025-01-25 21:22:34,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.16 | bwd: 4569.54 | bwd_inner: 4564.65 | bwd_allreduce: 4.83 | step: 44.07
- 78%|███████▊  | 4530/5800 [12:36:04<3:10:01,  8.98s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.285431385040283, 'learning_rate': 4.822021923705577e-06, 'epoch': 39.05}
- 78%|███████▊  | 4530/5800 [12:36:04<3:10:01,  8.98s/it]score1 tensor([[0.6641],
-        [0.5469],
-        [0.5977],
-        [0.4316]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5391, 0.5938, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:41,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 21:22:41,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.60 | bwd_microstep: 4609.74 | bwd_inner_microstep: 4604.60 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.28
-[2025-01-25 21:22:41,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.57 | bwd: 4609.76 | bwd_inner: 4604.60 | bwd_allreduce: 5.09 | step: 45.28
- 78%|███████▊  | 4531/5800 [12:36:11<2:56:35,  8.35s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.8391757011413574, 'learning_rate': 4.8147515151231175e-06, 'epoch': 39.06}
- 78%|███████▊  | 4531/5800 [12:36:11<2:56:35,  8.35s/it]score1 tensor([[0.4023],
-        [0.4512],
-        [0.6055],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4551, 0.6055, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:48,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.50 | optimizer_step: 4.36
-[2025-01-25 21:22:48,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.18 | bwd_microstep: 4535.67 | bwd_inner_microstep: 4530.91 | bwd_allreduce_microstep: 4.67 | step_microstep: 45.23
-[2025-01-25 21:22:48,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.15 | bwd: 4535.69 | bwd_inner: 4530.91 | bwd_allreduce: 4.72 | step: 45.24
- 78%|███████▊  | 4532/5800 [12:36:17<2:46:44,  7.89s/it]                                                        {'loss': 0.0024, 'grad_norm': 3.638204336166382, 'learning_rate': 4.807485841564625e-06, 'epoch': 39.07}
- 78%|███████▊  | 4532/5800 [12:36:17<2:46:44,  7.89s/it]score1 tensor([[0.4414],
-        [0.5586],
-        [0.3926],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4395, 0.5469, 0.3887, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:22:54,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.37
-[2025-01-25 21:22:54,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.51 | bwd_microstep: 4605.46 | bwd_inner_microstep: 4599.83 | bwd_allreduce_microstep: 5.50 | step_microstep: 48.33
-[2025-01-25 21:22:54,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.48 | bwd: 4605.49 | bwd_inner: 4599.83 | bwd_allreduce: 5.57 | step: 48.34
- 78%|███████▊  | 4533/5800 [12:36:24<2:40:13,  7.59s/it]                                                        {'loss': 0.0054, 'grad_norm': 7.819904804229736, 'learning_rate': 4.800224905295659e-06, 'epoch': 39.08}
- 78%|███████▊  | 4533/5800 [12:36:24<2:40:13,  7.59s/it]score1 tensor([[0.4941],
-        [0.3496],
-        [0.5781],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.3555, 0.5781, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:01,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:23:01,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.82 | bwd_microstep: 4572.53 | bwd_inner_microstep: 4567.61 | bwd_allreduce_microstep: 4.82 | step_microstep: 43.02
-[2025-01-25 21:23:01,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.79 | bwd: 4572.56 | bwd_inner: 4567.61 | bwd_allreduce: 4.87 | step: 43.03
- 78%|███████▊  | 4534/5800 [12:36:31<2:35:24,  7.36s/it]                                                        {'loss': 0.0039, 'grad_norm': 5.606717109680176, 'learning_rate': 4.79296870858031e-06, 'epoch': 39.09}
- 78%|███████▊  | 4534/5800 [12:36:31<2:35:24,  7.36s/it]score1 tensor([[0.4297],
-        [0.4512],
-        [0.4980],
-        [0.5586]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4492, 0.5078, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:08,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 21:23:08,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.94 | bwd_microstep: 4566.26 | bwd_inner_microstep: 4561.33 | bwd_allreduce_microstep: 4.84 | step_microstep: 45.62
-[2025-01-25 21:23:08,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.90 | bwd: 4566.29 | bwd_inner: 4561.33 | bwd_allreduce: 4.89 | step: 45.63
- 78%|███████▊  | 4535/5800 [12:36:38<2:31:58,  7.21s/it]                                                        {'loss': 0.0039, 'grad_norm': 1.7511365413665771, 'learning_rate': 4.785717253681181e-06, 'epoch': 39.09}
- 78%|███████▊  | 4535/5800 [12:36:38<2:31:58,  7.21s/it]score1 tensor([[0.6055],
-        [0.4043],
-        [0.4492],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4043, 0.4512, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:15,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.51 | optimizer_step: 4.37
-[2025-01-25 21:23:15,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.68 | bwd_microstep: 4564.84 | bwd_inner_microstep: 4558.59 | bwd_allreduce_microstep: 5.83 | step_microstep: 45.70
-[2025-01-25 21:23:15,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.65 | bwd: 4564.90 | bwd_inner: 4558.59 | bwd_allreduce: 5.89 | step: 45.71
- 78%|███████▊  | 4536/5800 [12:36:45<2:29:31,  7.10s/it]                                                        {'loss': 0.0044, 'grad_norm': 6.018586158752441, 'learning_rate': 4.778470542859399e-06, 'epoch': 39.1}
- 78%|███████▊  | 4536/5800 [12:36:45<2:29:31,  7.10s/it]score1 tensor([[0.6094],
-        [0.4746],
-        [0.4648],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4785, 0.4688, 0.6523], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:22,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:23:22,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.87 | bwd_microstep: 4617.09 | bwd_inner_microstep: 4612.07 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.19
-[2025-01-25 21:23:22,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.84 | bwd: 4617.11 | bwd_inner: 4612.07 | bwd_allreduce: 4.97 | step: 45.22
- 78%|███████▊  | 4537/5800 [12:36:52<2:28:08,  7.04s/it]                                                        {'loss': 0.0039, 'grad_norm': 3.9360876083374023, 'learning_rate': 4.771228578374625e-06, 'epoch': 39.11}
- 78%|███████▊  | 4537/5800 [12:36:52<2:28:08,  7.04s/it]score1 tensor([[0.6016],
-        [0.4727],
-        [0.6328],
-        [0.6094]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4727, 0.6289, 0.6094], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:29,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:23:29,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.86 | bwd_microstep: 4537.27 | bwd_inner_microstep: 4531.23 | bwd_allreduce_microstep: 5.90 | step_microstep: 44.68
-[2025-01-25 21:23:29,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.83 | bwd: 4537.29 | bwd_inner: 4531.23 | bwd_allreduce: 5.98 | step: 44.69
- 78%|███████▊  | 4538/5800 [12:36:59<2:26:38,  6.97s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.295392245054245, 'learning_rate': 4.763991362485016e-06, 'epoch': 39.12}
- 78%|███████▊  | 4538/5800 [12:36:59<2:26:38,  6.97s/it]score1 tensor([[0.4727],
-        [0.5469],
-        [0.5703],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4609, 0.5508, 0.5625, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:36,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 21:23:36,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.71 | bwd_microstep: 4618.32 | bwd_inner_microstep: 4613.59 | bwd_allreduce_microstep: 4.59 | step_microstep: 43.40
-[2025-01-25 21:23:36,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.67 | bwd: 4618.36 | bwd_inner: 4613.59 | bwd_allreduce: 4.65 | step: 43.41
- 78%|███████▊  | 4539/5800 [12:37:05<2:26:02,  6.95s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.36693161725997925, 'learning_rate': 4.7567588974472734e-06, 'epoch': 39.13}
- 78%|███████▊  | 4539/5800 [12:37:06<2:26:02,  6.95s/it]score1 tensor([[0.5859],
-        [0.5859],
-        [0.6055],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.5781, 0.6055, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:42,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 21:23:42,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.72 | bwd_microstep: 4576.77 | bwd_inner_microstep: 4571.54 | bwd_allreduce_microstep: 5.11 | step_microstep: 47.82
-[2025-01-25 21:23:42,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.69 | bwd: 4576.79 | bwd_inner: 4571.54 | bwd_allreduce: 5.18 | step: 47.83
- 78%|███████▊  | 4540/5800 [12:37:12<2:25:23,  6.92s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.646190643310547, 'learning_rate': 4.7495311855166e-06, 'epoch': 39.14}
- 78%|███████▊  | 4540/5800 [12:37:12<2:25:23,  6.92s/it]score1 tensor([[0.7070],
-        [0.5117],
-        [0.6055],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.5039, 0.5898, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:49,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:23:49,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.13 | bwd_microstep: 4631.52 | bwd_inner_microstep: 4626.94 | bwd_allreduce_microstep: 4.49 | step_microstep: 48.83
-[2025-01-25 21:23:49,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.08 | bwd: 4631.54 | bwd_inner: 4626.94 | bwd_allreduce: 4.53 | step: 48.85
- 78%|███████▊  | 4541/5800 [12:37:19<2:25:13,  6.92s/it]                                                        {'loss': 0.0127, 'grad_norm': 4.928693771362305, 'learning_rate': 4.742308228946714e-06, 'epoch': 39.15}
- 78%|███████▊  | 4541/5800 [12:37:19<2:25:13,  6.92s/it]score1 tensor([[0.5508],
-        [0.5508],
-        [0.4492],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5391, 0.4473, 0.5469], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:23:56,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 21:23:56,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.45 | bwd_microstep: 4637.93 | bwd_inner_microstep: 4632.98 | bwd_allreduce_microstep: 4.83 | step_microstep: 52.42
-[2025-01-25 21:23:56,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.41 | bwd: 4637.95 | bwd_inner: 4632.98 | bwd_allreduce: 4.88 | step: 52.43
- 78%|███████▊  | 4542/5800 [12:37:26<2:25:10,  6.92s/it]                                                        {'loss': 0.0093, 'grad_norm': 8.310093879699707, 'learning_rate': 4.735090029989875e-06, 'epoch': 39.16}
- 78%|███████▊  | 4542/5800 [12:37:26<2:25:10,  6.92s/it]score1 tensor([[0.5742],
-        [0.4844],
-        [0.5352],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4844, 0.5273, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:03,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.36
-[2025-01-25 21:24:03,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.30 | bwd_microstep: 4587.27 | bwd_inner_microstep: 4582.23 | bwd_allreduce_microstep: 4.92 | step_microstep: 46.34
-[2025-01-25 21:24:03,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.27 | bwd: 4587.29 | bwd_inner: 4582.23 | bwd_allreduce: 4.97 | step: 46.34
- 78%|███████▊  | 4543/5800 [12:37:33<2:24:46,  6.91s/it]                                                        {'loss': 0.0063, 'grad_norm': 6.223272800445557, 'learning_rate': 4.7278765908968276e-06, 'epoch': 39.16}
- 78%|███████▊  | 4543/5800 [12:37:33<2:24:46,  6.91s/it]score1 tensor([[0.4336],
-        [0.6484],
-        [0.3750],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.6562, 0.3652, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:10,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 21:24:10,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.05 | bwd_microstep: 4644.07 | bwd_inner_microstep: 4638.96 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.92
-[2025-01-25 21:24:10,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.00 | bwd: 4644.10 | bwd_inner: 4638.96 | bwd_allreduce: 5.06 | step: 45.93
- 78%|███████▊  | 4544/5800 [12:37:40<2:24:49,  6.92s/it]                                                        {'loss': 0.0098, 'grad_norm': 4.600212574005127, 'learning_rate': 4.7206679139168585e-06, 'epoch': 39.17}
- 78%|███████▊  | 4544/5800 [12:37:40<2:24:49,  6.92s/it]score1 tensor([[0.4316],
-        [0.4805],
-        [0.7070],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4844, 0.7070, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:17,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:24:17,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.01 | bwd_microstep: 4588.58 | bwd_inner_microstep: 4583.71 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.90
-[2025-01-25 21:24:17,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.99 | bwd: 4588.60 | bwd_inner: 4583.71 | bwd_allreduce: 4.81 | step: 44.91
- 78%|███████▊  | 4545/5800 [12:37:47<2:24:24,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 2.078458309173584, 'learning_rate': 4.713464001297747e-06, 'epoch': 39.18}
- 78%|███████▊  | 4545/5800 [12:37:47<2:24:24,  6.90s/it]score1 tensor([[0.5234],
-        [0.6211],
-        [0.5781],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.6133, 0.5820, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:24,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 21:24:24,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.45 | bwd_microstep: 4639.92 | bwd_inner_microstep: 4635.21 | bwd_allreduce_microstep: 4.59 | step_microstep: 44.42
-[2025-01-25 21:24:24,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.41 | bwd: 4639.95 | bwd_inner: 4635.21 | bwd_allreduce: 4.65 | step: 44.42
- 78%|███████▊  | 4546/5800 [12:37:54<2:24:24,  6.91s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.43069377541542053, 'learning_rate': 4.706264855285811e-06, 'epoch': 39.19}
- 78%|███████▊  | 4546/5800 [12:37:54<2:24:24,  6.91s/it]score1 tensor([[0.3770],
-        [0.3477],
-        [0.5664],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3789, 0.3340, 0.5625, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:31,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.36
-[2025-01-25 21:24:31,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.47 | bwd_microstep: 4633.25 | bwd_inner_microstep: 4628.36 | bwd_allreduce_microstep: 4.76 | step_microstep: 44.47
-[2025-01-25 21:24:31,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.43 | bwd: 4633.28 | bwd_inner: 4628.36 | bwd_allreduce: 4.83 | step: 44.47
- 78%|███████▊  | 4547/5800 [12:38:01<2:24:18,  6.91s/it]                                                        {'loss': 0.0059, 'grad_norm': 4.116915702819824, 'learning_rate': 4.699070478125863e-06, 'epoch': 39.2}
- 78%|███████▊  | 4547/5800 [12:38:01<2:24:18,  6.91s/it]score1 tensor([[0.4941],
-        [0.5195],
-        [0.4727],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5156, 0.4668, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:38,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 21:24:38,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.11 | bwd_microstep: 4631.02 | bwd_inner_microstep: 4625.82 | bwd_allreduce_microstep: 5.09 | step_microstep: 50.09
-[2025-01-25 21:24:38,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.07 | bwd: 4631.05 | bwd_inner: 4625.82 | bwd_allreduce: 5.15 | step: 50.10
- 78%|███████▊  | 4548/5800 [12:38:08<2:24:12,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.3397415578365326, 'learning_rate': 4.691880872061228e-06, 'epoch': 39.21}
- 78%|███████▊  | 4548/5800 [12:38:08<2:24:12,  6.91s/it]score1 tensor([[0.6250],
-        [0.5469],
-        [0.6016],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5469, 0.5977, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:45,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 21:24:45,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.87 | bwd_microstep: 4576.63 | bwd_inner_microstep: 4572.23 | bwd_allreduce_microstep: 4.30 | step_microstep: 46.57
-[2025-01-25 21:24:45,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.84 | bwd: 4576.65 | bwd_inner: 4572.23 | bwd_allreduce: 4.34 | step: 46.58
- 78%|███████▊  | 4549/5800 [12:38:15<2:23:46,  6.90s/it]                                                        {'loss': 0.0024, 'grad_norm': 2.8891406059265137, 'learning_rate': 4.6846960393337606e-06, 'epoch': 39.22}
- 78%|███████▊  | 4549/5800 [12:38:15<2:23:46,  6.90s/it]score1 tensor([[0.4883],
-        [0.4473],
-        [0.4883],
-        [0.5977]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.4453, 0.4961, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:51,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:24:51,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4632.20 | bwd_inner_microstep: 4626.46 | bwd_allreduce_microstep: 5.62 | step_microstep: 46.67
-[2025-01-25 21:24:51,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.96 | bwd: 4632.22 | bwd_inner: 4626.46 | bwd_allreduce: 5.69 | step: 46.68
- 78%|███████▊  | 4550/5800 [12:38:21<2:23:48,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.338943958282471, 'learning_rate': 4.677515982183806e-06, 'epoch': 39.22}
- 78%|███████▊  | 4550/5800 [12:38:21<2:23:48,  6.90s/it]score1 tensor([[0.4902],
-        [0.4902],
-        [0.4590],
-        [0.3398]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.4980, 0.4648, 0.3477], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:24:58,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 21:24:58,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.47 | bwd_microstep: 4638.11 | bwd_inner_microstep: 4633.06 | bwd_allreduce_microstep: 4.93 | step_microstep: 44.41
-[2025-01-25 21:24:58,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.44 | bwd: 4638.13 | bwd_inner: 4633.06 | bwd_allreduce: 5.00 | step: 44.42
- 78%|███████▊  | 4551/5800 [12:38:28<2:23:45,  6.91s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.5571846961975098, 'learning_rate': 4.670340702850242e-06, 'epoch': 39.23}
- 78%|███████▊  | 4551/5800 [12:38:28<2:23:45,  6.91s/it]score1 tensor([[0.5195],
-        [0.5391],
-        [0.5938],
-        [0.6875]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5430, 0.5898, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:05,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.31 | optimizer_step: 4.36
-[2025-01-25 21:25:05,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.00 | bwd_microstep: 4591.69 | bwd_inner_microstep: 4583.97 | bwd_allreduce_microstep: 7.66 | step_microstep: 42.40
-[2025-01-25 21:25:05,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.95 | bwd: 4591.71 | bwd_inner: 4583.97 | bwd_allreduce: 7.70 | step: 42.42
- 78%|███████▊  | 4552/5800 [12:38:35<2:23:22,  6.89s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.681950569152832, 'learning_rate': 4.663170203570435e-06, 'epoch': 39.24}
- 78%|███████��  | 4552/5800 [12:38:35<2:23:22,  6.89s/it]score1 tensor([[0.5625],
-        [0.5039],
-        [0.5156],
-        [0.4355]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5039, 0.5195, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:12,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:25:12,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.54 | bwd_microstep: 4578.99 | bwd_inner_microstep: 4573.94 | bwd_allreduce_microstep: 4.97 | step_microstep: 47.17
-[2025-01-25 21:25:12,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.48 | bwd: 4579.01 | bwd_inner: 4573.94 | bwd_allreduce: 5.01 | step: 47.19
- 78%|███████▊  | 4553/5800 [12:38:42<2:23:01,  6.88s/it]                                                        {'loss': 0.0034, 'grad_norm': 2.3670573234558105, 'learning_rate': 4.656004486580276e-06, 'epoch': 39.25}
- 78%|███████▊  | 4553/5800 [12:38:42<2:23:01,  6.88s/it]score1 tensor([[0.5195],
-        [0.4746],
-        [0.5547],
-        [0.3828]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4863, 0.5508, 0.3730], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:19,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:25:19,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4576.88 | bwd_inner_microstep: 4571.91 | bwd_allreduce_microstep: 4.87 | step_microstep: 48.51
-[2025-01-25 21:25:19,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4576.91 | bwd_inner: 4571.91 | bwd_allreduce: 4.92 | step: 48.52
- 79%|███████▊  | 4554/5800 [12:38:49<2:22:46,  6.88s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.9244648218154907, 'learning_rate': 4.648843554114166e-06, 'epoch': 39.26}
- 79%|███████▊  | 4554/5800 [12:38:49<2:22:46,  6.88s/it]score1 tensor([[0.6016],
-        [0.4629],
-        [0.6562],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4668, 0.6562, 0.6172], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:26,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:25:26,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.18 | bwd_microstep: 4583.09 | bwd_inner_microstep: 4578.01 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.73
-[2025-01-25 21:25:26,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.14 | bwd: 4583.11 | bwd_inner: 4578.01 | bwd_allreduce: 5.04 | step: 44.75
- 79%|███████▊  | 4555/5800 [12:38:56<2:22:36,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.420928001403809, 'learning_rate': 4.641687408404994e-06, 'epoch': 39.27}
- 79%|███████▊  | 4555/5800 [12:38:56<2:22:36,  6.87s/it]score1 tensor([[0.4766],
-        [0.4004],
-        [0.1943],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.4023, 0.1787, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:33,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 21:25:33,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.34 | bwd_microstep: 4544.89 | bwd_inner_microstep: 4537.90 | bwd_allreduce_microstep: 6.88 | step_microstep: 43.32
-[2025-01-25 21:25:33,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.30 | bwd: 4544.91 | bwd_inner: 4537.90 | bwd_allreduce: 6.94 | step: 43.33
- 79%|███████▊  | 4556/5800 [12:39:03<2:22:12,  6.86s/it]                                                        {'loss': 0.0044, 'grad_norm': 1.0361570119857788, 'learning_rate': 4.634536051684184e-06, 'epoch': 39.28}
- 79%|███████▊  | 4556/5800 [12:39:03<2:22:12,  6.86s/it]score1 tensor([[0.5508],
-        [0.5898],
-        [0.4453],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.5977, 0.4512, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:40,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:25:40,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.44 | bwd_microstep: 4630.86 | bwd_inner_microstep: 4625.17 | bwd_allreduce_microstep: 5.59 | step_microstep: 46.54
-[2025-01-25 21:25:40,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.40 | bwd: 4630.88 | bwd_inner: 4625.17 | bwd_allreduce: 5.64 | step: 46.54
- 79%|███████▊  | 4557/5800 [12:39:10<2:22:27,  6.88s/it]                                                        {'loss': 0.0083, 'grad_norm': 4.171710014343262, 'learning_rate': 4.627389486181641e-06, 'epoch': 39.28}
- 79%|███████▊  | 4557/5800 [12:39:10<2:22:27,  6.88s/it]score1 tensor([[0.5664],
-        [0.4414],
-        [0.4922],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4512, 0.4941, 0.6133], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:46,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 21:25:46,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.86 | bwd_microstep: 4542.09 | bwd_inner_microstep: 4536.94 | bwd_allreduce_microstep: 5.04 | step_microstep: 45.02
-[2025-01-25 21:25:46,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.81 | bwd: 4542.12 | bwd_inner: 4536.94 | bwd_allreduce: 5.11 | step: 45.02
- 79%|███████▊  | 4558/5800 [12:39:16<2:22:03,  6.86s/it]                                                        {'loss': 0.0029, 'grad_norm': 3.8587779998779297, 'learning_rate': 4.6202477141258025e-06, 'epoch': 39.29}
- 79%|███████▊  | 4558/5800 [12:39:16<2:22:03,  6.86s/it]score1 tensor([[0.4668],
-        [0.5469],
-        [0.6055],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.5430, 0.6211, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:25:53,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:25:53,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.67 | bwd_microstep: 4636.67 | bwd_inner_microstep: 4631.69 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.39
-[2025-01-25 21:25:53,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.63 | bwd: 4636.69 | bwd_inner: 4631.69 | bwd_allreduce: 4.93 | step: 44.40
- 79%|███████▊  | 4559/5800 [12:39:23<2:22:16,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 0.565043568611145, 'learning_rate': 4.613110737743587e-06, 'epoch': 39.3}
- 79%|███████▊  | 4559/5800 [12:39:23<2:22:16,  6.88s/it]score1 tensor([[0.5078],
-        [0.5508],
-        [0.6055],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5430, 0.6016, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:00,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 21:26:00,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.87 | bwd_microstep: 4630.85 | bwd_inner_microstep: 4625.62 | bwd_allreduce_microstep: 5.11 | step_microstep: 45.17
-[2025-01-25 21:26:00,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.82 | bwd: 4630.87 | bwd_inner: 4625.62 | bwd_allreduce: 5.17 | step: 45.18
- 79%|███████▊  | 4560/5800 [12:39:30<2:22:21,  6.89s/it]                                                        {'loss': 0.0068, 'grad_norm': 8.480569839477539, 'learning_rate': 4.605978559260422e-06, 'epoch': 39.31}
- 79%|███████▊  | 4560/5800 [12:39:30<2:22:21,  6.89s/it]score1 tensor([[0.3848],
-        [0.4688],
-        [0.5039],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3613, 0.4727, 0.4980, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:07,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:26:07,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.95 | bwd_microstep: 4639.06 | bwd_inner_microstep: 4633.95 | bwd_allreduce_microstep: 5.00 | step_microstep: 45.97
-[2025-01-25 21:26:07,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.92 | bwd: 4639.08 | bwd_inner: 4633.95 | bwd_allreduce: 5.05 | step: 45.98
- 79%|███████▊  | 4561/5800 [12:39:37<2:22:27,  6.90s/it]                                                        {'loss': 0.0098, 'grad_norm': 3.6397926807403564, 'learning_rate': 4.598851180900259e-06, 'epoch': 39.32}
- 79%|███████▊  | 4561/5800 [12:39:37<2:22:27,  6.90s/it]score1 tensor([[0.4746],
-        [0.5547],
-        [0.5625],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4766, 0.5469, 0.5430, 0.4199], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:14,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:26:14,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.29 | bwd_microstep: 4639.37 | bwd_inner_microstep: 4634.04 | bwd_allreduce_microstep: 5.23 | step_microstep: 45.82
-[2025-01-25 21:26:14,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.24 | bwd: 4639.39 | bwd_inner: 4634.04 | bwd_allreduce: 5.28 | step: 45.82
- 79%|███████▊  | 4562/5800 [12:39:44<2:22:29,  6.91s/it]                                                        {'loss': 0.0107, 'grad_norm': 4.228699207305908, 'learning_rate': 4.591728604885526e-06, 'epoch': 39.33}
- 79%|███████▊  | 4562/5800 [12:39:44<2:22:29,  6.91s/it]score1 tensor([[0.3809],
-        [0.6328],
-        [0.5312],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.6445, 0.5195, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:21,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:26:21,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.94 | bwd_microstep: 4635.24 | bwd_inner_microstep: 4630.17 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.25
-[2025-01-25 21:26:21,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.91 | bwd: 4635.27 | bwd_inner: 4630.17 | bwd_allreduce: 5.02 | step: 45.26
- 79%|███████▊  | 4563/5800 [12:39:51<2:22:30,  6.91s/it]                                                        {'loss': 0.0117, 'grad_norm': 3.6638004779815674, 'learning_rate': 4.584610833437176e-06, 'epoch': 39.34}
- 79%|███████▊  | 4563/5800 [12:39:51<2:22:30,  6.91s/it]score1 tensor([[0.5664],
-        [0.3750],
-        [0.4961],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.3672, 0.5078, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:28,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 21:26:28,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.24 | bwd_microstep: 4582.11 | bwd_inner_microstep: 4576.88 | bwd_allreduce_microstep: 5.10 | step_microstep: 46.67
-[2025-01-25 21:26:28,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.20 | bwd: 4582.15 | bwd_inner: 4576.88 | bwd_allreduce: 5.17 | step: 46.67
- 79%|███████▊  | 4564/5800 [12:39:58<2:22:06,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.44859543442726135, 'learning_rate': 4.5774978687746405e-06, 'epoch': 39.34}
- 79%|███████▊  | 4564/5800 [12:39:58<2:22:06,  6.90s/it]score1 tensor([[0.3223],
-        [0.4648],
-        [0.3613],
-        [0.4961]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3105, 0.4805, 0.3691, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:35,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.61 | optimizer_step: 4.37
-[2025-01-25 21:26:35,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.46 | bwd_microstep: 4634.40 | bwd_inner_microstep: 4629.06 | bwd_allreduce_microstep: 5.25 | step_microstep: 44.74
-[2025-01-25 21:26:35,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.43 | bwd: 4634.42 | bwd_inner: 4629.06 | bwd_allreduce: 5.30 | step: 44.74
- 79%|███████▊  | 4565/5800 [12:40:05<2:22:04,  6.90s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.34124162793159485, 'learning_rate': 4.5703897131158815e-06, 'epoch': 39.35}
- 79%|███████▊  | 4565/5800 [12:40:05<2:22:04,  6.90s/it]score1 tensor([[0.4844],
-        [0.4551],
-        [0.4355],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.4570, 0.4453, 0.4824], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:42,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:26:42,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.55 | bwd_microstep: 4632.65 | bwd_inner_microstep: 4628.00 | bwd_allreduce_microstep: 4.55 | step_microstep: 45.45
-[2025-01-25 21:26:42,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.47 | bwd: 4632.67 | bwd_inner: 4628.00 | bwd_allreduce: 4.60 | step: 45.46
- 79%|███████▊  | 4566/5800 [12:40:12<2:22:03,  6.91s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.735673189163208, 'learning_rate': 4.5632863686773355e-06, 'epoch': 39.36}
- 79%|███████▊  | 4566/5800 [12:40:12<2:22:03,  6.91s/it]score1 tensor([[0.6367],
-        [0.4922],
-        [0.5273],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.4844, 0.5312, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:49,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 7.37 | optimizer_step: 4.37
-[2025-01-25 21:26:49,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.66 | bwd_microstep: 4576.14 | bwd_inner_microstep: 4570.81 | bwd_allreduce_microstep: 5.25 | step_microstep: 48.72
-[2025-01-25 21:26:49,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.63 | bwd: 4576.17 | bwd_inner: 4570.81 | bwd_allreduce: 5.29 | step: 48.72
- 79%|███████▊  | 4567/5800 [12:40:19<2:21:39,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.1339845657348633, 'learning_rate': 4.556187837673945e-06, 'epoch': 39.37}
- 79%|███████▊  | 4567/5800 [12:40:19<2:21:39,  6.89s/it]score1 tensor([[0.6094],
-        [0.4473],
-        [0.6250],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4648, 0.6328, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:26:55,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 21:26:55,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.38 | bwd_microstep: 4586.27 | bwd_inner_microstep: 4581.61 | bwd_allreduce_microstep: 4.57 | step_microstep: 47.72
-[2025-01-25 21:26:55,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.35 | bwd: 4586.30 | bwd_inner: 4581.61 | bwd_allreduce: 4.62 | step: 47.72
- 79%|███████▉  | 4568/5800 [12:40:25<2:21:20,  6.88s/it]                                                        {'loss': 0.0073, 'grad_norm': 2.3261663913726807, 'learning_rate': 4.549094122319166e-06, 'epoch': 39.38}
- 79%|███████▉  | 4568/5800 [12:40:25<2:21:20,  6.88s/it]score1 tensor([[0.5273],
-        [0.5391],
-        [0.4863],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5430, 0.4844, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:02,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.37
-[2025-01-25 21:27:02,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.01 | bwd_microstep: 4634.35 | bwd_inner_microstep: 4627.59 | bwd_allreduce_microstep: 6.68 | step_microstep: 44.03
-[2025-01-25 21:27:02,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.97 | bwd: 4634.38 | bwd_inner: 4627.59 | bwd_allreduce: 6.72 | step: 44.03
- 79%|███████▉  | 4569/5800 [12:40:32<2:21:22,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 0.6650820374488831, 'learning_rate': 4.54200522482493e-06, 'epoch': 39.39}
- 79%|███████▉  | 4569/5800 [12:40:32<2:21:22,  6.89s/it]score1 tensor([[0.6250],
-        [0.6289],
-        [0.4668],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6367, 0.6211, 0.4688, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:09,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 21:27:09,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.09 | bwd_microstep: 4638.38 | bwd_inner_microstep: 4633.62 | bwd_allreduce_microstep: 4.65 | step_microstep: 45.33
-[2025-01-25 21:27:09,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.06 | bwd: 4638.40 | bwd_inner: 4633.62 | bwd_allreduce: 4.71 | step: 45.33
- 79%|███████▉  | 4570/5800 [12:40:39<2:21:26,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 3.840005397796631, 'learning_rate': 4.5349211474016894e-06, 'epoch': 39.4}
- 79%|███████▉  | 4570/5800 [12:40:39<2:21:26,  6.90s/it]score1 tensor([[0.4121],
-        [0.4258],
-        [0.5000],
-        [0.3223]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.4180, 0.4922, 0.3223], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:16,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 21:27:16,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4588.06 | bwd_inner_microstep: 4583.35 | bwd_allreduce_microstep: 4.61 | step_microstep: 40.61
-[2025-01-25 21:27:16,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4588.08 | bwd_inner: 4583.35 | bwd_allreduce: 4.66 | step: 40.63
- 79%|███████▉  | 4571/5800 [12:40:46<2:21:06,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 2.0544631481170654, 'learning_rate': 4.52784189225838e-06, 'epoch': 39.41}
- 79%|███████▉  | 4571/5800 [12:40:46<2:21:06,  6.89s/it]score1 tensor([[0.4629],
-        [0.5508],
-        [0.5273],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5508, 0.5273, 0.3809], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:23,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 21:27:23,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.81 | bwd_microstep: 4539.12 | bwd_inner_microstep: 4534.00 | bwd_allreduce_microstep: 5.00 | step_microstep: 44.34
-[2025-01-25 21:27:23,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.76 | bwd: 4539.15 | bwd_inner: 4533.99 | bwd_allreduce: 5.06 | step: 44.34
- 79%|███████▉  | 4572/5800 [12:40:53<2:20:34,  6.87s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.68263840675354, 'learning_rate': 4.520767461602426e-06, 'epoch': 39.41}
- 79%|███████▉  | 4572/5800 [12:40:53<2:20:34,  6.87s/it]score1 tensor([[0.5586],
-        [0.6328],
-        [0.4121],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5586, 0.6484, 0.4160, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:30,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 21:27:30,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.41 | bwd_microstep: 4542.28 | bwd_inner_microstep: 4537.26 | bwd_allreduce_microstep: 4.93 | step_microstep: 54.31
-[2025-01-25 21:27:30,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.38 | bwd: 4542.31 | bwd_inner: 4537.26 | bwd_allreduce: 4.97 | step: 54.32
- 79%|███████▉  | 4573/5800 [12:41:00<2:20:14,  6.86s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.149129867553711, 'learning_rate': 4.5136978576397714e-06, 'epoch': 39.42}
- 79%|███████▉  | 4573/5800 [12:41:00<2:20:14,  6.86s/it]score1 tensor([[0.5430],
-        [0.4395],
-        [0.3750],
-        [0.3457]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4316, 0.3750, 0.3418], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:37,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:27:37,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.69 | bwd_microstep: 4582.36 | bwd_inner_microstep: 4577.62 | bwd_allreduce_microstep: 4.67 | step_microstep: 45.66
-[2025-01-25 21:27:37,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.65 | bwd: 4582.38 | bwd_inner: 4577.62 | bwd_allreduce: 4.71 | step: 45.67
- 79%|█████��█▉  | 4574/5800 [12:41:07<2:20:12,  6.86s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.5386574268341064, 'learning_rate': 4.506633082574832e-06, 'epoch': 39.43}
- 79%|███████▉  | 4574/5800 [12:41:07<2:20:12,  6.86s/it]score1 tensor([[0.5273],
-        [0.5117],
-        [0.5273],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5117, 0.5312, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:44,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 21:27:44,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.16 | bwd_microstep: 4587.58 | bwd_inner_microstep: 4582.30 | bwd_allreduce_microstep: 5.18 | step_microstep: 48.98
-[2025-01-25 21:27:44,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.11 | bwd: 4587.60 | bwd_inner: 4582.30 | bwd_allreduce: 5.23 | step: 48.97
- 79%|███████▉  | 4575/5800 [12:41:13<2:20:13,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.9462883472442627, 'learning_rate': 4.4995731386105336e-06, 'epoch': 39.44}
- 79%|███████▉  | 4575/5800 [12:41:13<2:20:13,  6.87s/it]score1 tensor([[0.4648],
-        [0.5547],
-        [0.4355],
-        [0.3418]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.5625, 0.4453, 0.2812], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0239, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:50,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.37
-[2025-01-25 21:27:50,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.57 | bwd_microstep: 4632.98 | bwd_inner_microstep: 4627.70 | bwd_allreduce_microstep: 5.17 | step_microstep: 48.59
-[2025-01-25 21:27:50,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.53 | bwd: 4633.00 | bwd_inner: 4627.70 | bwd_allreduce: 5.23 | step: 48.60
- 79%|███████▉  | 4576/5800 [12:41:20<2:20:21,  6.88s/it]                                                        {'loss': 0.0239, 'grad_norm': 0.659433901309967, 'learning_rate': 4.492518027948283e-06, 'epoch': 39.45}
- 79%|███████▉  | 4576/5800 [12:41:20<2:20:21,  6.88s/it]score1 tensor([[0.5625],
-        [0.6133],
-        [0.4121],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.6133, 0.4141, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:27:57,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.57 | optimizer_step: 4.37
-[2025-01-25 21:27:57,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.69 | bwd_microstep: 4536.92 | bwd_inner_microstep: 4531.88 | bwd_allreduce_microstep: 4.94 | step_microstep: 51.43
-[2025-01-25 21:27:57,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.66 | bwd: 4536.95 | bwd_inner: 4531.88 | bwd_allreduce: 4.99 | step: 51.43
- 79%|███████▉  | 4577/5800 [12:41:27<2:19:53,  6.86s/it]                                                        {'loss': 0.002, 'grad_norm': 0.3579854667186737, 'learning_rate': 4.4854677527879935e-06, 'epoch': 39.46}
- 79%|███████▉  | 4577/5800 [12:41:27<2:19:53,  6.86s/it]score1 tensor([[0.4883],
-        [0.5078],
-        [0.6406],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5156, 0.6367, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:04,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.36
-[2025-01-25 21:28:04,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.84 | bwd_microstep: 4637.07 | bwd_inner_microstep: 4631.82 | bwd_allreduce_microstep: 5.14 | step_microstep: 46.24
-[2025-01-25 21:28:04,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.78 | bwd: 4637.10 | bwd_inner: 4631.82 | bwd_allreduce: 5.20 | step: 46.25
- 79%|███████▉  | 4578/5800 [12:41:34<2:20:06,  6.88s/it]                                                        {'loss': 0.0083, 'grad_norm': 3.869323968887329, 'learning_rate': 4.478422315328059e-06, 'epoch': 39.47}
- 79%|███████▉  | 4578/5800 [12:41:34<2:20:06,  6.88s/it]score1 tensor([[0.4492],
-        [0.6289],
-        [0.6094],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4512, 0.6094, 0.6094, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:11,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 21:28:11,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.94 | bwd_microstep: 4592.08 | bwd_inner_microstep: 4586.43 | bwd_allreduce_microstep: 5.53 | step_microstep: 46.14
-[2025-01-25 21:28:11,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.91 | bwd: 4592.11 | bwd_inner: 4586.43 | bwd_allreduce: 5.61 | step: 46.14
- 79%|███████▉  | 4579/5800 [12:41:41<2:19:57,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 1.5476603507995605, 'learning_rate': 4.471381717765366e-06, 'epoch': 39.47}
- 79%|███████▉  | 4579/5800 [12:41:41<2:19:57,  6.88s/it]score1 tensor([[0.5156],
-        [0.4531],
-        [0.4766],
-        [0.4434]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4414, 0.4590, 0.4531], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:18,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 21:28:18,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.97 | bwd_microstep: 4631.14 | bwd_inner_microstep: 4625.76 | bwd_allreduce_microstep: 5.28 | step_microstep: 48.47
-[2025-01-25 21:28:18,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4631.16 | bwd_inner: 4625.76 | bwd_allreduce: 5.33 | step: 48.47
- 79%|███████▉  | 4580/5800 [12:41:48<2:20:05,  6.89s/it]                                                        {'loss': 0.0107, 'grad_norm': 3.9876530170440674, 'learning_rate': 4.464345962295302e-06, 'epoch': 39.48}
- 79%|███████▉  | 4580/5800 [12:41:48<2:20:05,  6.89s/it]score1 tensor([[0.4258],
-        [0.4375],
-        [0.6133],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.4395, 0.6055, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:25,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 21:28:25,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4639.78 | bwd_inner_microstep: 4634.32 | bwd_allreduce_microstep: 5.37 | step_microstep: 51.57
-[2025-01-25 21:28:25,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.54 | bwd: 4639.81 | bwd_inner: 4634.32 | bwd_allreduce: 5.42 | step: 51.59
- 79%|███████▉  | 4581/5800 [12:41:55<2:20:13,  6.90s/it]                                                        {'loss': 0.0083, 'grad_norm': 0.9477718472480774, 'learning_rate': 4.457315051111728e-06, 'epoch': 39.49}
- 79%|███████▉  | 4581/5800 [12:41:55<2:20:13,  6.90s/it]score1 tensor([[0.4316],
-        [0.4258],
-        [0.5703],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4297, 0.4355, 0.5820, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:32,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:28:32,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.97 | bwd_microstep: 4633.77 | bwd_inner_microstep: 4628.06 | bwd_allreduce_microstep: 5.56 | step_microstep: 49.32
-[2025-01-25 21:28:32,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.93 | bwd: 4633.80 | bwd_inner: 4628.06 | bwd_allreduce: 5.65 | step: 49.33
- 79%|███████▉  | 4582/5800 [12:42:02<2:20:15,  6.91s/it]                                                        {'loss': 0.0068, 'grad_norm': 4.125362873077393, 'learning_rate': 4.450288986407019e-06, 'epoch': 39.5}
- 79%|███████▉  | 4582/5800 [12:42:02<2:20:15,  6.91s/it]score1 tensor([[0.5703],
-        [0.4961],
-        [0.5820],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.4785, 0.6055, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:39,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 21:28:39,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.67 | bwd_microstep: 4626.80 | bwd_inner_microstep: 4621.72 | bwd_allreduce_microstep: 4.97 | step_microstep: 43.71
-[2025-01-25 21:28:39,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.63 | bwd: 4626.83 | bwd_inner: 4621.73 | bwd_allreduce: 5.02 | step: 43.72
- 79%|███████▉  | 4583/5800 [12:42:09<2:20:10,  6.91s/it]                                                        {'loss': 0.0132, 'grad_norm': 0.47425323724746704, 'learning_rate': 4.443267770372011e-06, 'epoch': 39.51}
- 79%|███████▉  | 4583/5800 [12:42:09<2:20:10,  6.91s/it]score1 tensor([[0.4668],
-        [0.5625],
-        [0.4863],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.5664, 0.4863, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:46,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 21:28:46,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.78 | bwd_microstep: 4583.81 | bwd_inner_microstep: 4578.54 | bwd_allreduce_microstep: 5.16 | step_microstep: 46.84
-[2025-01-25 21:28:46,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.74 | bwd: 4583.84 | bwd_inner: 4578.54 | bwd_allreduce: 5.22 | step: 46.84
- 79%|███████▉  | 4584/5800 [12:42:16<2:19:49,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.8925505876541138, 'learning_rate': 4.4362514051960415e-06, 'epoch': 39.52}
- 79%|███████▉  | 4584/5800 [12:42:16<2:19:49,  6.90s/it]score1 tensor([[0.4277],
-        [0.4199],
-        [0.4551],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4160, 0.4375, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:52,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.60 | optimizer_step: 4.37
-[2025-01-25 21:28:52,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.19 | bwd_microstep: 4580.29 | bwd_inner_microstep: 4575.56 | bwd_allreduce_microstep: 4.63 | step_microstep: 43.99
-[2025-01-25 21:28:52,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.14 | bwd: 4580.31 | bwd_inner: 4575.56 | bwd_allreduce: 4.68 | step: 44.00
- 79%|███████▉  | 4585/5800 [12:42:22<2:19:29,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 5.752233982086182, 'learning_rate': 4.429239893066939e-06, 'epoch': 39.53}
- 79%|███████▉  | 4585/5800 [12:42:22<2:19:29,  6.89s/it]score1 tensor([[0.5703],
-        [0.3535],
-        [0.4941],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3438, 0.4922, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:28:59,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 21:28:59,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.35 | bwd_microstep: 4636.41 | bwd_inner_microstep: 4631.23 | bwd_allreduce_microstep: 5.05 | step_microstep: 47.03
-[2025-01-25 21:28:59,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.32 | bwd: 4636.44 | bwd_inner: 4631.23 | bwd_allreduce: 5.12 | step: 47.04
- 79%|███████▉  | 4586/5800 [12:42:29<2:19:32,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 8.033376693725586, 'learning_rate': 4.422233236171018e-06, 'epoch': 39.53}
- 79%|███████▉  | 4586/5800 [12:42:29<2:19:32,  6.90s/it]score1 tensor([[0.4805],
-        [0.4473],
-        [0.4219],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4473, 0.4180, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:06,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 21:29:06,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.82 | bwd_microstep: 4577.32 | bwd_inner_microstep: 4571.65 | bwd_allreduce_microstep: 5.54 | step_microstep: 47.50
-[2025-01-25 21:29:06,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.76 | bwd: 4577.34 | bwd_inner: 4571.65 | bwd_allreduce: 5.60 | step: 47.51
- 79%|███████▉  | 4587/5800 [12:42:36<2:19:11,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 5.886017799377441, 'learning_rate': 4.415231436693066e-06, 'epoch': 39.54}
- 79%|███████▉  | 4587/5800 [12:42:36<2:19:11,  6.89s/it]score1 tensor([[0.6406],
-        [0.5547],
-        [0.5117],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5508, 0.5039, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:13,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:29:13,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.45 | bwd_microstep: 4628.45 | bwd_inner_microstep: 4623.06 | bwd_allreduce_microstep: 5.30 | step_microstep: 45.42
-[2025-01-25 21:29:13,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.42 | bwd: 4628.47 | bwd_inner: 4623.06 | bwd_allreduce: 5.34 | step: 45.42
- 79%|███████▉  | 4588/5800 [12:42:43<2:19:14,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.5556801557540894, 'learning_rate': 4.408234496816377e-06, 'epoch': 39.55}
- 79%|███████▉  | 4588/5800 [12:42:43<2:19:14,  6.89s/it]score1 tensor([[0.6172],
-        [0.3828],
-        [0.6211],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.3789, 0.6172, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:20,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 21:29:20,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4597.90 | bwd_inner_microstep: 4593.43 | bwd_allreduce_microstep: 4.37 | step_microstep: 41.65
-[2025-01-25 21:29:20,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.66 | bwd: 4597.92 | bwd_inner: 4593.43 | bwd_allreduce: 4.41 | step: 41.67
- 79%|███████▉  | 4589/5800 [12:42:50<2:19:01,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 5.963225841522217, 'learning_rate': 4.401242418722709e-06, 'epoch': 39.56}
- 79%|███████▉  | 4589/5800 [12:42:50<2:19:01,  6.89s/it]score1 tensor([[0.6641],
-        [0.6289],
-        [0.5391],
-        [0.6797]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6875, 0.6289, 0.5352, 0.6836], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:27,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:29:27,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.77 | bwd_microstep: 4575.01 | bwd_inner_microstep: 4570.08 | bwd_allreduce_microstep: 4.80 | step_microstep: 46.44
-[2025-01-25 21:29:27,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.73 | bwd: 4575.03 | bwd_inner: 4570.08 | bwd_allreduce: 4.87 | step: 46.45
- 79%|███████▉  | 4590/5800 [12:42:57<2:18:44,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 2.818101644515991, 'learning_rate': 4.394255204592322e-06, 'epoch': 39.57}
- 79%|███████▉  | 4590/5800 [12:42:57<2:18:44,  6.88s/it]score1 tensor([[0.5000],
-        [0.6875],
-        [0.6719],
-        [0.4082]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.6953, 0.6797, 0.4160], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:34,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:29:34,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.40 | bwd_microstep: 4630.09 | bwd_inner_microstep: 4624.74 | bwd_allreduce_microstep: 5.21 | step_microstep: 45.94
-[2025-01-25 21:29:34,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.37 | bwd: 4630.12 | bwd_inner: 4624.74 | bwd_allreduce: 5.29 | step: 45.95
- 79%|███████▉  | 4591/5800 [12:43:04<2:18:49,  6.89s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.712545394897461, 'learning_rate': 4.387272856603937e-06, 'epoch': 39.58}
- 79%|███████▉  | 4591/5800 [12:43:04<2:18:49,  6.89s/it]score1 tensor([[0.4355],
-        [0.3574],
-        [0.4941],
-        [0.3535]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4336, 0.3672, 0.4980, 0.3516], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:41,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 21:29:41,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.90 | bwd_microstep: 4632.04 | bwd_inner_microstep: 4627.13 | bwd_allreduce_microstep: 4.81 | step_microstep: 45.32
-[2025-01-25 21:29:41,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.87 | bwd: 4632.06 | bwd_inner: 4627.13 | bwd_allreduce: 4.86 | step: 45.32
- 79%|███████▉  | 4592/5800 [12:43:11<2:18:50,  6.90s/it]                                                        {'loss': 0.0044, 'grad_norm': 0.3802138566970825, 'learning_rate': 4.380295376934787e-06, 'epoch': 39.59}
- 79%|███████▉  | 4592/5800 [12:43:11<2:18:50,  6.90s/it]score1 tensor([[0.3672],
-        [0.5000],
-        [0.5820],
-        [0.4453]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4961, 0.5781, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:48,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 21:29:48,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.90 | bwd_microstep: 4636.99 | bwd_inner_microstep: 4632.23 | bwd_allreduce_microstep: 4.66 | step_microstep: 43.09
-[2025-01-25 21:29:48,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.87 | bwd: 4637.03 | bwd_inner: 4632.23 | bwd_allreduce: 4.72 | step: 43.09
- 79%|███████▉  | 4593/5800 [12:43:18<2:18:50,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.8761264681816101, 'learning_rate': 4.37332276776056e-06, 'epoch': 39.59}
- 79%|███████▉  | 4593/5800 [12:43:18<2:18:50,  6.90s/it]score1 tensor([[0.5664],
-        [0.3555],
-        [0.4961],
-        [0.5820]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5703, 0.3730, 0.4805, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:29:55,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 21:29:55,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.54 | bwd_microstep: 4639.31 | bwd_inner_microstep: 4633.63 | bwd_allreduce_microstep: 5.56 | step_microstep: 46.69
-[2025-01-25 21:29:55,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.51 | bwd: 4639.34 | bwd_inner: 4633.63 | bwd_allreduce: 5.62 | step: 46.70
- 79%|███████▉  | 4594/5800 [12:43:25<2:18:50,  6.91s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.5971795916557312, 'learning_rate': 4.3663550312554425e-06, 'epoch': 39.6}
- 79%|███████▉  | 4594/5800 [12:43:25<2:18:50,  6.91s/it]score1 tensor([[0.4590],
-        [0.4492],
-        [0.4219],
-        [0.4277]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4570, 0.4297, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:30:01,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:30:01,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.36 | bwd_microstep: 4594.87 | bwd_inner_microstep: 4590.33 | bwd_allreduce_microstep: 4.44 | step_microstep: 43.31
-[2025-01-25 21:30:01,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.32 | bwd: 4594.89 | bwd_inner: 4590.33 | bwd_allreduce: 4.49 | step: 43.32
- 79%|███████▉  | 4595/5800 [12:43:31<2:18:32,  6.90s/it]                                                        {'loss': 0.0068, 'grad_norm': 5.623906135559082, 'learning_rate': 4.359392169592091e-06, 'epoch': 39.61}
- 79%|███████▉  | 4595/5800 [12:43:31<2:18:32,  6.90s/it]score1 tensor([[0.5586],
-        [0.4805],
-        [0.5078],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4727, 0.5195, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:30:08,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 21:30:08,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.62 | bwd_microstep: 4642.38 | bwd_inner_microstep: 4637.19 | bwd_allreduce_microstep: 5.10 | step_microstep: 45.30
-[2025-01-25 21:30:08,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.59 | bwd: 4642.40 | bwd_inner: 4637.19 | bwd_allreduce: 5.15 | step: 45.31
- 79%|███████▉  | 4596/5800 [12:43:38<2:18:36,  6.91s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.5080921053886414, 'learning_rate': 4.352434184941654e-06, 'epoch': 39.62}
- 79%|███████▉  | 4596/5800 [12:43:38<2:18:36,  6.91s/it]score1 tensor([[0.4941],
-        [0.6016],
-        [0.5469],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.6094, 0.5469, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:30:15,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:30:15,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.50 | bwd_microstep: 4585.66 | bwd_inner_microstep: 4580.47 | bwd_allreduce_microstep: 5.08 | step_microstep: 48.36
-[2025-01-25 21:30:15,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.46 | bwd: 4585.68 | bwd_inner: 4580.47 | bwd_allreduce: 5.14 | step: 48.37
- 79%|███████▉  | 4597/5800 [12:43:45<2:18:17,  6.90s/it]                                                        {'loss': 0.0063, 'grad_norm': 1.6949913501739502, 'learning_rate': 4.345481079473745e-06, 'epoch': 39.63}
- 79%|███████▉  | 4597/5800 [12:43:45<2:18:17,  6.90s/it]score1 tensor([[0.4863],
-        [0.4043],
-        [0.6562],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.4121, 0.6562, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:30:22,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 21:30:22,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.73 | bwd_microstep: 4583.98 | bwd_inner_microstep: 4579.23 | bwd_allreduce_microstep: 4.65 | step_microstep: 44.33
-[2025-01-25 21:30:22,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.71 | bwd: 4584.01 | bwd_inner: 4579.23 | bwd_allreduce: 4.70 | step: 44.33
- 79%|███████▉  | 4598/5800 [12:43:52<2:17:59,  6.89s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.9597166776657104, 'learning_rate': 4.338532855356463e-06, 'epoch': 39.64}
- 79%|███████▉  | 4598/5800 [12:43:52<2:17:59,  6.89s/it]score1 tensor([[0.5234],
-        [0.4453],
-        [0.4629],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4551, 0.4648, 0.4883], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:30:29,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 21:30:29,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.24 | bwd_microstep: 4634.88 | bwd_inner_microstep: 4630.05 | bwd_allreduce_microstep: 4.73 | step_microstep: 44.85
-[2025-01-25 21:30:29,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4634.90 | bwd_inner: 4630.05 | bwd_allreduce: 4.78 | step: 44.85
- 79%|███████▉  | 4599/5800 [12:43:59<2:18:02,  6.90s/it]                                                        {'loss': 0.0059, 'grad_norm': 7.856596946716309, 'learning_rate': 4.331589514756391e-06, 'epoch': 39.65}
- 79%|███████▉  | 4599/5800 [12:43:59<2:18:02,  6.90s/it]score1 tensor([[0.3906],
-        [0.5586],
-        [0.3945],
-        [0.4258]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4023, 0.5508, 0.4043, 0.4141], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:30:36,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 21:30:36,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.59 | bwd_microstep: 4636.82 | bwd_inner_microstep: 4632.13 | bwd_allreduce_microstep: 4.59 | step_microstep: 42.80
-[2025-01-25 21:30:36,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.55 | bwd: 4636.84 | bwd_inner: 4632.13 | bwd_allreduce: 4.64 | step: 42.80
- 79%|███████▉  | 4600/5800 [12:44:06<2:18:03,  6.90s/it]                                                        {'loss': 0.0103, 'grad_norm': 0.714084804058075, 'learning_rate': 4.324651059838572e-06, 'epoch': 39.66}
- 79%|███████▉  | 4600/5800 [12:44:06<2:18:03,  6.90s/it]evaluate!
-score1 tensor([[0.4707]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4766, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6328]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5742, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5547, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5195]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5977, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0781, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1406, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3945, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0488, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5000, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4863]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4316]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4414, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6797, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1445, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4043]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4258, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0215, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3965, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3750, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1328, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0859, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6523, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5859]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4355, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1074, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5078]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5000]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3262, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1738, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4062]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4863, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0801, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4727]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1758, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6055, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0703, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4570, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7031, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1250, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4668]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4336, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0332, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5391]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5391, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4473, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4453, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.7109, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1211, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3691, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1191, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4609]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4180, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3223, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1465, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0391, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4883]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4648, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4512]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1094, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4590, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0176, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4922]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5117, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4961]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4551, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4688]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5234, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5312, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5820, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4199, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4082]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3926, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5352]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0820, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0469, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5938]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5703, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4727, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5547]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4434, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1113, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4941, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5430]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5352, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4824]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4883, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0723, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4219, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4844, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4805, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5586]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0586, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5703]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5898, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.6406]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6641, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4688, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4277]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4570]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4297, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0273, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4180]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4609, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4805]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6172, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4492, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4023]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3730, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0293, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5312]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6445, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1133, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6484, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1738, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4551]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5430, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5039]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5273, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0234, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4531]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3867, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4746]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4668, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5234]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4922, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5664]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5625, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5898]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6250, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5625]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6289, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0664, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4414]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0371, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4902]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4062, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0840, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5273]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4395, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5664, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1289, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4434]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3555, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0879, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4766]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4785, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5508]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5078, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0430, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5820]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6016, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4629]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0352, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5781]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5781, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0., device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4844]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3477, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1367, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5117]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.6133, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.5469]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.5156, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4395]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3379, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.1016, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4375]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3418, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0957, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4785]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3848, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0938, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4238]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.3828, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0410, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4590]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4277, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0312, device='cuda:0', dtype=torch.bfloat16)
-score1 tensor([[0.4141]], device='cuda:0', dtype=torch.bfloat16)
-mos tensor(0.4043, device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16)
-Results saved to /DATA/DATA1/wjr/intern/InternVL/internvl_chat/new_train_result/stage2/mos3_test.csv
-Accuracy: 0.0
-SRCC_score: 0.6591838598986725
-PLCC_score: 0.6579413770922811
-KRCC_score: 0.4753067271611655
-SRCC_level: 0.6591838598986725
-PLCC_level: 0.6579413770922811
-KRCC_level: 0.4753067271611655
-score1 tensor([[0.5430],
-        [0.6445],
-        [0.4902],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.6406, 0.4980, 0.6250], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:41:29,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.36
-[2025-01-25 21:41:29,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2137.27 | bwd_microstep: 4555.75 | bwd_inner_microstep: 4551.03 | bwd_allreduce_microstep: 4.63 | step_microstep: 44.74
-[2025-01-25 21:41:29,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2137.23 | bwd: 4555.77 | bwd_inner: 4551.03 | bwd_allreduce: 4.67 | step: 44.75
- 79%|███████▉  | 4601/5800 [12:54:59<66:54:08, 200.87s/it]                                                          {'loss': 0.0049, 'grad_norm': 2.5269906520843506, 'learning_rate': 4.317717492766551e-06, 'epoch': 39.66}
- 79%|███████▉  | 4601/5800 [12:54:59<66:54:08, 200.87s/it]score1 tensor([[0.5469],
-        [0.4570],
-        [0.3711],
-        [0.6562]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4570, 0.3711, 0.6641], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:41:36,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 21:41:36,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2143.47 | bwd_microstep: 4493.62 | bwd_inner_microstep: 4488.86 | bwd_allreduce_microstep: 4.67 | step_microstep: 44.75
-[2025-01-25 21:41:36,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2143.45 | bwd: 4493.65 | bwd_inner: 4488.86 | bwd_allreduce: 4.71 | step: 44.76
- 79%|███████▉  | 4602/5800 [12:55:06<47:28:01, 142.64s/it]                                                          {'loss': 0.0029, 'grad_norm': 0.44179055094718933, 'learning_rate': 4.310788815702325e-06, 'epoch': 39.67}
- 79%|███████▉  | 4602/5800 [12:55:06<47:28:01, 142.64s/it]score1 tensor([[0.5625],
-        [0.6484],
-        [0.3926],
-        [0.4043]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.6406, 0.4043, 0.3945], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:41:43,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 21:41:43,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2132.62 | bwd_microstep: 4578.87 | bwd_inner_microstep: 4573.70 | bwd_allreduce_microstep: 5.06 | step_microstep: 44.82
-[2025-01-25 21:41:43,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2132.59 | bwd: 4578.90 | bwd_inner: 4573.70 | bwd_allreduce: 5.11 | step: 44.83
- 79%|███████▉  | 4603/5800 [12:55:13<33:52:48, 101.89s/it]                                                          {'loss': 0.0083, 'grad_norm': 0.49225950241088867, 'learning_rate': 4.303865030806374e-06, 'epoch': 39.68}
- 79%|███████▉  | 4603/5800 [12:55:13<33:52:48, 101.89s/it]score1 tensor([[0.5195],
-        [0.5000],
-        [0.4492],
-        [0.4414]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5156, 0.4883, 0.4551, 0.4512], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:41:50,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 21:41:50,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2135.47 | bwd_microstep: 4582.79 | bwd_inner_microstep: 4577.70 | bwd_allreduce_microstep: 4.99 | step_microstep: 44.62
-[2025-01-25 21:41:50,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2135.44 | bwd: 4582.82 | bwd_inner: 4577.70 | bwd_allreduce: 5.04 | step: 44.63
- 79%|███████▉  | 4604/5800 [12:55:20<24:22:39, 73.38s/it]                                                          {'loss': 0.0078, 'grad_norm': 0.5466083288192749, 'learning_rate': 4.296946140237661e-06, 'epoch': 39.69}
- 79%|███████▉  | 4604/5800 [12:55:20<24:22:39, 73.38s/it]score1 tensor([[0.4922],
-        [0.4805],
-        [0.6250],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4570, 0.6289, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0156, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:41:57,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 21:41:57,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2140.64 | bwd_microstep: 4594.20 | bwd_inner_microstep: 4589.48 | bwd_allreduce_microstep: 4.64 | step_microstep: 42.59
-[2025-01-25 21:41:57,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2140.61 | bwd: 4594.22 | bwd_inner: 4589.48 | bwd_allreduce: 4.68 | step: 42.60
- 79%|███████▉  | 4605/5800 [12:55:27<17:43:55, 53.42s/it]                                                         {'loss': 0.0156, 'grad_norm': 4.594160556793213, 'learning_rate': 4.290032146153611e-06, 'epoch': 39.7}
- 79%|███████▉  | 4605/5800 [12:55:27<17:43:55, 53.42s/it]score1 tensor([[0.4668],
-        [0.6055],
-        [0.4336],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6094, 0.4414, 0.5430], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:03,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.37
-[2025-01-25 21:42:03,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.32 | bwd_microstep: 4560.15 | bwd_inner_microstep: 4555.40 | bwd_allreduce_microstep: 4.66 | step_microstep: 44.34
-[2025-01-25 21:42:03,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.28 | bwd: 4560.18 | bwd_inner: 4555.40 | bwd_allreduce: 4.70 | step: 44.35
- 79%|███████▉  | 4606/5800 [12:55:33<13:04:49, 39.44s/it]                                                         {'loss': 0.0034, 'grad_norm': 6.029141426086426, 'learning_rate': 4.283123050710132e-06, 'epoch': 39.71}
- 79%|███████▉  | 4606/5800 [12:55:33<13:04:49, 39.44s/it]score1 tensor([[0.4180],
-        [0.4941],
-        [0.5312],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4219, 0.5039, 0.5391, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:10,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:42:10,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.37 | bwd_microstep: 4600.02 | bwd_inner_microstep: 4595.00 | bwd_allreduce_microstep: 4.92 | step_microstep: 44.65
-[2025-01-25 21:42:10,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.33 | bwd: 4600.04 | bwd_inner: 4595.00 | bwd_allreduce: 4.97 | step: 44.65
- 79%|███████▉  | 4607/5800 [12:55:40<9:49:50, 29.67s/it]                                                         {'loss': 0.0059, 'grad_norm': 7.826103210449219, 'learning_rate': 4.276218856061593e-06, 'epoch': 39.72}
- 79%|███████▉  | 4607/5800 [12:55:40<9:49:50, 29.67s/it]score1 tensor([[0.4609],
-        [0.4219],
-        [0.4473],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4629, 0.4180, 0.4492, 0.5117], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:17,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.62 | optimizer_step: 4.37
-[2025-01-25 21:42:17,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.36 | bwd_microstep: 4608.89 | bwd_inner_microstep: 4603.57 | bwd_allreduce_microstep: 5.22 | step_microstep: 44.78
-[2025-01-25 21:42:17,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.31 | bwd: 4608.91 | bwd_inner: 4603.57 | bwd_allreduce: 5.27 | step: 44.78
- 79%|███████▉  | 4608/5800 [12:55:47<7:33:29, 22.83s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.009204864501953, 'learning_rate': 4.269319564360852e-06, 'epoch': 39.72}
- 79%|███████▉  | 4608/5800 [12:55:47<7:33:29, 22.83s/it]score1 tensor([[0.3789],
-        [0.4062],
-        [0.4785],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3262, 0.3984, 0.4785, 0.5078], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0161, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:24,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 21:42:24,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2149.22 | bwd_microstep: 4555.74 | bwd_inner_microstep: 4550.71 | bwd_allreduce_microstep: 4.91 | step_microstep: 45.20
-[2025-01-25 21:42:24,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2149.19 | bwd: 4555.76 | bwd_inner: 4550.71 | bwd_allreduce: 4.97 | step: 45.21
- 79%|███████▉  | 4609/5800 [12:55:54<5:57:48, 18.03s/it]                                                        {'loss': 0.0161, 'grad_norm': 5.547990322113037, 'learning_rate': 4.262425177759222e-06, 'epoch': 39.73}
- 79%|███████▉  | 4609/5800 [12:55:54<5:57:48, 18.03s/it]score1 tensor([[0.5156],
-        [0.4492],
-        [0.4062],
-        [0.3926]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5000, 0.4453, 0.4043, 0.3750], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:31,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.56 | optimizer_step: 4.37
-[2025-01-25 21:42:31,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.17 | bwd_microstep: 4610.93 | bwd_inner_microstep: 4605.75 | bwd_allreduce_microstep: 5.08 | step_microstep: 46.45
-[2025-01-25 21:42:31,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2147.14 | bwd: 4610.95 | bwd_inner: 4605.74 | bwd_allreduce: 5.14 | step: 46.46
- 79%|███████▉  | 4610/5800 [12:56:01<4:51:11, 14.68s/it]                                                        {'loss': 0.0098, 'grad_norm': 7.450813293457031, 'learning_rate': 4.255535698406488e-06, 'epoch': 39.74}
- 79%|███████▉  | 4610/5800 [12:56:01<4:51:11, 14.68s/it]score1 tensor([[0.5625],
-        [0.3965],
-        [0.5039],
-        [0.4336]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5625, 0.3750, 0.4941, 0.4336], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:38,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:42:38,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2150.84 | bwd_microstep: 4527.20 | bwd_inner_microstep: 4522.57 | bwd_allreduce_microstep: 4.51 | step_microstep: 43.68
-[2025-01-25 21:42:38,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2150.81 | bwd: 4527.22 | bwd_inner: 4522.57 | bwd_allreduce: 4.57 | step: 43.69
- 80%|███████▉  | 4611/5800 [12:56:08<4:04:05, 12.32s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.7738120555877686, 'learning_rate': 4.248651128450916e-06, 'epoch': 39.75}
- 80%|███████▉  | 4611/5800 [12:56:08<4:04:05, 12.32s/it]score1 tensor([[0.6289],
-        [0.5469],
-        [0.5664],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.5391, 0.5547, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:45,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.31 | optimizer_step: 4.37
-[2025-01-25 21:42:45,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.23 | bwd_microstep: 4570.43 | bwd_inner_microstep: 4566.08 | bwd_allreduce_microstep: 4.28 | step_microstep: 40.77
-[2025-01-25 21:42:45,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.19 | bwd: 4570.45 | bwd_inner: 4566.08 | bwd_allreduce: 4.31 | step: 40.78
- 80%|███████▉  | 4612/5800 [12:56:15<3:31:22, 10.68s/it]                                                        {'loss': 0.0088, 'grad_norm': 1.9723399877548218, 'learning_rate': 4.2417714700392285e-06, 'epoch': 39.76}
- 80%|███████▉  | 4612/5800 [12:56:15<3:31:22, 10.68s/it]score1 tensor([[0.4570],
-        [0.4395],
-        [0.4082],
-        [0.6484]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4277, 0.4004, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:51,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:42:51,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.85 | bwd_microstep: 4607.42 | bwd_inner_microstep: 4602.55 | bwd_allreduce_microstep: 4.77 | step_microstep: 42.38
-[2025-01-25 21:42:51,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.82 | bwd: 4607.44 | bwd_inner: 4602.55 | bwd_allreduce: 4.82 | step: 42.39
- 80%|███████▉  | 4613/5800 [12:56:21<3:08:38,  9.54s/it]                                                        {'loss': 0.0083, 'grad_norm': 7.939825057983398, 'learning_rate': 4.234896725316631e-06, 'epoch': 39.77}
- 80%|███████▉  | 4613/5800 [12:56:21<3:08:38,  9.54s/it]score1 tensor([[0.5273],
-        [0.6562],
-        [0.5508],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.6445, 0.5508, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:42:58,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 21:42:58,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.91 | bwd_microstep: 4566.67 | bwd_inner_microstep: 4561.19 | bwd_allreduce_microstep: 5.36 | step_microstep: 48.43
-[2025-01-25 21:42:58,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.89 | bwd: 4566.70 | bwd_inner: 4561.19 | bwd_allreduce: 5.42 | step: 48.43
- 80%|███████▉  | 4614/5800 [12:56:28<2:52:30,  8.73s/it]                                                        {'loss': 0.0059, 'grad_norm': 6.401241779327393, 'learning_rate': 4.22802689642678e-06, 'epoch': 39.78}
- 80%|███████▉  | 4614/5800 [12:56:28<2:52:30,  8.73s/it]score1 tensor([[0.6055],
-        [0.6641],
-        [0.4434],
-        [0.5508]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.6602, 0.4473, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:05,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:43:05,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.76 | bwd_microstep: 4612.59 | bwd_inner_microstep: 4607.53 | bwd_allreduce_microstep: 4.96 | step_microstep: 43.71
-[2025-01-25 21:43:05,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.73 | bwd: 4612.62 | bwd_inner: 4607.53 | bwd_allreduce: 5.01 | step: 43.71
- 80%|███████▉  | 4615/5800 [12:56:35<2:41:30,  8.18s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.8717269897460938, 'learning_rate': 4.221161985511808e-06, 'epoch': 39.78}
- 80%|███████▉  | 4615/5800 [12:56:35<2:41:30,  8.18s/it]score1 tensor([[0.5391],
-        [0.4199],
-        [0.4180],
-        [0.4668]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.4062, 0.4141, 0.4648], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:12,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 21:43:12,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.36 | bwd_microstep: 4561.79 | bwd_inner_microstep: 4556.70 | bwd_allreduce_microstep: 5.01 | step_microstep: 46.14
-[2025-01-25 21:43:12,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.32 | bwd: 4561.82 | bwd_inner: 4556.70 | bwd_allreduce: 5.05 | step: 46.15
- 80%|███████▉  | 4616/5800 [12:56:42<2:33:24,  7.77s/it]                                                        {'loss': 0.0049, 'grad_norm': 5.523972511291504, 'learning_rate': 4.21430199471232e-06, 'epoch': 39.79}
- 80%|███████▉  | 4616/5800 [12:56:42<2:33:24,  7.77s/it]score1 tensor([[0.5547],
-        [0.5781],
-        [0.4961],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.5781, 0.4883, 0.6719], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:19,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.36
-[2025-01-25 21:43:19,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.14 | bwd_microstep: 4561.87 | bwd_inner_microstep: 4556.98 | bwd_allreduce_microstep: 4.79 | step_microstep: 46.06
-[2025-01-25 21:43:19,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.10 | bwd: 4561.89 | bwd_inner: 4556.98 | bwd_allreduce: 4.84 | step: 46.07
- 80%|███████▉  | 4617/5800 [12:56:49<2:27:46,  7.49s/it]                                                        {'loss': 0.0068, 'grad_norm': 1.8169499635696411, 'learning_rate': 4.207446926167371e-06, 'epoch': 39.8}
- 80%|███████▉  | 4617/5800 [12:56:49<2:27:46,  7.49s/it]score1 tensor([[0.6172],
-        [0.5625],
-        [0.3828],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.5742, 0.3750, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:26,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:43:26,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2154.99 | bwd_microstep: 4615.98 | bwd_inner_microstep: 4610.46 | bwd_allreduce_microstep: 5.43 | step_microstep: 45.33
-[2025-01-25 21:43:26,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2154.95 | bwd: 4616.00 | bwd_inner: 4610.46 | bwd_allreduce: 5.47 | step: 45.34
- 80%|███████▉  | 4618/5800 [12:56:56<2:24:06,  7.32s/it]                                                        {'loss': 0.0098, 'grad_norm': 4.932650566101074, 'learning_rate': 4.200596782014499e-06, 'epoch': 39.81}
- 80%|███████▉  | 4618/5800 [12:56:56<2:24:06,  7.32s/it]score1 tensor([[0.5391],
-        [0.4746],
-        [0.5508],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.4941, 0.5508, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:33,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:43:33,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.30 | bwd_microstep: 4578.75 | bwd_inner_microstep: 4573.77 | bwd_allreduce_microstep: 4.87 | step_microstep: 43.42
-[2025-01-25 21:43:33,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.27 | bwd: 4578.77 | bwd_inner: 4573.77 | bwd_allreduce: 4.92 | step: 43.43
- 80%|███████▉  | 4619/5800 [12:57:03<2:21:13,  7.17s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.147365570068359, 'learning_rate': 4.19375156438969e-06, 'epoch': 39.82}
- 80%|███████▉  | 4619/5800 [12:57:03<2:21:13,  7.17s/it]score1 tensor([[0.6406],
-        [0.4414],
-        [0.4609],
-        [0.3379]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6484, 0.4395, 0.4629, 0.3457], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:39,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.37
-[2025-01-25 21:43:39,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.31 | bwd_microstep: 4621.88 | bwd_inner_microstep: 4616.90 | bwd_allreduce_microstep: 4.89 | step_microstep: 41.55
-[2025-01-25 21:43:39,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4621.91 | bwd_inner: 4616.90 | bwd_allreduce: 4.93 | step: 41.56
- 80%|███████▉  | 4620/5800 [12:57:09<2:19:27,  7.09s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.042167663574219, 'learning_rate': 4.18691127542741e-06, 'epoch': 39.83}
- 80%|███████▉  | 4620/5800 [12:57:09<2:19:27,  7.09s/it]score1 tensor([[0.3965],
-        [0.4473],
-        [0.4707],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.4492, 0.4629, 0.4375], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:46,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:43:46,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.17 | bwd_microstep: 4629.12 | bwd_inner_microstep: 4623.94 | bwd_allreduce_microstep: 5.07 | step_microstep: 43.65
-[2025-01-25 21:43:46,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.14 | bwd: 4629.15 | bwd_inner: 4623.94 | bwd_allreduce: 5.12 | step: 43.66
- 80%|███████▉  | 4621/5800 [12:57:16<2:18:13,  7.03s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.41298428177833557, 'learning_rate': 4.180075917260568e-06, 'epoch': 39.84}
- 80%|███████▉  | 4621/5800 [12:57:16<2:18:13,  7.03s/it]score1 tensor([[0.4883],
-        [0.3438],
-        [0.3750],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4902, 0.3398, 0.3809, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:43:53,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 21:43:53,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.98 | bwd_microstep: 4616.06 | bwd_inner_microstep: 4610.90 | bwd_allreduce_microstep: 5.04 | step_microstep: 46.23
-[2025-01-25 21:43:53,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.93 | bwd: 4616.08 | bwd_inner: 4610.90 | bwd_allreduce: 5.10 | step: 46.23
- 80%|███████▉  | 4622/5800 [12:57:23<2:17:18,  6.99s/it]                                                        {'loss': 0.0049, 'grad_norm': 3.952224016189575, 'learning_rate': 4.173245492020563e-06, 'epoch': 39.84}
- 80%|███████▉  | 4622/5800 [12:57:23<2:17:18,  6.99s/it]score1 tensor([[0.4590],
-        [0.3887],
-        [0.4531],
-        [0.4648]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.3945, 0.4609, 0.4668], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:00,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 21:44:00,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.50 | bwd_microstep: 4611.91 | bwd_inner_microstep: 4606.74 | bwd_allreduce_microstep: 5.07 | step_microstep: 46.02
-[2025-01-25 21:44:00,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.46 | bwd: 4611.94 | bwd_inner: 4606.75 | bwd_allreduce: 5.12 | step: 46.03
- 80%|███████▉  | 4623/5800 [12:57:30<2:16:37,  6.96s/it]                                                        {'loss': 0.0063, 'grad_norm': 7.478607654571533, 'learning_rate': 4.166420001837226e-06, 'epoch': 39.85}
- 80%|███████▉  | 4623/5800 [12:57:30<2:16:37,  6.96s/it]score1 tensor([[0.4473],
-        [0.5117],
-        [0.5039],
-        [0.4785]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4434, 0.5156, 0.5117, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:07,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.37
-[2025-01-25 21:44:07,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.74 | bwd_microstep: 4616.91 | bwd_inner_microstep: 4611.94 | bwd_allreduce_microstep: 4.89 | step_microstep: 44.34
-[2025-01-25 21:44:07,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.70 | bwd: 4616.93 | bwd_inner: 4611.94 | bwd_allreduce: 4.92 | step: 44.35
- 80%|███████▉  | 4624/5800 [12:57:37<2:16:04,  6.94s/it]                                                        {'loss': 0.0044, 'grad_norm': 4.15604829788208, 'learning_rate': 4.159599448838874e-06, 'epoch': 39.86}
- 80%|███████▉  | 4624/5800 [12:57:37<2:16:04,  6.94s/it]score1 tensor([[0.4199],
-        [0.5820],
-        [0.4199],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.5742, 0.4180, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:14,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:44:14,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.89 | bwd_microstep: 4623.33 | bwd_inner_microstep: 4618.26 | bwd_allreduce_microstep: 4.96 | step_microstep: 48.46
-[2025-01-25 21:44:14,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.87 | bwd: 4623.36 | bwd_inner: 4618.26 | bwd_allreduce: 5.02 | step: 48.47
- 80%|███████▉  | 4625/5800 [12:57:44<2:15:45,  6.93s/it]                                                        {'loss': 0.0049, 'grad_norm': 0.42628684639930725, 'learning_rate': 4.152783835152263e-06, 'epoch': 39.87}
- 80%|███████▉  | 4625/5800 [12:57:44<2:15:45,  6.93s/it]score1 tensor([[0.5000],
-        [0.4551],
-        [0.6016],
-        [0.5352]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.4492, 0.6016, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:21,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:44:21,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.06 | bwd_microstep: 4578.51 | bwd_inner_microstep: 4573.88 | bwd_allreduce_microstep: 4.56 | step_microstep: 48.02
-[2025-01-25 21:44:21,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.03 | bwd: 4578.53 | bwd_inner: 4573.88 | bwd_allreduce: 4.59 | step: 48.03
- 80%|███████▉  | 4626/5800 [12:57:51<2:15:15,  6.91s/it]                                                        {'loss': 0.0044, 'grad_norm': 2.0308687686920166, 'learning_rate': 4.145973162902626e-06, 'epoch': 39.88}
- 80%|███████▉  | 4626/5800 [12:57:51<2:15:15,  6.91s/it]score1 tensor([[0.4004],
-        [0.7109],
-        [0.4355],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3926, 0.7031, 0.4258, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:28,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 21:44:28,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.20 | bwd_microstep: 4617.47 | bwd_inner_microstep: 4612.59 | bwd_allreduce_microstep: 4.76 | step_microstep: 42.89
-[2025-01-25 21:44:28,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.16 | bwd: 4617.49 | bwd_inner: 4612.59 | bwd_allreduce: 4.82 | step: 42.89
- 80%|███████▉  | 4627/5800 [12:57:58<2:15:00,  6.91s/it]                                                        {'loss': 0.0093, 'grad_norm': 8.155844688415527, 'learning_rate': 4.139167434213651e-06, 'epoch': 39.89}
- 80%|███████▉  | 4627/5800 [12:57:58<2:15:00,  6.91s/it]score1 tensor([[0.3164],
-        [0.4922],
-        [0.4453],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3086, 0.4941, 0.4492, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:35,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 21:44:35,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.17 | bwd_microstep: 4624.73 | bwd_inner_microstep: 4620.09 | bwd_allreduce_microstep: 4.56 | step_microstep: 44.17
-[2025-01-25 21:44:35,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.14 | bwd: 4624.76 | bwd_inner: 4620.09 | bwd_allreduce: 4.60 | step: 44.18
- 80%|███████▉  | 4628/5800 [12:58:05<2:14:52,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.204542636871338, 'learning_rate': 4.132366651207482e-06, 'epoch': 39.9}
- 80%|███████▉  | 4628/5800 [12:58:05<2:14:52,  6.91s/it]score1 tensor([[0.5039],
-        [0.5469],
-        [0.4844],
-        [0.3789]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.5508, 0.4922, 0.3789], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:41,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.36
-[2025-01-25 21:44:41,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.61 | bwd_microstep: 4538.79 | bwd_inner_microstep: 4533.18 | bwd_allreduce_microstep: 5.49 | step_microstep: 43.69
-[2025-01-25 21:44:41,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.58 | bwd: 4538.82 | bwd_inner: 4533.19 | bwd_allreduce: 5.55 | step: 43.70
- 80%|███████▉  | 4629/5800 [12:58:11<2:14:14,  6.88s/it]                                                        {'loss': 0.0029, 'grad_norm': 4.070145130157471, 'learning_rate': 4.125570816004711e-06, 'epoch': 39.91}
- 80%|███████▉  | 4629/5800 [12:58:11<2:14:14,  6.88s/it]score1 tensor([[0.5938],
-        [0.4531],
-        [0.4824],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6016, 0.4492, 0.4844, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:48,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 21:44:48,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.62 | bwd_microstep: 4622.89 | bwd_inner_microstep: 4617.53 | bwd_allreduce_microstep: 5.24 | step_microstep: 41.93
-[2025-01-25 21:44:48,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.59 | bwd: 4622.92 | bwd_inner: 4617.53 | bwd_allreduce: 5.31 | step: 41.94
- 80%|███████▉  | 4630/5800 [12:58:18<2:14:13,  6.88s/it]                                                        {'loss': 0.0054, 'grad_norm': 0.43478456139564514, 'learning_rate': 4.118779930724412e-06, 'epoch': 39.91}
- 80%|███████▉  | 4630/5800 [12:58:18<2:14:13,  6.88s/it]score1 tensor([[0.4941],
-        [0.5938],
-        [0.4961],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5898, 0.4961, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:44:55,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 21:44:55,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.75 | bwd_microstep: 4584.42 | bwd_inner_microstep: 4579.45 | bwd_allreduce_microstep: 4.88 | step_microstep: 43.60
-[2025-01-25 21:44:55,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.70 | bwd: 4584.45 | bwd_inner: 4579.45 | bwd_allreduce: 4.92 | step: 43.60
- 80%|███████▉  | 4631/5800 [12:58:25<2:14:02,  6.88s/it]                                                        {'loss': 0.002, 'grad_norm': 1.715922474861145, 'learning_rate': 4.111993997484089e-06, 'epoch': 39.92}
- 80%|███████▉  | 4631/5800 [12:58:25<2:14:02,  6.88s/it]score1 tensor([[0.4473],
-        [0.4199],
-        [0.5078],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4180, 0.4941, 0.5547], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:02,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 21:45:02,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.56 | bwd_microstep: 4570.22 | bwd_inner_microstep: 4562.37 | bwd_allreduce_microstep: 7.75 | step_microstep: 42.37
-[2025-01-25 21:45:02,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.51 | bwd: 4570.25 | bwd_inner: 4562.37 | bwd_allreduce: 7.80 | step: 42.37
- 80%|███████▉  | 4632/5800 [12:58:32<2:13:43,  6.87s/it]                                                        {'loss': 0.0059, 'grad_norm': 1.7764832973480225, 'learning_rate': 4.105213018399723e-06, 'epoch': 39.93}
- 80%|███████▉  | 4632/5800 [12:58:32<2:13:43,  6.87s/it]score1 tensor([[0.4121],
-        [0.5078],
-        [0.3984],
-        [0.4199]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.4980, 0.3906, 0.4082], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:09,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.55 | optimizer_step: 4.37
-[2025-01-25 21:45:09,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.63 | bwd_microstep: 4617.56 | bwd_inner_microstep: 4613.14 | bwd_allreduce_microstep: 4.34 | step_microstep: 42.27
-[2025-01-25 21:45:09,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.60 | bwd: 4617.58 | bwd_inner: 4613.14 | bwd_allreduce: 4.38 | step: 42.28
- 80%|███████▉  | 4633/5800 [12:58:39<2:13:44,  6.88s/it]                                                        {'loss': 0.0088, 'grad_norm': 7.362080097198486, 'learning_rate': 4.098436995585735e-06, 'epoch': 39.94}
- 80%|███████▉  | 4633/5800 [12:58:39<2:13:44,  6.88s/it]score1 tensor([[0.4688],
-        [0.6211],
-        [0.4531],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4746, 0.6172, 0.4453, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:16,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.95 | optimizer_step: 4.36
-[2025-01-25 21:45:16,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.77 | bwd_microstep: 4626.74 | bwd_inner_microstep: 4620.74 | bwd_allreduce_microstep: 5.89 | step_microstep: 48.87
-[2025-01-25 21:45:16,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.75 | bwd: 4626.77 | bwd_inner: 4620.74 | bwd_allreduce: 5.95 | step: 48.89
- 80%|███████▉  | 4634/5800 [12:58:46<2:13:49,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.446817398071289, 'learning_rate': 4.091665931155005e-06, 'epoch': 39.95}
- 80%|███████▉  | 4634/5800 [12:58:46<2:13:49,  6.89s/it]score1 tensor([[0.4316],
-        [0.5117],
-        [0.5430],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4199, 0.5273, 0.5391, 0.5938], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:23,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 21:45:23,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.25 | bwd_microstep: 4586.04 | bwd_inner_microstep: 4580.30 | bwd_allreduce_microstep: 5.62 | step_microstep: 46.32
-[2025-01-25 21:45:23,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.21 | bwd: 4586.06 | bwd_inner: 4580.30 | bwd_allreduce: 5.69 | step: 46.33
- 80%|███████▉  | 4635/5800 [12:58:53<2:13:39,  6.88s/it]                                                        {'loss': 0.0078, 'grad_norm': 1.9715771675109863, 'learning_rate': 4.084899827218876e-06, 'epoch': 39.96}
- 80%|███████▉  | 4635/5800 [12:58:53<2:13:39,  6.88s/it]score1 tensor([[0.5469],
-        [0.6523],
-        [0.5547],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6641, 0.5703, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:30,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 21:45:30,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.17 | bwd_microstep: 4624.97 | bwd_inner_microstep: 4620.60 | bwd_allreduce_microstep: 4.30 | step_microstep: 42.12
-[2025-01-25 21:45:30,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.14 | bwd: 4624.99 | bwd_inner: 4620.60 | bwd_allreduce: 4.33 | step: 42.13
- 80%|███████▉  | 4636/5800 [12:59:00<2:13:38,  6.89s/it]                                                        {'loss': 0.0112, 'grad_norm': 4.610647201538086, 'learning_rate': 4.078138685887125e-06, 'epoch': 39.97}
- 80%|███████▉  | 4636/5800 [12:59:00<2:13:38,  6.89s/it]score1 tensor([[0.6211],
-        [0.4375],
-        [0.5273],
-        [0.4180]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.4316, 0.5234, 0.4121], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:37,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 21:45:37,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.90 | bwd_microstep: 4626.96 | bwd_inner_microstep: 4621.84 | bwd_allreduce_microstep: 5.02 | step_microstep: 48.54
-[2025-01-25 21:45:37,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.87 | bwd: 4626.98 | bwd_inner: 4621.84 | bwd_allreduce: 5.07 | step: 48.55
- 80%|███████▉  | 4637/5800 [12:59:07<2:13:38,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 8.001194953918457, 'learning_rate': 4.071382509268005e-06, 'epoch': 39.97}
- 80%|███████▉  | 4637/5800 [12:59:07<2:13:38,  6.89s/it]score1 tensor([[0.4219],
-        [0.5430],
-        [0.4961],
-        [0.4922]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.5391, 0.5000, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:43,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 21:45:43,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.54 | bwd_microstep: 4619.76 | bwd_inner_microstep: 4614.60 | bwd_allreduce_microstep: 5.06 | step_microstep: 45.60
-[2025-01-25 21:45:43,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.50 | bwd: 4619.79 | bwd_inner: 4614.60 | bwd_allreduce: 5.11 | step: 45.61
- 80%|███████▉  | 4638/5800 [12:59:13<2:13:35,  6.90s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.924823522567749, 'learning_rate': 4.0646312994682e-06, 'epoch': 39.98}
- 80%|███████▉  | 4638/5800 [12:59:13<2:13:35,  6.90s/it]score1 tensor([[0.6484],
-        [0.4980],
-        [0.3945],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4922, 0.3867, 0.4004], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:50,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:45:50,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.65 | bwd_microstep: 4619.41 | bwd_inner_microstep: 4614.73 | bwd_allreduce_microstep: 4.58 | step_microstep: 42.68
-[2025-01-25 21:45:50,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.61 | bwd: 4619.44 | bwd_inner: 4614.73 | bwd_allreduce: 4.63 | step: 42.69
- 80%|███████▉  | 4639/5800 [12:59:20<2:13:27,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 4.350193977355957, 'learning_rate': 4.057885058592865e-06, 'epoch': 39.99}
- 80%|███████▉  | 4639/5800 [12:59:20<2:13:27,  6.90s/it][93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-score1 tensor([[0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:45:55,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.59 | optimizer_step: 4.37
-[2025-01-25 21:45:55,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 572.14 | bwd_microstep: 1219.42 | bwd_inner_microstep: 1215.05 | bwd_allreduce_microstep: 4.30 | step_microstep: 41.61
-[2025-01-25 21:45:55,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 572.11 | bwd: 1219.44 | bwd_inner: 1215.05 | bwd_allreduce: 4.34 | step: 41.62
- 80%|████████  | 4640/5800 [12:59:25<1:58:01,  6.10s/it]                                                        {'loss': 0.0098, 'grad_norm': 7.536888122558594, 'learning_rate': 4.051143788745588e-06, 'epoch': 40.0}
- 80%|████████  | 4640/5800 [12:59:25<1:58:01,  6.10s/it][2025-01-25 21:45:59,529] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:46:09,515] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:46:19,528] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-[2025-01-25 21:46:30,541] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:49: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
-  def forward(ctx, input, weight, bias=None):
-/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/deepspeed/runtime/zero/linear.py:67: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
-  def backward(ctx, grad_output):
-score1 tensor([[0.4668],
-        [0.4941],
-        [0.5703],
-        [0.4141]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4727, 0.4961, 0.5781, 0.4219], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:46:53,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.32 | optimizer_step: 4.37
-[2025-01-25 21:46:53,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2136.35 | bwd_microstep: 4598.82 | bwd_inner_microstep: 4593.53 | bwd_allreduce_microstep: 5.17 | step_microstep: 49.78
-[2025-01-25 21:46:53,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2136.33 | bwd: 4598.84 | bwd_inner: 4593.52 | bwd_allreduce: 5.24 | step: 49.79
- 80%|████████  | 4641/5800 [13:00:23<7:01:06, 21.80s/it]                                                        {'loss': 0.0059, 'grad_norm': 7.894054889678955, 'learning_rate': 4.044407492028408e-06, 'epoch': 40.01}
- 80%|████████  | 4641/5800 [13:00:23<7:01:06, 21.80s/it]score1 tensor([[0.4570],
-        [0.5859],
-        [0.4941],
-        [0.6953]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6016, 0.5156, 0.6953], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:00,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.48 | optimizer_step: 4.37
-[2025-01-25 21:47:00,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2127.00 | bwd_microstep: 4539.10 | bwd_inner_microstep: 4533.91 | bwd_allreduce_microstep: 5.10 | step_microstep: 42.12
-[2025-01-25 21:47:00,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2126.96 | bwd: 4539.13 | bwd_inner: 4533.91 | bwd_allreduce: 5.15 | step: 42.12
- 80%|████████  | 4642/5800 [13:00:30<5:33:47, 17.29s/it]                                                        {'loss': 0.0112, 'grad_norm': 6.130862712860107, 'learning_rate': 4.037676170541831e-06, 'epoch': 40.02}
- 80%|████████  | 4642/5800 [13:00:30<5:33:47, 17.29s/it]score1 tensor([[0.5391],
-        [0.5430],
-        [0.5586],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.5391, 0.5625, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:07,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.79 | optimizer_step: 4.36
-[2025-01-25 21:47:07,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2131.79 | bwd_microstep: 4586.17 | bwd_inner_microstep: 4581.55 | bwd_allreduce_microstep: 4.54 | step_microstep: 48.70
-[2025-01-25 21:47:07,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2131.77 | bwd: 4586.21 | bwd_inner: 4581.55 | bwd_allreduce: 4.58 | step: 48.70
- 80%|████████  | 4643/5800 [13:00:37<4:33:01, 14.16s/it]                                                        {'loss': 0.0034, 'grad_norm': 3.931161403656006, 'learning_rate': 4.030949826384791e-06, 'epoch': 40.03}
- 80%|████████  | 4643/5800 [13:00:37<4:33:01, 14.16s/it]score1 tensor([[0.5273],
-        [0.6641],
-        [0.4219],
-        [0.7109]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.6445, 0.4199, 0.7070], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:14,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.46 | optimizer_step: 4.37
-[2025-01-25 21:47:14,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2138.16 | bwd_microstep: 4591.79 | bwd_inner_microstep: 4587.26 | bwd_allreduce_microstep: 4.43 | step_microstep: 40.67
-[2025-01-25 21:47:14,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2138.12 | bwd: 4591.81 | bwd_inner: 4587.26 | bwd_allreduce: 4.48 | step: 40.67
- 80%|████████  | 4644/5800 [13:00:43<3:50:32, 11.97s/it]                                                        {'loss': 0.0093, 'grad_norm': 4.7124528884887695, 'learning_rate': 4.024228461654685e-06, 'epoch': 40.03}
- 80%|████████  | 4644/5800 [13:00:43<3:50:32, 11.97s/it]score1 tensor([[0.6172],
-        [0.4961],
-        [0.4414],
-        [0.4688]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6289, 0.5117, 0.4355, 0.4785], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:20,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.22 | optimizer_step: 4.37
-[2025-01-25 21:47:20,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2142.45 | bwd_microstep: 4596.06 | bwd_inner_microstep: 4591.98 | bwd_allreduce_microstep: 4.02 | step_microstep: 33.18
-[2025-01-25 21:47:20,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2142.42 | bwd: 4596.08 | bwd_inner: 4591.98 | bwd_allreduce: 4.05 | step: 33.19
- 80%|████████  | 4645/5800 [13:00:50<3:20:43, 10.43s/it]                                                        {'loss': 0.0107, 'grad_norm': 4.339723587036133, 'learning_rate': 4.0175120784473495e-06, 'epoch': 40.04}
- 80%|████████  | 4645/5800 [13:00:50<3:20:43, 10.43s/it]score1 tensor([[0.5078],
-        [0.4922],
-        [0.4805],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4922, 0.4844, 0.5586], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:27,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.36
-[2025-01-25 21:47:27,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2144.31 | bwd_microstep: 4515.25 | bwd_inner_microstep: 4510.27 | bwd_allreduce_microstep: 4.90 | step_microstep: 40.83
-[2025-01-25 21:47:27,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2144.29 | bwd: 4515.28 | bwd_inner: 4510.27 | bwd_allreduce: 4.94 | step: 40.84
- 80%|████████  | 4646/5800 [13:00:57<2:59:30,  9.33s/it]                                                        {'loss': 0.002, 'grad_norm': 0.38539135456085205, 'learning_rate': 4.0108006788570634e-06, 'epoch': 40.05}
- 80%|████████  | 4646/5800 [13:00:57<2:59:30,  9.33s/it]score1 tensor([[0.4316],
-        [0.4238],
-        [0.3535],
-        [0.4590]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4258, 0.4238, 0.3438, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:34,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.18 | optimizer_step: 4.36
-[2025-01-25 21:47:34,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.26 | bwd_microstep: 4550.63 | bwd_inner_microstep: 4546.33 | bwd_allreduce_microstep: 4.24 | step_microstep: 32.77
-[2025-01-25 21:47:34,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.24 | bwd: 4550.64 | bwd_inner: 4546.33 | bwd_allreduce: 4.27 | step: 32.78
- 80%|████████  | 4647/5800 [13:01:04<2:44:42,  8.57s/it]                                                        {'loss': 0.0049, 'grad_norm': 5.41064453125, 'learning_rate': 4.004094264976568e-06, 'epoch': 40.06}
- 80%|████████  | 4647/5800 [13:01:04<2:44:42,  8.57s/it]score1 tensor([[0.5195],
-        [0.5898],
-        [0.4551],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5938, 0.4570, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:41,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.24 | optimizer_step: 4.37
-[2025-01-25 21:47:41,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2145.55 | bwd_microstep: 4546.73 | bwd_inner_microstep: 4542.61 | bwd_allreduce_microstep: 4.05 | step_microstep: 32.55
-[2025-01-25 21:47:41,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2145.53 | bwd: 4546.75 | bwd_inner: 4542.61 | bwd_allreduce: 4.08 | step: 32.56
- 80%|████████  | 4648/5800 [13:01:11<2:34:15,  8.03s/it]                                                        {'loss': 0.0024, 'grad_norm': 6.267258644104004, 'learning_rate': 3.997392838897027e-06, 'epoch': 40.07}
- 80%|████████  | 4648/5800 [13:01:11<2:34:15,  8.03s/it]score1 tensor([[0.5195],
-        [0.4453],
-        [0.4980],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.4453, 0.4863, 0.4570], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:47,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.45 | optimizer_step: 4.37
-[2025-01-25 21:47:47,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2146.13 | bwd_microstep: 4553.26 | bwd_inner_microstep: 4549.12 | bwd_allreduce_microstep: 4.08 | step_microstep: 40.11
-[2025-01-25 21:47:48,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2146.11 | bwd: 4553.28 | bwd_inner: 4549.12 | bwd_allreduce: 4.11 | step: 40.11
- 80%|████████  | 4649/5800 [13:01:17<2:27:08,  7.67s/it]                                                        {'loss': 0.0068, 'grad_norm': 5.9089884757995605, 'learning_rate': 3.990696402708074e-06, 'epoch': 40.08}
- 80%|████████  | 4649/5800 [13:01:17<2:27:08,  7.67s/it]score1 tensor([[0.5156],
-        [0.5195],
-        [0.6367],
-        [0.4707]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5078, 0.5195, 0.6250, 0.4629], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:47:54,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 21:47:54,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2152.17 | bwd_microstep: 4563.72 | bwd_inner_microstep: 4559.19 | bwd_allreduce_microstep: 4.45 | step_microstep: 42.86
-[2025-01-25 21:47:54,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2152.14 | bwd: 4563.74 | bwd_inner: 4559.20 | bwd_allreduce: 4.49 | step: 42.87
- 80%|████████  | 4650/5800 [13:01:24<2:22:12,  7.42s/it]                                                        {'loss': 0.0068, 'grad_norm': 6.323869228363037, 'learning_rate': 3.984004958497765e-06, 'epoch': 40.09}
- 80%|████████  | 4650/5800 [13:01:24<2:22:12,  7.42s/it]score1 tensor([[0.4688],
-        [0.5742],
-        [0.6445],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.5664, 0.6445, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:01,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 21:48:01,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.11 | bwd_microstep: 4536.25 | bwd_inner_microstep: 4530.39 | bwd_allreduce_microstep: 5.74 | step_microstep: 43.16
-[2025-01-25 21:48:01,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.07 | bwd: 4536.27 | bwd_inner: 4530.39 | bwd_allreduce: 5.81 | step: 43.17
- 80%|████████  | 4651/5800 [13:01:31<2:18:37,  7.24s/it]                                                        {'loss': 0.0029, 'grad_norm': 4.100780010223389, 'learning_rate': 3.9773185083526146e-06, 'epoch': 40.09}
- 80%|████████  | 4651/5800 [13:01:31<2:18:37,  7.24s/it]score1 tensor([[0.5391],
-        [0.4590],
-        [0.3262],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.4551, 0.3223, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:08,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.12 | optimizer_step: 4.37
-[2025-01-25 21:48:08,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.35 | bwd_microstep: 4567.11 | bwd_inner_microstep: 4561.82 | bwd_allreduce_microstep: 5.16 | step_microstep: 46.14
-[2025-01-25 21:48:08,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.31 | bwd: 4567.14 | bwd_inner: 4561.82 | bwd_allreduce: 5.23 | step: 46.15
- 80%|████████  | 4652/5800 [13:01:38<2:16:17,  7.12s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.4835541248321533, 'learning_rate': 3.970637054357571e-06, 'epoch': 40.1}
- 80%|████████  | 4652/5800 [13:01:38<2:16:17,  7.12s/it]score1 tensor([[0.5508],
-        [0.5078],
-        [0.5273],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.5273, 0.5391, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0103, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:15,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 21:48:15,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.96 | bwd_microstep: 4561.54 | bwd_inner_microstep: 4556.81 | bwd_allreduce_microstep: 4.64 | step_microstep: 44.03
-[2025-01-25 21:48:15,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.92 | bwd: 4561.57 | bwd_inner: 4556.81 | bwd_allreduce: 4.68 | step: 44.04
- 80%|████████  | 4653/5800 [13:01:45<2:14:38,  7.04s/it]                                                        {'loss': 0.0103, 'grad_norm': 2.122035026550293, 'learning_rate': 3.963960598596024e-06, 'epoch': 40.11}
- 80%|████████  | 4653/5800 [13:01:45<2:14:38,  7.04s/it]score1 tensor([[0.4707],
-        [0.4219],
-        [0.4844],
-        [0.3965]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4570, 0.4336, 0.4980, 0.4023], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0112, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:22,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.53 | optimizer_step: 4.37
-[2025-01-25 21:48:22,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2153.29 | bwd_microstep: 4614.13 | bwd_inner_microstep: 4609.09 | bwd_allreduce_microstep: 4.93 | step_microstep: 46.23
-[2025-01-25 21:48:22,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2153.25 | bwd: 4614.15 | bwd_inner: 4609.09 | bwd_allreduce: 4.98 | step: 46.23
- 80%|████████  | 4654/5800 [13:01:52<2:13:44,  7.00s/it]                                                        {'loss': 0.0112, 'grad_norm': 3.6047844886779785, 'learning_rate': 3.957289143149816e-06, 'epoch': 40.12}
- 80%|████████  | 4654/5800 [13:01:52<2:13:44,  7.00s/it]score1 tensor([[0.3730],
-        [0.5508],
-        [0.4629],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3730, 0.5391, 0.4629, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:29,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 21:48:29,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2155.20 | bwd_microstep: 4528.92 | bwd_inner_microstep: 4523.74 | bwd_allreduce_microstep: 5.10 | step_microstep: 52.06
-[2025-01-25 21:48:29,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2155.16 | bwd: 4528.94 | bwd_inner: 4523.74 | bwd_allreduce: 5.14 | step: 52.07
- 80%|████████  | 4655/5800 [13:01:59<2:12:35,  6.95s/it]                                                        {'loss': 0.0039, 'grad_norm': 0.3193601667881012, 'learning_rate': 3.950622690099213e-06, 'epoch': 40.13}
- 80%|████████  | 4655/5800 [13:01:59<2:12:35,  6.95s/it]score1 tensor([[0.5938],
-        [0.4707],
-        [0.3105],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.4844, 0.3086, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0137, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:36,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 21:48:36,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.80 | bwd_microstep: 4611.69 | bwd_inner_microstep: 4606.37 | bwd_allreduce_microstep: 5.21 | step_microstep: 44.78
-[2025-01-25 21:48:36,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.74 | bwd: 4611.71 | bwd_inner: 4606.37 | bwd_allreduce: 5.26 | step: 44.78
- 80%|████████  | 4656/5800 [13:02:05<2:12:13,  6.93s/it]                                                        {'loss': 0.0137, 'grad_norm': 4.785782337188721, 'learning_rate': 3.943961241522942e-06, 'epoch': 40.14}
- 80%|████████  | 4656/5800 [13:02:05<2:12:13,  6.93s/it]score1 tensor([[0.4980],
-        [0.6445],
-        [0.5195],
-        [0.4980]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5039, 0.6641, 0.5000, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:42,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.83 | optimizer_step: 4.36
-[2025-01-25 21:48:42,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.61 | bwd_microstep: 4612.16 | bwd_inner_microstep: 4607.30 | bwd_allreduce_microstep: 4.76 | step_microstep: 45.00
-[2025-01-25 21:48:42,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.58 | bwd: 4612.19 | bwd_inner: 4607.30 | bwd_allreduce: 4.81 | step: 45.01
- 80%|████████  | 4657/5800 [13:02:12<2:11:51,  6.92s/it]                                                        {'loss': 0.0127, 'grad_norm': 4.289093494415283, 'learning_rate': 3.937304799498147e-06, 'epoch': 40.15}
- 80%|████████  | 4657/5800 [13:02:12<2:11:51,  6.92s/it]score1 tensor([[0.5938],
-        [0.5625],
-        [0.6641],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.5781, 0.6875, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0117, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:49,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 21:48:49,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.36 | bwd_microstep: 4614.03 | bwd_inner_microstep: 4609.34 | bwd_allreduce_microstep: 4.59 | step_microstep: 46.30
-[2025-01-25 21:48:49,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.32 | bwd: 4614.06 | bwd_inner: 4609.34 | bwd_allreduce: 4.65 | step: 46.31
- 80%|████████  | 4658/5800 [13:02:19<2:11:36,  6.91s/it]                                                        {'loss': 0.0117, 'grad_norm': 4.304685592651367, 'learning_rate': 3.930653366100425e-06, 'epoch': 40.16}
- 80%|████████  | 4658/5800 [13:02:19<2:11:36,  6.91s/it]score1 tensor([[0.3438],
-        [0.5312],
-        [0.5391],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3477, 0.5430, 0.5469, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:48:56,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.85 | optimizer_step: 4.37
-[2025-01-25 21:48:56,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.07 | bwd_microstep: 4612.50 | bwd_inner_microstep: 4607.43 | bwd_allreduce_microstep: 4.98 | step_microstep: 43.71
-[2025-01-25 21:48:56,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.04 | bwd: 4612.53 | bwd_inner: 4607.43 | bwd_allreduce: 5.03 | step: 43.72
- 80%|████████  | 4659/5800 [13:02:26<2:11:29,  6.91s/it]                                                        {'loss': 0.0068, 'grad_norm': 3.931917428970337, 'learning_rate': 3.92400694340382e-06, 'epoch': 40.16}
- 80%|████████  | 4659/5800 [13:02:26<2:11:29,  6.91s/it]score1 tensor([[0.4141],
-        [0.4512],
-        [0.4844],
-        [0.4902]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4492, 0.4922, 0.4902], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:03,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 21:49:03,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2158.56 | bwd_microstep: 4577.73 | bwd_inner_microstep: 4572.69 | bwd_allreduce_microstep: 4.92 | step_microstep: 42.91
-[2025-01-25 21:49:03,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2158.47 | bwd: 4577.76 | bwd_inner: 4572.69 | bwd_allreduce: 4.99 | step: 42.92
- 80%|████████  | 4660/5800 [13:02:33<2:11:02,  6.90s/it]                                                        {'loss': 0.0034, 'grad_norm': 1.914523720741272, 'learning_rate': 3.917365533480786e-06, 'epoch': 40.17}
- 80%|████████  | 4660/5800 [13:02:33<2:11:02,  6.90s/it]score1 tensor([[0.5508],
-        [0.3770],
-        [0.4805],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.3789, 0.4941, 0.5703], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:10,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.54 | optimizer_step: 4.36
-[2025-01-25 21:49:10,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.90 | bwd_microstep: 4624.74 | bwd_inner_microstep: 4619.73 | bwd_allreduce_microstep: 4.93 | step_microstep: 45.63
-[2025-01-25 21:49:10,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.87 | bwd: 4624.77 | bwd_inner: 4619.73 | bwd_allreduce: 4.97 | step: 45.64
- 80%|████████  | 4661/5800 [13:02:40<2:11:00,  6.90s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.9151231646537781, 'learning_rate': 3.910729138402241e-06, 'epoch': 40.18}
- 80%|████████  | 4661/5800 [13:02:40<2:11:00,  6.90s/it]score1 tensor([[0.5781],
-        [0.5000],
-        [0.5586],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5742, 0.4844, 0.5586, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:17,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:49:17,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2157.95 | bwd_microstep: 4539.88 | bwd_inner_microstep: 4534.78 | bwd_allreduce_microstep: 5.00 | step_microstep: 42.78
-[2025-01-25 21:49:17,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2157.91 | bwd: 4539.90 | bwd_inner: 4534.78 | bwd_allreduce: 5.05 | step: 42.79
- 80%|████████  | 4662/5800 [13:02:47<2:10:27,  6.88s/it]                                                        {'loss': 0.0049, 'grad_norm': 4.182605266571045, 'learning_rate': 3.90409776023752e-06, 'epoch': 40.19}
- 80%|████████  | 4662/5800 [13:02:47<2:10:27,  6.88s/it]score1 tensor([[0.5039],
-        [0.5312],
-        [0.4180],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.5273, 0.4180, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:24,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.36
-[2025-01-25 21:49:24,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2156.89 | bwd_microstep: 4578.12 | bwd_inner_microstep: 4573.06 | bwd_allreduce_microstep: 4.96 | step_microstep: 47.18
-[2025-01-25 21:49:24,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2156.84 | bwd: 4578.14 | bwd_inner: 4573.06 | bwd_allreduce: 5.02 | step: 47.19
- 80%|████████  | 4663/5800 [13:02:54<2:10:13,  6.87s/it]                                                        {'loss': 0.0039, 'grad_norm': 5.989253997802734, 'learning_rate': 3.897471401054411e-06, 'epoch': 40.2}
- 80%|████████  | 4663/5800 [13:02:54<2:10:13,  6.87s/it]score1 tensor([[0.4023],
-        [0.6172],
-        [0.4727],
-        [0.6602]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3984, 0.6133, 0.4590, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:31,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.58 | optimizer_step: 4.36
-[2025-01-25 21:49:31,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2160.32 | bwd_microstep: 4639.41 | bwd_inner_microstep: 4634.24 | bwd_allreduce_microstep: 5.07 | step_microstep: 42.80
-[2025-01-25 21:49:31,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2160.28 | bwd: 4639.44 | bwd_inner: 4634.24 | bwd_allreduce: 5.12 | step: 42.81
- 80%|████████  | 4664/5800 [13:03:01<2:10:22,  6.89s/it]                                                        {'loss': 0.0083, 'grad_norm': 8.359221458435059, 'learning_rate': 3.89085006291912e-06, 'epoch': 40.21}
- 80%|████████  | 4664/5800 [13:03:01<2:10:22,  6.89s/it]score1 tensor([[0.4043],
-        [0.4395],
-        [0.5547],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.4414, 0.5586, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:37,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:49:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.93 | bwd_microstep: 4596.57 | bwd_inner_microstep: 4591.13 | bwd_allreduce_microstep: 5.33 | step_microstep: 47.20
-[2025-01-25 21:49:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4596.60 | bwd_inner: 4591.13 | bwd_allreduce: 5.39 | step: 47.20
- 80%|████████  | 4665/5800 [13:03:07<2:10:15,  6.89s/it]                                                        {'loss': 0.0024, 'grad_norm': 5.728772163391113, 'learning_rate': 3.884233747896295e-06, 'epoch': 40.22}
- 80%|████████  | 4665/5800 [13:03:07<2:10:15,  6.89s/it]score1 tensor([[0.4180],
-        [0.4746],
-        [0.5039],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4180, 0.4727, 0.5000, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:44,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 21:49:44,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.38 | bwd_microstep: 4580.30 | bwd_inner_microstep: 4575.23 | bwd_allreduce_microstep: 4.90 | step_microstep: 44.26
-[2025-01-25 21:49:44,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.34 | bwd: 4580.33 | bwd_inner: 4575.23 | bwd_allreduce: 5.01 | step: 44.26
- 80%|████████  | 4666/5800 [13:03:14<2:10:04,  6.88s/it]                                                        {'loss': 0.0034, 'grad_norm': 6.1319499015808105, 'learning_rate': 3.8776224580490245e-06, 'epoch': 40.22}
- 80%|████████  | 4666/5800 [13:03:14<2:10:04,  6.88s/it]score1 tensor([[0.4258],
-        [0.4492],
-        [0.4727],
-        [0.4863]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.4355, 0.4688, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:51,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:49:51,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.66 | bwd_microstep: 4638.87 | bwd_inner_microstep: 4633.26 | bwd_allreduce_microstep: 5.49 | step_microstep: 45.16
-[2025-01-25 21:49:51,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.63 | bwd: 4638.90 | bwd_inner: 4633.26 | bwd_allreduce: 5.55 | step: 45.16
- 80%|████████  | 4667/5800 [13:03:21<2:10:13,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 3.9497103691101074, 'learning_rate': 3.8710161954388125e-06, 'epoch': 40.23}
- 80%|████████  | 4667/5800 [13:03:21<2:10:13,  6.90s/it]score1 tensor([[0.5039],
-        [0.4766],
-        [0.5703],
-        [0.4219]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5117, 0.4609, 0.5625, 0.4180], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:49:58,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.36
-[2025-01-25 21:49:58,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.40 | bwd_microstep: 4640.78 | bwd_inner_microstep: 4636.08 | bwd_allreduce_microstep: 4.61 | step_microstep: 42.13
-[2025-01-25 21:49:58,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.36 | bwd: 4640.80 | bwd_inner: 4636.09 | bwd_allreduce: 4.65 | step: 42.15
- 80%|████████  | 4668/5800 [13:03:28<2:10:19,  6.91s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.9307942390441895, 'learning_rate': 3.86441496212562e-06, 'epoch': 40.24}
- 80%|████████  | 4668/5800 [13:03:28<2:10:19,  6.91s/it]score1 tensor([[0.4980],
-        [0.6094],
-        [0.4453],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.6055, 0.4473, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:05,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.89 | optimizer_step: 4.37
-[2025-01-25 21:50:05,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.34 | bwd_microstep: 4633.03 | bwd_inner_microstep: 4627.94 | bwd_allreduce_microstep: 4.98 | step_microstep: 42.93
-[2025-01-25 21:50:05,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.30 | bwd: 4633.06 | bwd_inner: 4627.95 | bwd_allreduce: 5.03 | step: 42.94
- 80%|████████  | 4669/5800 [13:03:35<2:10:21,  6.92s/it]                                                        {'loss': 0.0034, 'grad_norm': 4.411696910858154, 'learning_rate': 3.857818760167813e-06, 'epoch': 40.25}
- 80%|████████  | 4669/5800 [13:03:35<2:10:21,  6.92s/it]score1 tensor([[0.4980],
-        [0.5195],
-        [0.4609],
-        [0.4395]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.5195, 0.4570, 0.4277], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:12,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.36
-[2025-01-25 21:50:12,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.26 | bwd_microstep: 4544.93 | bwd_inner_microstep: 4539.58 | bwd_allreduce_microstep: 5.27 | step_microstep: 47.21
-[2025-01-25 21:50:12,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.22 | bwd: 4544.96 | bwd_inner: 4539.58 | bwd_allreduce: 5.32 | step: 47.23
- 81%|████████  | 4670/5800 [13:03:42<2:09:48,  6.89s/it]                                                        {'loss': 0.0039, 'grad_norm': 3.7535643577575684, 'learning_rate': 3.851227591622213e-06, 'epoch': 40.26}
- 81%|████████  | 4670/5800 [13:03:42<2:09:48,  6.89s/it]score1 tensor([[0.3770],
-        [0.5469],
-        [0.5312],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.5547, 0.5430, 0.4746], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:19,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 21:50:19,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.15 | bwd_microstep: 4636.50 | bwd_inner_microstep: 4631.11 | bwd_allreduce_microstep: 5.30 | step_microstep: 45.15
-[2025-01-25 21:50:19,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.10 | bwd: 4636.54 | bwd_inner: 4631.11 | bwd_allreduce: 5.34 | step: 45.16
- 81%|████████  | 4671/5800 [13:03:49<2:09:54,  6.90s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.3966288566589355, 'learning_rate': 3.844641458544056e-06, 'epoch': 40.27}
- 81%|████████  | 4671/5800 [13:03:49<2:09:54,  6.90s/it]score1 tensor([[0.5664],
-        [0.4824],
-        [0.4316],
-        [0.4492]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.4941, 0.4258, 0.4453], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:26,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:50:26,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.19 | bwd_microstep: 4578.68 | bwd_inner_microstep: 4573.24 | bwd_allreduce_microstep: 5.30 | step_microstep: 42.70
-[2025-01-25 21:50:26,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.16 | bwd: 4578.72 | bwd_inner: 4573.25 | bwd_allreduce: 5.36 | step: 42.71
- 81%|████████  | 4672/5800 [13:03:56<2:09:36,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 1.8093788623809814, 'learning_rate': 3.838060362987006e-06, 'epoch': 40.28}
- 81%|████████  | 4672/5800 [13:03:56<2:09:36,  6.89s/it]score1 tensor([[0.6016],
-        [0.3691],
-        [0.4941],
-        [0.3086]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.3730, 0.4980, 0.3105], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:33,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.76 | optimizer_step: 4.36
-[2025-01-25 21:50:33,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.19 | bwd_microstep: 4645.19 | bwd_inner_microstep: 4639.83 | bwd_allreduce_microstep: 5.26 | step_microstep: 44.17
-[2025-01-25 21:50:33,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.15 | bwd: 4645.21 | bwd_inner: 4639.84 | bwd_allreduce: 5.31 | step: 44.18
- 81%|████████  | 4673/5800 [13:04:03<2:09:41,  6.90s/it]                                                        {'loss': 0.0044, 'grad_norm': 7.521743297576904, 'learning_rate': 3.831484307003175e-06, 'epoch': 40.28}
- 81%|████████  | 4673/5800 [13:04:03<2:09:41,  6.90s/it]score1 tensor([[0.5859],
-        [0.4453],
-        [0.4453],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5898, 0.4551, 0.4434, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:40,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.86 | optimizer_step: 4.37
-[2025-01-25 21:50:40,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.60 | bwd_microstep: 4646.90 | bwd_inner_microstep: 4641.13 | bwd_allreduce_microstep: 5.66 | step_microstep: 43.27
-[2025-01-25 21:50:40,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.57 | bwd: 4646.94 | bwd_inner: 4641.13 | bwd_allreduce: 5.72 | step: 43.28
- 81%|████████  | 4674/5800 [13:04:10<2:09:46,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.362701892852783, 'learning_rate': 3.82491329264308e-06, 'epoch': 40.29}
- 81%|████████  | 4674/5800 [13:04:10<2:09:46,  6.92s/it]score1 tensor([[0.5664],
-        [0.6016],
-        [0.5078],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5938, 0.6094, 0.5078, 0.5820], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0127, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:47,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:50:47,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.92 | bwd_microstep: 4586.55 | bwd_inner_microstep: 4581.14 | bwd_allreduce_microstep: 5.30 | step_microstep: 51.61
-[2025-01-25 21:50:47,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.89 | bwd: 4586.58 | bwd_inner: 4581.14 | bwd_allreduce: 5.36 | step: 51.62
- 81%|████████  | 4675/5800 [13:04:16<2:09:27,  6.90s/it]                                                        {'loss': 0.0127, 'grad_norm': 6.598202705383301, 'learning_rate': 3.8183473219556865e-06, 'epoch': 40.3}
- 81%|████████  | 4675/5800 [13:04:16<2:09:27,  6.90s/it]score1 tensor([[0.4062],
-        [0.5078],
-        [0.3789],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4082, 0.5117, 0.3789, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:50:53,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.73 | optimizer_step: 4.37
-[2025-01-25 21:50:53,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.24 | bwd_microstep: 4594.84 | bwd_inner_microstep: 4589.65 | bwd_allreduce_microstep: 5.09 | step_microstep: 43.42
-[2025-01-25 21:50:53,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.21 | bwd: 4594.87 | bwd_inner: 4589.65 | bwd_allreduce: 5.14 | step: 43.43
- 81%|████████  | 4676/5800 [13:04:23<2:09:13,  6.90s/it]                                                        {'loss': 0.0029, 'grad_norm': 5.770474910736084, 'learning_rate': 3.8117863969883704e-06, 'epoch': 40.31}
- 81%|████████  | 4676/5800 [13:04:23<2:09:13,  6.90s/it]score1 tensor([[0.4414],
-        [0.4707],
-        [0.4473],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4473, 0.4805, 0.4453, 0.6328], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:00,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 21:51:00,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.66 | bwd_microstep: 4642.96 | bwd_inner_microstep: 4638.15 | bwd_allreduce_microstep: 4.68 | step_microstep: 45.00
-[2025-01-25 21:51:00,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.62 | bwd: 4642.98 | bwd_inner: 4638.16 | bwd_allreduce: 4.75 | step: 45.02
- 81%|████████  | 4677/5800 [13:04:30<2:09:16,  6.91s/it]                                                        {'loss': 0.0054, 'grad_norm': 4.251932621002197, 'learning_rate': 3.805230519786951e-06, 'epoch': 40.32}
- 81%|████████  | 4677/5800 [13:04:30<2:09:16,  6.91s/it]score1 tensor([[0.4473],
-        [0.5273],
-        [0.5000],
-        [0.5938]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4492, 0.5352, 0.5117, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:07,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 21:51:07,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.35 | bwd_microstep: 4637.94 | bwd_inner_microstep: 4632.50 | bwd_allreduce_microstep: 5.30 | step_microstep: 45.88
-[2025-01-25 21:51:07,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.30 | bwd: 4637.97 | bwd_inner: 4632.50 | bwd_allreduce: 5.37 | step: 45.88
- 81%|████████  | 4678/5800 [13:04:37<2:09:15,  6.91s/it]                                                        {'loss': 0.0073, 'grad_norm': 8.193316459655762, 'learning_rate': 3.7986796923956594e-06, 'epoch': 40.33}
- 81%|████████  | 4678/5800 [13:04:37<2:09:15,  6.91s/it]score1 tensor([[0.4922],
-        [0.4297],
-        [0.4492],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4961, 0.4297, 0.4492, 0.4707], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0015, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:14,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 21:51:14,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.97 | bwd_microstep: 4542.23 | bwd_inner_microstep: 4537.20 | bwd_allreduce_microstep: 4.95 | step_microstep: 48.61
-[2025-01-25 21:51:14,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.94 | bwd: 4542.26 | bwd_inner: 4537.20 | bwd_allreduce: 4.99 | step: 48.61
- 81%|████████  | 4679/5800 [13:04:44<2:08:43,  6.89s/it]                                                        {'loss': 0.0015, 'grad_norm': 0.280099481344223, 'learning_rate': 3.79213391685715e-06, 'epoch': 40.34}
- 81%|████████  | 4679/5800 [13:04:44<2:08:43,  6.89s/it]score1 tensor([[0.3809],
-        [0.4062],
-        [0.4766],
-        [0.5000]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4043, 0.4863, 0.4941], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:21,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:51:21,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.11 | bwd_microstep: 4645.82 | bwd_inner_microstep: 4639.93 | bwd_allreduce_microstep: 5.76 | step_microstep: 44.51
-[2025-01-25 21:51:21,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.08 | bwd: 4645.85 | bwd_inner: 4639.93 | bwd_allreduce: 5.83 | step: 44.53
- 81%|████████  | 4680/5800 [13:04:51<2:08:57,  6.91s/it]                                                        {'loss': 0.0059, 'grad_norm': 3.615825891494751, 'learning_rate': 3.7855931952125225e-06, 'epoch': 40.34}
- 81%|████████  | 4680/5800 [13:04:51<2:08:57,  6.91s/it]score1 tensor([[0.6133],
-        [0.4297],
-        [0.5352],
-        [0.4766]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4316, 0.5312, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:28,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 21:51:28,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.86 | bwd_microstep: 4639.16 | bwd_inner_microstep: 4633.23 | bwd_allreduce_microstep: 5.82 | step_microstep: 46.03
-[2025-01-25 21:51:28,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.80 | bwd: 4639.18 | bwd_inner: 4633.23 | bwd_allreduce: 5.88 | step: 46.04
- 81%|████████  | 4681/5800 [13:04:58<2:08:56,  6.91s/it]                                                        {'loss': 0.0034, 'grad_norm': 0.8438090682029724, 'learning_rate': 3.7790575295012753e-06, 'epoch': 40.35}
- 81%|████████  | 4681/5800 [13:04:58<2:08:56,  6.91s/it]score1 tensor([[0.5430],
-        [0.5469],
-        [0.5391],
-        [0.4023]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5391, 0.5508, 0.5391, 0.4043], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:35,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:51:35,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.69 | bwd_microstep: 4597.59 | bwd_inner_microstep: 4592.11 | bwd_allreduce_microstep: 5.39 | step_microstep: 46.35
-[2025-01-25 21:51:35,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.63 | bwd: 4597.62 | bwd_inner: 4592.11 | bwd_allreduce: 5.44 | step: 46.36
- 81%|████████  | 4682/5800 [13:05:05<2:08:41,  6.91s/it]                                                        {'loss': 0.0024, 'grad_norm': 1.810581922531128, 'learning_rate': 3.7725269217613547e-06, 'epoch': 40.36}
- 81%|████████  | 4682/5800 [13:05:05<2:08:41,  6.91s/it]score1 tensor([[0.4688],
-        [0.4863],
-        [0.5312],
-        [0.5703]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4883, 0.5273, 0.5664], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:42,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:51:42,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.40 | bwd_microstep: 4637.84 | bwd_inner_microstep: 4632.32 | bwd_allreduce_microstep: 5.41 | step_microstep: 46.62
-[2025-01-25 21:51:42,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.36 | bwd: 4637.88 | bwd_inner: 4632.32 | bwd_allreduce: 5.46 | step: 46.63
- 81%|████████  | 4683/5800 [13:05:12<2:08:43,  6.91s/it]                                                        {'loss': 0.0029, 'grad_norm': 0.5808476209640503, 'learning_rate': 3.7660013740291114e-06, 'epoch': 40.37}
- 81%|████████  | 4683/5800 [13:05:12<2:08:43,  6.91s/it]score1 tensor([[0.3906],
-        [0.4688],
-        [0.3984],
-        [0.4824]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3750, 0.4648, 0.4004, 0.4844], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:49,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:51:49,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.27 | bwd_microstep: 4644.74 | bwd_inner_microstep: 4638.77 | bwd_allreduce_microstep: 5.82 | step_microstep: 49.61
-[2025-01-25 21:51:49,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.24 | bwd: 4644.76 | bwd_inner: 4638.77 | bwd_allreduce: 5.89 | step: 49.62
- 81%|████████  | 4684/5800 [13:05:19<2:08:46,  6.92s/it]                                                        {'loss': 0.0059, 'grad_norm': 0.3187166750431061, 'learning_rate': 3.7594808883393175e-06, 'epoch': 40.38}
- 81%|████████  | 4684/5800 [13:05:19<2:08:46,  6.92s/it]score1 tensor([[0.6172],
-        [0.6367],
-        [0.5625],
-        [0.4609]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6133, 0.6367, 0.5586, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:51:56,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 21:51:56,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.11 | bwd_microstep: 4587.49 | bwd_inner_microstep: 4582.18 | bwd_allreduce_microstep: 5.21 | step_microstep: 47.46
-[2025-01-25 21:51:56,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.08 | bwd: 4587.51 | bwd_inner: 4582.18 | bwd_allreduce: 5.26 | step: 47.46
- 81%|████████  | 4685/5800 [13:05:26<2:08:22,  6.91s/it]                                                        {'loss': 0.0049, 'grad_norm': 6.350485324859619, 'learning_rate': 3.7529654667251845e-06, 'epoch': 40.39}
- 81%|████████  | 4685/5800 [13:05:26<2:08:22,  6.91s/it]score1 tensor([[0.6133],
-        [0.5039],
-        [0.5547],
-        [0.5742]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.5156, 0.5547, 0.5781], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:03,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.67 | optimizer_step: 4.37
-[2025-01-25 21:52:03,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.83 | bwd_microstep: 4596.40 | bwd_inner_microstep: 4591.25 | bwd_allreduce_microstep: 5.07 | step_microstep: 48.14
-[2025-01-25 21:52:03,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.79 | bwd: 4596.42 | bwd_inner: 4591.25 | bwd_allreduce: 5.11 | step: 48.15
- 81%|████████  | 4686/5800 [13:05:32<2:08:10,  6.90s/it]                                                        {'loss': 0.0049, 'grad_norm': 6.438380718231201, 'learning_rate': 3.7464551112183257e-06, 'epoch': 40.4}
- 81%|████████  | 4686/5800 [13:05:32<2:08:10,  6.90s/it]score1 tensor([[0.4590],
-        [0.3906],
-        [0.5508],
-        [0.4727]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4551, 0.3867, 0.5469, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:09,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:52:09,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.85 | bwd_microstep: 4596.13 | bwd_inner_microstep: 4590.75 | bwd_allreduce_microstep: 5.29 | step_microstep: 43.10
-[2025-01-25 21:52:09,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.80 | bwd: 4596.15 | bwd_inner: 4590.75 | bwd_allreduce: 5.34 | step: 43.10
- 81%|████████  | 4687/5800 [13:05:39<2:07:56,  6.90s/it]                                                        {'loss': 0.0029, 'grad_norm': 5.783924579620361, 'learning_rate': 3.7399498238487918e-06, 'epoch': 40.41}
- 81%|████████  | 4687/5800 [13:05:39<2:07:56,  6.90s/it]score1 tensor([[0.5625],
-        [0.4551],
-        [0.5547],
-        [0.3848]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5547, 0.4512, 0.5469, 0.3262], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0195, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:16,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.36
-[2025-01-25 21:52:16,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.42 | bwd_microstep: 4639.26 | bwd_inner_microstep: 4634.19 | bwd_allreduce_microstep: 4.96 | step_microstep: 46.30
-[2025-01-25 21:52:16,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.38 | bwd: 4639.29 | bwd_inner: 4634.19 | bwd_allreduce: 5.01 | step: 46.31
- 81%|████████  | 4688/5800 [13:05:46<2:08:03,  6.91s/it]                                                        {'loss': 0.0195, 'grad_norm': 7.873157978057861, 'learning_rate': 3.7334496066450345e-06, 'epoch': 40.41}
- 81%|████████  | 4688/5800 [13:05:46<2:08:03,  6.91s/it]score1 tensor([[0.3730],
-        [0.5234],
-        [0.4590],
-        [0.4570]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3672, 0.5156, 0.4668, 0.4609], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:23,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 21:52:23,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.19 | bwd_microstep: 4638.61 | bwd_inner_microstep: 4633.54 | bwd_allreduce_microstep: 4.98 | step_microstep: 44.93
-[2025-01-25 21:52:23,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.15 | bwd: 4638.63 | bwd_inner: 4633.54 | bwd_allreduce: 5.02 | step: 44.94
- 81%|████████  | 4689/5800 [13:05:53<2:08:04,  6.92s/it]                                                        {'loss': 0.0063, 'grad_norm': 0.32937201857566833, 'learning_rate': 3.726954461633945e-06, 'epoch': 40.42}
- 81%|████████  | 4689/5800 [13:05:53<2:08:04,  6.92s/it]score1 tensor([[0.5742],
-        [0.5391],
-        [0.5508],
-        [0.3496]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5664, 0.5430, 0.5469, 0.3340], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:30,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 21:52:30,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.59 | bwd_microstep: 4641.54 | bwd_inner_microstep: 4636.07 | bwd_allreduce_microstep: 5.37 | step_microstep: 44.63
-[2025-01-25 21:52:30,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.56 | bwd: 4641.57 | bwd_inner: 4636.07 | bwd_allreduce: 5.43 | step: 44.64
- 81%|████████  | 4690/5800 [13:06:00<2:08:02,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 3.894688367843628, 'learning_rate': 3.7204643908408102e-06, 'epoch': 40.43}
- 81%|████████  | 4690/5800 [13:06:00<2:08:02,  6.92s/it]score1 tensor([[0.3906],
-        [0.4473],
-        [0.5703],
-        [0.4629]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3945, 0.4277, 0.5625, 0.4590], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:37,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 21:52:37,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.99 | bwd_microstep: 4642.99 | bwd_inner_microstep: 4638.01 | bwd_allreduce_microstep: 4.88 | step_microstep: 46.30
-[2025-01-25 21:52:37,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.95 | bwd: 4643.01 | bwd_inner: 4638.01 | bwd_allreduce: 4.93 | step: 46.30
- 81%|████████  | 4691/5800 [13:06:07<2:07:59,  6.92s/it]                                                        {'loss': 0.0088, 'grad_norm': 4.254029273986816, 'learning_rate': 3.7139793962893555e-06, 'epoch': 40.44}
- 81%|████████  | 4691/5800 [13:06:07<2:07:59,  6.92s/it]score1 tensor([[0.5352],
-        [0.6562],
-        [0.3848],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5508, 0.6641, 0.3750, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:44,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:52:44,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.87 | bwd_microstep: 4637.94 | bwd_inner_microstep: 4632.62 | bwd_allreduce_microstep: 5.19 | step_microstep: 45.01
-[2025-01-25 21:52:44,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.82 | bwd: 4637.97 | bwd_inner: 4632.62 | bwd_allreduce: 5.26 | step: 45.02
- 81%|████████  | 4692/5800 [13:06:14<2:07:51,  6.92s/it]                                                        {'loss': 0.0093, 'grad_norm': 4.880202293395996, 'learning_rate': 3.7074994800017105e-06, 'epoch': 40.45}
- 81%|████████  | 4692/5800 [13:06:14<2:07:51,  6.92s/it]score1 tensor([[0.6406],
-        [0.4102],
-        [0.1660],
-        [0.4473]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6445, 0.4062, 0.1787, 0.4473], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0051, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:51,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.37
-[2025-01-25 21:52:51,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.47 | bwd_microstep: 4599.89 | bwd_inner_microstep: 4594.93 | bwd_allreduce_microstep: 4.85 | step_microstep: 44.70
-[2025-01-25 21:52:51,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.43 | bwd: 4599.92 | bwd_inner: 4594.93 | bwd_allreduce: 4.90 | step: 44.68
- 81%|████████  | 4693/5800 [13:06:21<2:07:35,  6.92s/it]                                                        {'loss': 0.0051, 'grad_norm': 1.6538969278335571, 'learning_rate': 3.70102464399843e-06, 'epoch': 40.46}
- 81%|████████  | 4693/5800 [13:06:21<2:07:35,  6.92s/it]score1 tensor([[0.6875],
-        [0.5586],
-        [0.4062],
-        [0.5430]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6719, 0.5820, 0.4004, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0132, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:52:58,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.36
-[2025-01-25 21:52:58,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2166.86 | bwd_microstep: 4640.99 | bwd_inner_microstep: 4635.63 | bwd_allreduce_microstep: 5.26 | step_microstep: 47.16
-[2025-01-25 21:52:58,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2166.82 | bwd: 4641.01 | bwd_inner: 4635.63 | bwd_allreduce: 5.31 | step: 47.15
- 81%|████████  | 4694/5800 [13:06:28<2:07:35,  6.92s/it]                                                        {'loss': 0.0132, 'grad_norm': 4.196359634399414, 'learning_rate': 3.694554890298474e-06, 'epoch': 40.47}
- 81%|████████  | 4694/5800 [13:06:28<2:07:35,  6.92s/it]score1 tensor([[0.4805],
-        [0.5312],
-        [0.4414],
-        [0.5156]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4824, 0.5430, 0.4395, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:05,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.37
-[2025-01-25 21:53:05,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.84 | bwd_microstep: 4599.52 | bwd_inner_microstep: 4593.99 | bwd_allreduce_microstep: 5.40 | step_microstep: 47.35
-[2025-01-25 21:53:05,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.81 | bwd: 4599.54 | bwd_inner: 4593.99 | bwd_allreduce: 5.47 | step: 47.36
- 81%|████████  | 4695/5800 [13:06:35<2:07:18,  6.91s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.201416015625, 'learning_rate': 3.6880902209192315e-06, 'epoch': 40.47}
- 81%|████████  | 4695/5800 [13:06:35<2:07:18,  6.91s/it]score1 tensor([[0.4980],
-        [0.3945],
-        [0.4668],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4941, 0.3906, 0.4688, 0.5195], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:12,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.66 | optimizer_step: 4.37
-[2025-01-25 21:53:12,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.96 | bwd_microstep: 4596.17 | bwd_inner_microstep: 4590.61 | bwd_allreduce_microstep: 5.45 | step_microstep: 50.59
-[2025-01-25 21:53:12,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.91 | bwd: 4596.19 | bwd_inner: 4590.61 | bwd_allreduce: 5.51 | step: 50.59
- 81%|████████  | 4696/5800 [13:06:42<2:07:06,  6.91s/it]                                                        {'loss': 0.0024, 'grad_norm': 1.869300127029419, 'learning_rate': 3.68163063787649e-06, 'epoch': 40.48}
- 81%|████████  | 4696/5800 [13:06:42<2:07:06,  6.91s/it]score1 tensor([[0.3730],
-        [0.5664],
-        [0.5195],
-        [0.6289]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3652, 0.5625, 0.5234, 0.6445], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:19,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.37
-[2025-01-25 21:53:19,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.67 | bwd_microstep: 4643.50 | bwd_inner_microstep: 4638.29 | bwd_allreduce_microstep: 5.11 | step_microstep: 45.93
-[2025-01-25 21:53:19,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.64 | bwd: 4643.52 | bwd_inner: 4638.29 | bwd_allreduce: 5.16 | step: 45.93
- 81%|████████  | 4697/5800 [13:06:49<2:07:10,  6.92s/it]                                                        {'loss': 0.0078, 'grad_norm': 0.7362734079360962, 'learning_rate': 3.6751761431844668e-06, 'epoch': 40.49}
- 81%|████████  | 4697/5800 [13:06:49<2:07:10,  6.92s/it]score1 tensor([[0.4707],
-        [0.4570],
-        [0.4492],
-        [0.4512]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4707, 0.4473, 0.4473, 0.4492], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:26,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.71 | optimizer_step: 4.36
-[2025-01-25 21:53:26,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.50 | bwd_microstep: 4593.17 | bwd_inner_microstep: 4585.21 | bwd_allreduce_microstep: 7.84 | step_microstep: 51.22
-[2025-01-25 21:53:26,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.45 | bwd: 4593.20 | bwd_inner: 4585.21 | bwd_allreduce: 7.90 | step: 51.23
- 81%|████████  | 4698/5800 [13:06:55<2:06:56,  6.91s/it]                                                        {'loss': 0.0034, 'grad_norm': 5.695019245147705, 'learning_rate': 3.668726738855779e-06, 'epoch': 40.5}
- 81%|████████  | 4698/5800 [13:06:55<2:06:56,  6.91s/it]score1 tensor([[0.5234],
-        [0.4727],
-        [0.5000],
-        [0.5273]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4551, 0.4980, 0.5312], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:32,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.36
-[2025-01-25 21:53:32,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.60 | bwd_microstep: 4638.03 | bwd_inner_microstep: 4632.92 | bwd_allreduce_microstep: 5.04 | step_microstep: 44.71
-[2025-01-25 21:53:32,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.56 | bwd: 4638.05 | bwd_inner: 4632.92 | bwd_allreduce: 5.07 | step: 44.73
- 81%|████████  | 4699/5800 [13:07:02<2:06:59,  6.92s/it]                                                        {'loss': 0.0068, 'grad_norm': 0.4571056067943573, 'learning_rate': 3.662282426901471e-06, 'epoch': 40.51}
- 81%|████████  | 4699/5800 [13:07:02<2:06:59,  6.92s/it]score1 tensor([[0.5195],
-        [0.4590],
-        [0.6914],
-        [0.5039]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5234, 0.4629, 0.6875, 0.5039], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:39,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:53:39,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.25 | bwd_microstep: 4598.79 | bwd_inner_microstep: 4593.24 | bwd_allreduce_microstep: 5.42 | step_microstep: 49.27
-[2025-01-25 21:53:39,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.21 | bwd: 4598.83 | bwd_inner: 4593.24 | bwd_allreduce: 5.49 | step: 49.28
- 81%|████████  | 4700/5800 [13:07:09<2:06:46,  6.92s/it]                                                        {'loss': 0.0029, 'grad_norm': 1.6052155494689941, 'learning_rate': 3.6558432093309825e-06, 'epoch': 40.52}
- 81%|████████  | 4700/5800 [13:07:09<2:06:46,  6.92s/it]score1 tensor([[0.4746],
-        [0.5117],
-        [0.5781],
-        [0.5547]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4844, 0.5117, 0.5781, 0.5742], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0073, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:46,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.80 | optimizer_step: 4.36
-[2025-01-25 21:53:46,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.73 | bwd_microstep: 4542.43 | bwd_inner_microstep: 4537.51 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.64
-[2025-01-25 21:53:46,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.70 | bwd: 4542.45 | bwd_inner: 4537.51 | bwd_allreduce: 4.87 | step: 44.65
- 81%|████████  | 4701/5800 [13:07:16<2:06:14,  6.89s/it]                                                        {'loss': 0.0073, 'grad_norm': 4.103417873382568, 'learning_rate': 3.6494090881521827e-06, 'epoch': 40.53}
- 81%|████████  | 4701/5800 [13:07:16<2:06:14,  6.89s/it]score1 tensor([[0.6211],
-        [0.4609],
-        [0.4062],
-        [0.4551]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6250, 0.4688, 0.4023, 0.4551], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:53:53,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.53 | optimizer_step: 4.36
-[2025-01-25 21:53:53,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.67 | bwd_microstep: 4590.69 | bwd_inner_microstep: 4585.55 | bwd_allreduce_microstep: 5.07 | step_microstep: 44.59
-[2025-01-25 21:53:53,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.64 | bwd: 4590.72 | bwd_inner: 4585.55 | bwd_allreduce: 5.11 | step: 44.60
- 81%|████████  | 4702/5800 [13:07:23<2:06:04,  6.89s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.4354312419891357, 'learning_rate': 3.642980065371333e-06, 'epoch': 40.53}
- 81%|████████  | 4702/5800 [13:07:23<2:06:04,  6.89s/it]score1 tensor([[0.6055],
-        [0.4121],
-        [0.4434],
-        [0.6445]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6055, 0.4121, 0.4512, 0.6484], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0029, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:00,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.01 | optimizer_step: 4.37
-[2025-01-25 21:54:00,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.71 | bwd_microstep: 4561.62 | bwd_inner_microstep: 4556.14 | bwd_allreduce_microstep: 5.37 | step_microstep: 50.28
-[2025-01-25 21:54:00,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.67 | bwd: 4561.64 | bwd_inner: 4556.14 | bwd_allreduce: 5.44 | step: 50.28
- 81%|████████  | 4703/5800 [13:07:30<2:05:46,  6.88s/it]                                                        {'loss': 0.0029, 'grad_norm': 4.312981605529785, 'learning_rate': 3.6365561429931173e-06, 'epoch': 40.54}
- 81%|████████  | 4703/5800 [13:07:30<2:05:46,  6.88s/it]score1 tensor([[0.4316],
-        [0.5039],
-        [0.5742],
-        [0.6133]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4277, 0.5000, 0.5664, 0.6016], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:07,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 21:54:07,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.72 | bwd_microstep: 4642.54 | bwd_inner_microstep: 4636.57 | bwd_allreduce_microstep: 5.85 | step_microstep: 45.68
-[2025-01-25 21:54:07,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.69 | bwd: 4642.57 | bwd_inner: 4636.58 | bwd_allreduce: 5.92 | step: 45.69
- 81%|████████  | 4704/5800 [13:07:37<2:05:57,  6.90s/it]                                                        {'loss': 0.0068, 'grad_norm': 8.313942909240723, 'learning_rate': 3.6301373230206284e-06, 'epoch': 40.55}
- 81%|████████  | 4704/5800 [13:07:37<2:05:57,  6.90s/it]score1 tensor([[0.4902],
-        [0.3711],
-        [0.5469],
-        [0.4883]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.3809, 0.5312, 0.4961], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:14,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.84 | optimizer_step: 4.36
-[2025-01-25 21:54:14,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.29 | bwd_microstep: 4648.45 | bwd_inner_microstep: 4643.38 | bwd_allreduce_microstep: 4.98 | step_microstep: 45.27
-[2025-01-25 21:54:14,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.26 | bwd: 4648.47 | bwd_inner: 4643.38 | bwd_allreduce: 5.03 | step: 45.28
- 81%|████████  | 4705/5800 [13:07:44<2:06:04,  6.91s/it]                                                        {'loss': 0.0088, 'grad_norm': 3.5719778537750244, 'learning_rate': 3.6237236074553606e-06, 'epoch': 40.56}
- 81%|████████  | 4705/5800 [13:07:44<2:06:04,  6.91s/it]score1 tensor([[0.5547],
-        [0.4590],
-        [0.4277],
-        [0.5898]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4629, 0.4141, 0.5977], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:21,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 21:54:21,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.68 | bwd_microstep: 4646.61 | bwd_inner_microstep: 4636.89 | bwd_allreduce_microstep: 9.57 | step_microstep: 48.67
-[2025-01-25 21:54:21,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.64 | bwd: 4646.63 | bwd_inner: 4636.89 | bwd_allreduce: 9.67 | step: 48.68
- 81%|████████  | 4706/5800 [13:07:51<2:06:07,  6.92s/it]                                                        {'loss': 0.0093, 'grad_norm': 0.4421692192554474, 'learning_rate': 3.61731499829723e-06, 'epoch': 40.57}
- 81%|████████  | 4706/5800 [13:07:51<2:06:07,  6.92s/it]score1 tensor([[0.5430],
-        [0.4648],
-        [0.3965],
-        [0.3691]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5430, 0.4688, 0.4043, 0.3652], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0039, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:28,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.36
-[2025-01-25 21:54:28,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.65 | bwd_microstep: 4583.70 | bwd_inner_microstep: 4578.64 | bwd_allreduce_microstep: 4.96 | step_microstep: 48.25
-[2025-01-25 21:54:28,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.61 | bwd: 4583.72 | bwd_inner: 4578.64 | bwd_allreduce: 5.01 | step: 48.25
- 81%|████████  | 4707/5800 [13:07:58<2:05:51,  6.91s/it]                                                        {'loss': 0.0039, 'grad_norm': 2.0087385177612305, 'learning_rate': 3.6109114975445402e-06, 'epoch': 40.58}
- 81%|████████  | 4707/5800 [13:07:58<2:05:51,  6.91s/it]score1 tensor([[0.4941],
-        [0.4492],
-        [0.4961],
-        [0.6367]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4922, 0.4492, 0.4785, 0.6289], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0068, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:34,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.63 | optimizer_step: 4.37
-[2025-01-25 21:54:35,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.82 | bwd_microstep: 4581.16 | bwd_inner_microstep: 4575.74 | bwd_allreduce_microstep: 5.31 | step_microstep: 49.69
-[2025-01-25 21:54:35,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.76 | bwd: 4581.18 | bwd_inner: 4575.74 | bwd_allreduce: 5.37 | step: 49.69
- 81%|████████  | 4708/5800 [13:08:04<2:05:34,  6.90s/it]                                                        {'loss': 0.0068, 'grad_norm': 6.321859836578369, 'learning_rate': 3.6045131071940275e-06, 'epoch': 40.59}
- 81%|████████  | 4708/5800 [13:08:04<2:05:34,  6.90s/it]score1 tensor([[0.5312],
-        [0.4980],
-        [0.5664],
-        [0.5859]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.4922, 0.5664, 0.5898], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:41,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:54:41,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.11 | bwd_microstep: 4587.57 | bwd_inner_microstep: 4582.18 | bwd_allreduce_microstep: 5.29 | step_microstep: 48.41
-[2025-01-25 21:54:41,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.08 | bwd: 4587.60 | bwd_inner: 4582.18 | bwd_allreduce: 5.34 | step: 48.43
- 81%|████████  | 4709/5800 [13:08:11<2:05:22,  6.89s/it]                                                        {'loss': 0.0034, 'grad_norm': 2.338186025619507, 'learning_rate': 3.598119829240816e-06, 'epoch': 40.59}
- 81%|████████  | 4709/5800 [13:08:11<2:05:22,  6.89s/it]score1 tensor([[0.4785],
-        [0.5664],
-        [0.4160],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4785, 0.5547, 0.4180, 0.4727], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:48,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.37
-[2025-01-25 21:54:48,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.28 | bwd_microstep: 4578.38 | bwd_inner_microstep: 4573.46 | bwd_allreduce_microstep: 4.84 | step_microstep: 44.31
-[2025-01-25 21:54:48,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.24 | bwd: 4578.40 | bwd_inner: 4573.46 | bwd_allreduce: 4.88 | step: 44.31
- 81%|████████  | 4710/5800 [13:08:18<2:05:06,  6.89s/it]                                                        {'loss': 0.0054, 'grad_norm': 2.329803228378296, 'learning_rate': 3.591731665678433e-06, 'epoch': 40.6}
- 81%|████████  | 4710/5800 [13:08:18<2:05:06,  6.89s/it]score1 tensor([[0.5195],
-        [0.4473],
-        [0.5703],
-        [0.5664]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.4453, 0.5703, 0.5625], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0015, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:54:55,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.65 | optimizer_step: 4.36
-[2025-01-25 21:54:55,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.33 | bwd_microstep: 4544.73 | bwd_inner_microstep: 4539.51 | bwd_allreduce_microstep: 5.13 | step_microstep: 48.32
-[2025-01-25 21:54:55,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.29 | bwd: 4544.75 | bwd_inner: 4539.51 | bwd_allreduce: 5.17 | step: 48.33
- 81%|████████  | 4711/5800 [13:08:25<2:04:40,  6.87s/it]                                                        {'loss': 0.0015, 'grad_norm': 4.022320747375488, 'learning_rate': 3.5853486184988297e-06, 'epoch': 40.61}
- 81%|████████  | 4711/5800 [13:08:25<2:04:40,  6.87s/it]score1 tensor([[0.4766],
-        [0.6133],
-        [0.4199],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4648, 0.6133, 0.4219, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:02,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.72 | optimizer_step: 4.37
-[2025-01-25 21:55:02,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2161.49 | bwd_microstep: 4539.42 | bwd_inner_microstep: 4534.41 | bwd_allreduce_microstep: 4.93 | step_microstep: 44.28
-[2025-01-25 21:55:02,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2161.45 | bwd: 4539.44 | bwd_inner: 4534.41 | bwd_allreduce: 4.97 | step: 44.30
- 81%|████████  | 4712/5800 [13:08:32<2:04:18,  6.85s/it]                                                        {'loss': 0.0034, 'grad_norm': 0.3112442195415497, 'learning_rate': 3.5789706896923404e-06, 'epoch': 40.62}
- 81%|████████  | 4712/5800 [13:08:32<2:04:18,  6.85s/it]score1 tensor([[0.4316],
-        [0.4355],
-        [0.4961],
-        [0.4004]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4238, 0.4199, 0.4844, 0.3984], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0093, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:09,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.69 | optimizer_step: 4.37
-[2025-01-25 21:55:09,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.75 | bwd_microstep: 4640.43 | bwd_inner_microstep: 4635.11 | bwd_allreduce_microstep: 5.23 | step_microstep: 50.57
-[2025-01-25 21:55:09,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.72 | bwd: 4640.45 | bwd_inner: 4635.11 | bwd_allreduce: 5.27 | step: 50.58
- 81%|████████▏ | 4713/5800 [13:08:39<2:04:36,  6.88s/it]                                                        {'loss': 0.0093, 'grad_norm': 7.466474533081055, 'learning_rate': 3.5725978812477234e-06, 'epoch': 40.63}
- 81%|████████▏ | 4713/5800 [13:08:39<2:04:36,  6.88s/it]score1 tensor([[0.4492],
-        [0.4180],
-        [0.3730],
-        [0.5781]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4375, 0.4160, 0.3652, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0122, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:16,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.90 | optimizer_step: 4.37
-[2025-01-25 21:55:16,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.55 | bwd_microstep: 4634.21 | bwd_inner_microstep: 4628.57 | bwd_allreduce_microstep: 5.54 | step_microstep: 51.84
-[2025-01-25 21:55:16,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.51 | bwd: 4634.23 | bwd_inner: 4628.57 | bwd_allreduce: 5.59 | step: 51.85
- 81%|████████▏ | 4714/5800 [13:08:46<2:04:42,  6.89s/it]                                                        {'loss': 0.0122, 'grad_norm': 3.2751524448394775, 'learning_rate': 3.5662301951521294e-06, 'epoch': 40.64}
- 81%|████████▏ | 4714/5800 [13:08:46<2:04:42,  6.89s/it]score1 tensor([[0.5352],
-        [0.5508],
-        [0.3633],
-        [0.4805]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5352, 0.5547, 0.3691, 0.4766], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0034, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:23,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.36
-[2025-01-25 21:55:23,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.91 | bwd_microstep: 4584.42 | bwd_inner_microstep: 4578.77 | bwd_allreduce_microstep: 5.55 | step_microstep: 46.79
-[2025-01-25 21:55:23,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.87 | bwd: 4584.44 | bwd_inner: 4578.77 | bwd_allreduce: 5.60 | step: 46.80
- 81%|████████▏ | 4715/5800 [13:08:53<2:04:29,  6.88s/it]                                                        {'loss': 0.0034, 'grad_norm': 1.8615673780441284, 'learning_rate': 3.5598676333911075e-06, 'epoch': 40.65}
- 81%|████████▏ | 4715/5800 [13:08:53<2:04:29,  6.88s/it]score1 tensor([[0.5312],
-        [0.4883],
-        [0.7109],
-        [0.3750]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5273, 0.4941, 0.7031, 0.3711], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0054, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:30,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.77 | optimizer_step: 4.36
-[2025-01-25 21:55:30,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.71 | bwd_microstep: 4641.64 | bwd_inner_microstep: 4636.50 | bwd_allreduce_microstep: 5.01 | step_microstep: 49.54
-[2025-01-25 21:55:30,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.67 | bwd: 4641.66 | bwd_inner: 4636.50 | bwd_allreduce: 5.07 | step: 49.54
- 81%|████████▏ | 4716/5800 [13:09:00<2:04:42,  6.90s/it]                                                        {'loss': 0.0054, 'grad_norm': 4.333514213562012, 'learning_rate': 3.553510197948622e-06, 'epoch': 40.66}
- 81%|████████▏ | 4716/5800 [13:09:00<2:04:42,  6.90s/it]score1 tensor([[0.4121],
-        [0.4121],
-        [0.5195],
-        [0.4844]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4121, 0.4121, 0.5195, 0.4805], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0010, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:36,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.03 | optimizer_step: 4.36
-[2025-01-25 21:55:36,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.34 | bwd_microstep: 4516.12 | bwd_inner_microstep: 4510.57 | bwd_allreduce_microstep: 5.42 | step_microstep: 56.97
-[2025-01-25 21:55:36,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.30 | bwd: 4516.15 | bwd_inner: 4510.57 | bwd_allreduce: 5.49 | step: 56.97
- 81%|████████▏ | 4717/5800 [13:09:06<2:04:07,  6.88s/it]                                                        {'loss': 0.001, 'grad_norm': 1.9600282907485962, 'learning_rate': 3.5471578908070225e-06, 'epoch': 40.66}
- 81%|████████▏ | 4717/5800 [13:09:06<2:04:07,  6.88s/it]score1 tensor([[0.5312],
-        [0.5859],
-        [0.5078],
-        [0.6250]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5195, 0.5820, 0.5039, 0.6211], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:43,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.36
-[2025-01-25 21:55:43,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2167.43 | bwd_microstep: 4632.75 | bwd_inner_microstep: 4627.47 | bwd_allreduce_microstep: 5.18 | step_microstep: 48.64
-[2025-01-25 21:55:43,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2167.38 | bwd: 4632.77 | bwd_inner: 4627.47 | bwd_allreduce: 5.23 | step: 48.65
- 81%|████████▏ | 4718/5800 [13:09:13<2:04:18,  6.89s/it]                                                        {'loss': 0.0059, 'grad_norm': 8.627897262573242, 'learning_rate': 3.5408107139470805e-06, 'epoch': 40.67}
- 81%|████████▏ | 4718/5800 [13:09:13<2:04:18,  6.89s/it]score1 tensor([[0.6172],
-        [0.4375],
-        [0.4648],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6172, 0.4336, 0.4648, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0020, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:50,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.68 | optimizer_step: 4.37
-[2025-01-25 21:55:50,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.13 | bwd_microstep: 4539.59 | bwd_inner_microstep: 4532.89 | bwd_allreduce_microstep: 6.58 | step_microstep: 47.14
-[2025-01-25 21:55:50,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.07 | bwd: 4539.61 | bwd_inner: 4532.89 | bwd_allreduce: 6.64 | step: 47.15
- 81%|████████▏ | 4719/5800 [13:09:20<2:03:51,  6.87s/it]                                                        {'loss': 0.002, 'grad_norm': 3.889409303665161, 'learning_rate': 3.5344686693479437e-06, 'epoch': 40.68}
- 81%|████████▏ | 4719/5800 [13:09:20<2:03:51,  6.87s/it]score1 tensor([[0.3340],
-        [0.5781],
-        [0.5547],
-        [0.6016]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.3457, 0.5781, 0.5508, 0.6055], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0049, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:55:57,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.09 | optimizer_step: 4.36
-[2025-01-25 21:55:57,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.95 | bwd_microstep: 4589.23 | bwd_inner_microstep: 4583.79 | bwd_allreduce_microstep: 5.34 | step_microstep: 55.48
-[2025-01-25 21:55:57,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.92 | bwd: 4589.25 | bwd_inner: 4583.79 | bwd_allreduce: 5.38 | step: 55.49
- 81%|████████▏ | 4720/5800 [13:09:27<2:03:50,  6.88s/it]                                                        {'loss': 0.0049, 'grad_norm': 1.7936084270477295, 'learning_rate': 3.5281317589871834e-06, 'epoch': 40.69}
- 81%|████████▏ | 4720/5800 [13:09:27<2:03:50,  6.88s/it]score1 tensor([[0.6016],
-        [0.3984],
-        [0.6133],
-        [0.5469]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6094, 0.4004, 0.6094, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0044, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:04,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.75 | optimizer_step: 4.36
-[2025-01-25 21:56:04,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.27 | bwd_microstep: 4628.71 | bwd_inner_microstep: 4623.23 | bwd_allreduce_microstep: 5.36 | step_microstep: 44.32
-[2025-01-25 21:56:04,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.22 | bwd: 4628.73 | bwd_inner: 4623.23 | bwd_allreduce: 5.42 | step: 44.33
- 81%|████████▏ | 4721/5800 [13:09:34<2:03:54,  6.89s/it]                                                        {'loss': 0.0044, 'grad_norm': 3.8346266746520996, 'learning_rate': 3.5217999848407525e-06, 'epoch': 40.7}
- 81%|████████▏ | 4721/5800 [13:09:34<2:03:54,  6.89s/it]score1 tensor([[0.4941],
-        [0.5977],
-        [0.5195],
-        [0.5234]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4980, 0.6055, 0.5117, 0.5352], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0078, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:11,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.92 | optimizer_step: 4.36
-[2025-01-25 21:56:11,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.84 | bwd_microstep: 4639.96 | bwd_inner_microstep: 4634.39 | bwd_allreduce_microstep: 5.45 | step_microstep: 51.26
-[2025-01-25 21:56:11,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.80 | bwd: 4639.98 | bwd_inner: 4634.39 | bwd_allreduce: 5.51 | step: 51.27
- 81%|████████▏ | 4722/5800 [13:09:41<2:04:04,  6.91s/it]                                                        {'loss': 0.0078, 'grad_norm': 4.215353488922119, 'learning_rate': 3.515473348883003e-06, 'epoch': 40.71}
- 81%|████████▏ | 4722/5800 [13:09:41<2:04:04,  6.91s/it]score1 tensor([[0.4004],
-        [0.5977],
-        [0.4844],
-        [0.5117]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4062, 0.6016, 0.5039, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0083, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:18,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 21:56:18,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.99 | bwd_microstep: 4630.75 | bwd_inner_microstep: 4624.87 | bwd_allreduce_microstep: 5.79 | step_microstep: 49.13
-[2025-01-25 21:56:18,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.96 | bwd: 4630.78 | bwd_inner: 4624.87 | bwd_allreduce: 5.83 | step: 49.14
- 81%|████████▏ | 4723/5800 [13:09:48<2:04:04,  6.91s/it]                                                        {'loss': 0.0083, 'grad_norm': 7.97247314453125, 'learning_rate': 3.5091518530866984e-06, 'epoch': 40.72}
- 81%|████████▏ | 4723/5800 [13:09:48<2:04:04,  6.91s/it]score1 tensor([[0.6328],
-        [0.3535],
-        [0.6367],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6406, 0.3555, 0.6445, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:25,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.42 | optimizer_step: 4.36
-[2025-01-25 21:56:25,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.01 | bwd_microstep: 4637.92 | bwd_inner_microstep: 4630.75 | bwd_allreduce_microstep: 7.05 | step_microstep: 49.85
-[2025-01-25 21:56:25,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.98 | bwd: 4637.95 | bwd_inner: 4630.75 | bwd_allreduce: 7.12 | step: 49.85
- 81%|████████▏ | 4724/5800 [13:09:55<2:04:06,  6.92s/it]                                                        {'loss': 0.0063, 'grad_norm': 8.402565956115723, 'learning_rate': 3.502835499422983e-06, 'epoch': 40.72}
- 81%|████████▏ | 4724/5800 [13:09:55<2:04:06,  6.92s/it]score1 tensor([[0.6094],
-        [0.6055],
-        [0.5391],
-        [0.4238]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.6094, 0.5352, 0.4297], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:32,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.82 | optimizer_step: 4.37
-[2025-01-25 21:56:32,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.40 | bwd_microstep: 4640.02 | bwd_inner_microstep: 4634.83 | bwd_allreduce_microstep: 5.11 | step_microstep: 45.47
-[2025-01-25 21:56:32,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.36 | bwd: 4640.05 | bwd_inner: 4634.83 | bwd_allreduce: 5.15 | step: 45.51
- 81%|████████▏ | 4725/5800 [13:10:02<2:04:01,  6.92s/it]                                                        {'loss': 0.0063, 'grad_norm': 4.230506420135498, 'learning_rate': 3.4965242898614162e-06, 'epoch': 40.73}
- 81%|████████▏ | 4725/5800 [13:10:02<2:04:01,  6.92s/it]score1 tensor([[0.5195],
-        [0.5977],
-        [0.4141],
-        [0.4746]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5312, 0.6172, 0.4141, 0.4863], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0107, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:39,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.74 | optimizer_step: 4.37
-[2025-01-25 21:56:39,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.23 | bwd_microstep: 4591.19 | bwd_inner_microstep: 4585.61 | bwd_allreduce_microstep: 5.41 | step_microstep: 46.16
-[2025-01-25 21:56:39,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.20 | bwd: 4591.21 | bwd_inner: 4585.61 | bwd_allreduce: 5.51 | step: 46.16
- 81%|████████▏ | 4726/5800 [13:10:09<2:03:39,  6.91s/it]                                                        {'loss': 0.0107, 'grad_norm': 6.493091106414795, 'learning_rate': 3.490218226369928e-06, 'epoch': 40.74}
- 81%|████████▏ | 4726/5800 [13:10:09<2:03:39,  6.91s/it]score1 tensor([[0.4688],
-        [0.6172],
-        [0.4180],
-        [0.5195]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4688, 0.6133, 0.4160, 0.5156], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0024, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:45,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.37
-[2025-01-25 21:56:45,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2165.92 | bwd_microstep: 4579.92 | bwd_inner_microstep: 4574.36 | bwd_allreduce_microstep: 5.45 | step_microstep: 48.99
-[2025-01-25 21:56:45,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2165.88 | bwd: 4579.95 | bwd_inner: 4574.36 | bwd_allreduce: 5.50 | step: 49.00
- 82%|████████▏ | 4727/5800 [13:10:15<2:03:21,  6.90s/it]                                                        {'loss': 0.0024, 'grad_norm': 6.166811943054199, 'learning_rate': 3.4839173109148728e-06, 'epoch': 40.75}
- 82%|████████▏ | 4727/5800 [13:10:15<2:03:21,  6.90s/it]score1 tensor([[0.4922],
-        [0.5664],
-        [0.3750],
-        [0.5625]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4883, 0.5469, 0.3750, 0.5508], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0088, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:52,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.81 | optimizer_step: 4.37
-[2025-01-25 21:56:52,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2162.45 | bwd_microstep: 4589.63 | bwd_inner_microstep: 4583.38 | bwd_allreduce_microstep: 6.15 | step_microstep: 47.33
-[2025-01-25 21:56:52,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2162.39 | bwd: 4589.65 | bwd_inner: 4583.38 | bwd_allreduce: 6.20 | step: 47.34
- 82%|████████▏ | 4728/5800 [13:10:22<2:03:07,  6.89s/it]                                                        {'loss': 0.0088, 'grad_norm': 6.293827533721924, 'learning_rate': 3.477621545460974e-06, 'epoch': 40.76}
- 82%|████████▏ | 4728/5800 [13:10:22<2:03:07,  6.89s/it]score1 tensor([[0.4004],
-        [0.4023],
-        [0.6914],
-        [0.5312]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.4004, 0.3887, 0.6836, 0.5273], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:56:59,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.64 | optimizer_step: 4.36
-[2025-01-25 21:56:59,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2159.77 | bwd_microstep: 4588.91 | bwd_inner_microstep: 4583.87 | bwd_allreduce_microstep: 4.97 | step_microstep: 45.41
-[2025-01-25 21:56:59,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2159.73 | bwd: 4588.93 | bwd_inner: 4583.87 | bwd_allreduce: 5.01 | step: 45.42
- 82%|████████▏ | 4729/5800 [13:10:29<2:02:55,  6.89s/it]                                                        {'loss': 0.0063, 'grad_norm': 6.3284711837768555, 'learning_rate': 3.471330931971371e-06, 'epoch': 40.77}
- 82%|████████▏ | 4729/5800 [13:10:29<2:02:55,  6.89s/it]score1 tensor([[0.6562],
-        [0.6562],
-        [0.6523],
-        [0.5391]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6562, 0.6875, 0.6445, 0.5391], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0098, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:57:06,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.70 | optimizer_step: 4.36
-[2025-01-25 21:57:06,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.40 | bwd_microstep: 4542.32 | bwd_inner_microstep: 4536.52 | bwd_allreduce_microstep: 5.68 | step_microstep: 44.97
-[2025-01-25 21:57:06,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.36 | bwd: 4542.35 | bwd_inner: 4536.52 | bwd_allreduce: 5.74 | step: 44.99
- 82%|████████▏ | 4730/5800 [13:10:36<2:02:31,  6.87s/it]                                                        {'loss': 0.0098, 'grad_norm': 0.3052787184715271, 'learning_rate': 3.4650454724075754e-06, 'epoch': 40.78}
- 82%|████████▏ | 4730/5800 [13:10:36<2:02:31,  6.87s/it]score1 tensor([[0.6133],
-        [0.5703],
-        [0.4766],
-        [0.6680]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.6211, 0.5664, 0.4766, 0.6797], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0059, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:57:13,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 4.74 | optimizer_step: 4.36
-[2025-01-25 21:57:13,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2164.45 | bwd_microstep: 4593.37 | bwd_inner_microstep: 4586.71 | bwd_allreduce_microstep: 6.46 | step_microstep: 50.91
-[2025-01-25 21:57:13,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2164.38 | bwd: 4593.39 | bwd_inner: 4586.71 | bwd_allreduce: 6.56 | step: 50.91
- 82%|████████▏ | 4731/5800 [13:10:43<2:02:35,  6.88s/it]                                                        {'loss': 0.0059, 'grad_norm': 2.581829309463501, 'learning_rate': 3.4587651687295076e-06, 'epoch': 40.78}
- 82%|████████▏ | 4731/5800 [13:10:43<2:02:35,  6.88s/it]score1 tensor([[0.5508],
-        [0.3379],
-        [0.5898],
-        [0.4941]], device='cuda:0', dtype=torch.bfloat16,
-       grad_fn=<ReluBackward0>)
-mos tensor([0.5469, 0.3457, 0.5977, 0.5000], device='cuda:0', dtype=torch.bfloat16)
-loss1 tensor(0.0063, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)
-[2025-01-25 21:57:20,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 0.00 | optimizer_gradients: 3.78 | optimizer_step: 4.36
-[2025-01-25 21:57:20,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2163.16 | bwd_microstep: 4630.20 | bwd_inner_microstep: 4624.67 | bwd_allreduce_microstep: 5.41 | step_microstep: 47.19
-[2025-01-25 21:57:20,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2163.12 | bwd: 4630.22 | bwd_inner: 4624.67 | bwd_allreduce: 5.47 | step: 47.20
- 82%|████████▏ | 4732/5800 [13:10:50<2:02:43,  6.89s/it]Exception in thread Thread-2:
-Traceback (most recent call last):
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/tensorboard/summary/writer/event_file_writer.py", line 194, in flush
-    self._check_worker_status()
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/tensorboard/summary/writer/event_file_writer.py", line 212, in _check_worker_status
-    raise exception
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/threading.py", line 980, in _bootstrap_inner
-    self.run()
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/tensorboard/summary/writer/event_file_writer.py", line 244, in run
-    self._run()
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/tensorboard/summary/writer/event_file_writer.py", line 275, in _run
-    self._record_writer.write(data)
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write
-    self._writer.write(header + header_crc + data + footer_crc)
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 775, in write
-    [93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
-[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
-[93m [WARNING] [0m using untested triton version (3.0.0), only 1.0.0 is known to be compatible
-petrel_client is not installed. If you read data locally instead of from ceph, ignore it.
-Replace train sampler!!
-petrel_client is not installed. Using PIL to load images.
-self.fs.append(self.filename, file_content, self.binary_mode)
-  File "/home/wangjiarui/anaconda3/envs/internvl/lib/python3.9/site-pack
\ No newline at end of file