diff --git "a/train.log" "b/train.log" new file mode 100644--- /dev/null +++ "b/train.log" @@ -0,0 +1,47263 @@ +2025-05-10 11:30:10 - INFO - llana.model.llana - Using nf2vec. +2025-05-10 11:30:10 - INFO - stdout - Loading nf2vec config from /leonardo_scratch/fast/IscrC_V2Text/dev/LLaNA_objanerf/llana/model/nf2vec/nf2vec_2layer.yaml. +2025-05-10 11:30:10 - INFO - llana.model.llana - nerf2vec output dim: 516. +2025-05-10 11:30:10 - INFO - llana.model.llana - Use 1 projection hiddent layers. +2025-05-10 11:30:10 - INFO - llana.model.llana - Each layer with [2048] hidden units. +2025-05-10 11:30:10 - INFO - stdout - Each layer with [2048] hidden units. +2025-05-10 11:30:10 - INFO - stdout - Vec projector output dim: 5120. +2025-05-10 11:30:10 - INFO - llana.model.llana - =========== VEC PROJ PARAMETERS ============= +2025-05-10 11:30:10 - INFO - llana.model.llana - Vec projector architecture: Sequential( + (0): Linear(in_features=516, out_features=2048, bias=True) + (1): GELU(approximate='none') + (2): Linear(in_features=2048, out_features=5120, bias=True) +) +2025-05-10 11:30:10 - INFO - llana.model.llana - Total number of parameters: 11,549,696 +2025-05-10 11:30:10 - INFO - llana.model.llana - ============================================= +2025-05-10 11:30:10 - INFO - llana.model.llana - =========== LLANA TOKENIZER ============= +2025-05-10 11:30:10 - INFO - llana.model.llana - Tokenizer: Embedding(32003, 5120, padding_idx=0) +2025-05-10 11:30:10 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 11:30:10 - INFO - llana.model.llana - ============================================= +2025-05-10 11:30:10 - INFO - llana.model.llana - =========== LM HEAD PARAMETERS ============= +2025-05-10 11:30:10 - INFO - llana.model.llana - lm_head architecture: Linear(in_features=5120, out_features=32003, bias=False) +2025-05-10 11:30:10 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 11:30:10 - INFO - llana.model.llana - ============================================= +2025-05-10 11:30:10 - ERROR - stderr - Loading checkpoint shards: 0%| | 0/11 [00:00', 'DEFAULT_POINT_PATCH_TOKEN': '', 'DEFAULT_POINT_START_TOKEN': '', 'mm_use_point_start_end': True, 'model_type': 'llana', 'nf2vec_config_name': 'nf2vec_2layer', 'point_backbone': 'nf2vec', 'point_backbone_ckpt': '', 'use_color': True, 'output_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/runs/May10_12-30-51_lrdn1121.leonardo.local', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'no', 'save_steps': 32860.0, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'llana_objanerf_13b_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': ['full_shard', 'auto_wrap'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'transformer_layer_cls_to_wrap': ['LlamaDecoderLayer'], 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': 'LlamaDecoderLayer', 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'cache_dir': None, 'model_max_length': 4096, 'model_debug': False, 'fix_llm': False, 'force_fsdp': False, 'tune_mm_mlp_adapter': True, 'stage_2': True, 'pretrained_mm_mlp_adapter': None, 'detatch_point_token': ''} +2025-05-10 12:34:51 - ERROR - stderr - 0%| | 0/8973 [00:00 4096). Running this sequence through the model will result in indexing errors +2025-05-10 15:21:38 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5986 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 15:21:49 - ERROR - stderr - 10%|█ | 901/8973 [2:46:58<26:12:54, 11.69s/it] +2025-05-10 15:21:49 - ERROR - stderr - +2025-05-10 15:21:49 - ERROR - stderr - +2025-05-10 15:21:49 - INFO - stdout - {'loss': 1.017, 'grad_norm': 0.794900119304657, 'learning_rate': 1.97417073784995e-05, 'epoch': 0.3} +2025-05-10 15:21:49 - ERROR - stderr - 10%|█ | 901/8973 [2:46:58<26:12:54, 11.69s/it] +2025-05-10 15:22:02 - ERROR - stderr - 10%|█ | 902/8973 [2:47:11<26:39:50, 11.89s/it] +2025-05-10 15:22:02 - ERROR - stderr - +2025-05-10 15:22:02 - ERROR - stderr - +2025-05-10 15:22:02 - INFO - stdout - {'loss': 1.0255, 'grad_norm': 0.9218485951423645, 'learning_rate': 1.9740891609376598e-05, 'epoch': 0.3} +2025-05-10 15:22:02 - ERROR - stderr - 10%|█ | 902/8973 [2:47:11<26:39:50, 11.89s/it] +2025-05-10 15:22:12 - ERROR - stderr - 10%|█ | 903/8973 [2:47:21<25:32:49, 11.40s/it] +2025-05-10 15:22:12 - ERROR - stderr - +2025-05-10 15:22:12 - ERROR - stderr - +2025-05-10 15:22:12 - INFO - stdout - {'loss': 0.9711, 'grad_norm': 0.7888413667678833, 'learning_rate': 1.974007457096479e-05, 'epoch': 0.3} +2025-05-10 15:22:12 - ERROR - stderr - 10%|█ | 903/8973 [2:47:21<25:32:49, 11.40s/it] +2025-05-10 15:22:24 - ERROR - stderr - 10%|█ | 904/8973 [2:47:33<26:12:24, 11.69s/it] +2025-05-10 15:22:24 - ERROR - stderr - +2025-05-10 15:22:24 - ERROR - stderr - +2025-05-10 15:22:24 - INFO - stdout - {'loss': 0.9616, 'grad_norm': 0.8873001337051392, 'learning_rate': 1.973925626337054e-05, 'epoch': 0.3} +2025-05-10 15:22:24 - ERROR - stderr - 10%|█ | 904/8973 [2:47:33<26:12:24, 11.69s/it] +2025-05-10 15:22:37 - ERROR - stderr - 10%|█ | 905/8973 [2:47:46<26:50:57, 11.98s/it] +2025-05-10 15:22:37 - ERROR - stderr - +2025-05-10 15:22:37 - ERROR - stderr - +2025-05-10 15:22:37 - INFO - stdout - {'loss': 0.9996, 'grad_norm': 0.8400043845176697, 'learning_rate': 1.9738436686700482e-05, 'epoch': 0.3} +2025-05-10 15:22:37 - ERROR - stderr - 10%|█ | 905/8973 [2:47:46<26:50:57, 11.98s/it] +2025-05-10 15:22:47 - ERROR - stderr - 10%|█ | 906/8973 [2:47:56<25:41:43, 11.47s/it] +2025-05-10 15:22:47 - ERROR - stderr - +2025-05-10 15:22:47 - ERROR - stderr - +2025-05-10 15:22:47 - INFO - stdout - {'loss': 1.0087, 'grad_norm': 0.9434440732002258, 'learning_rate': 1.9737615841061402e-05, 'epoch': 0.3} +2025-05-10 15:22:47 - ERROR - stderr - 10%|█ | 906/8973 [2:47:56<25:41:43, 11.47s/it] +2025-05-10 15:22:58 - ERROR - stderr - 10%|█ | 907/8973 [2:48:07<25:00:12, 11.16s/it] +2025-05-10 15:22:58 - ERROR - stderr - +2025-05-10 15:22:58 - ERROR - stderr - +2025-05-10 15:22:58 - INFO - stdout - {'loss': 1.0013, 'grad_norm': 0.9409571290016174, 'learning_rate': 1.9736793726560266e-05, 'epoch': 0.3} +2025-05-10 15:22:58 - ERROR - stderr - 10%|█ | 907/8973 [2:48:07<25:00:12, 11.16s/it] +2025-05-10 15:23:08 - ERROR - stderr - 10%|█ | 908/8973 [2:48:17<24:27:08, 10.91s/it] +2025-05-10 15:23:08 - ERROR - stderr - +2025-05-10 15:23:08 - ERROR - stderr - +2025-05-10 15:23:08 - INFO - stdout - {'loss': 0.9943, 'grad_norm': 0.906237006187439, 'learning_rate': 1.97359703433042e-05, 'epoch': 0.3} +2025-05-10 15:23:08 - ERROR - stderr - 10%|█ | 908/8973 [2:48:17<24:27:08, 10.91s/it] +2025-05-10 15:23:21 - ERROR - stderr - 10%|█ | 909/8973 [2:48:30<25:56:59, 11.58s/it] +2025-05-10 15:23:21 - ERROR - stderr - +2025-05-10 15:23:21 - ERROR - stderr - +2025-05-10 15:23:21 - INFO - stdout - {'loss': 0.9475, 'grad_norm': 0.8607956171035767, 'learning_rate': 1.9735145691400492e-05, 'epoch': 0.3} +2025-05-10 15:23:21 - ERROR - stderr - 10%|█ | 909/8973 [2:48:30<25:56:59, 11.58s/it] +2025-05-10 15:23:33 - ERROR - stderr - 10%|█ | 910/8973 [2:48:42<25:59:34, 11.61s/it] +2025-05-10 15:23:33 - ERROR - stderr - +2025-05-10 15:23:33 - ERROR - stderr - +2025-05-10 15:23:33 - INFO - stdout - {'loss': 0.955, 'grad_norm': 0.8654046654701233, 'learning_rate': 1.97343197709566e-05, 'epoch': 0.3} +2025-05-10 15:23:33 - ERROR - stderr - 10%|█ | 910/8973 [2:48:42<25:59:34, 11.61s/it] +2025-05-10 15:23:43 - ERROR - stderr - 10%|█ | 911/8973 [2:48:52<25:00:47, 11.17s/it] +2025-05-10 15:23:43 - ERROR - stderr - +2025-05-10 15:23:43 - ERROR - stderr - +2025-05-10 15:23:43 - INFO - stdout - {'loss': 0.9931, 'grad_norm': 0.8687731623649597, 'learning_rate': 1.973349258208015e-05, 'epoch': 0.3} +2025-05-10 15:23:43 - ERROR - stderr - 10%|█ | 911/8973 [2:48:52<25:00:47, 11.17s/it] +2025-05-10 15:23:54 - ERROR - stderr - 10%|█ | 912/8973 [2:49:02<24:38:37, 11.01s/it] +2025-05-10 15:23:54 - ERROR - stderr - +2025-05-10 15:23:54 - ERROR - stderr - +2025-05-10 15:23:54 - INFO - stdout - {'loss': 0.965, 'grad_norm': 0.8294503688812256, 'learning_rate': 1.973266412487892e-05, 'epoch': 0.3} +2025-05-10 15:23:54 - ERROR - stderr - 10%|█ | 912/8973 [2:49:03<24:38:37, 11.01s/it] +2025-05-10 15:24:04 - ERROR - stderr - 10%|█ | 913/8973 [2:49:13<24:05:40, 10.76s/it] +2025-05-10 15:24:04 - ERROR - stderr - +2025-05-10 15:24:04 - ERROR - stderr - +2025-05-10 15:24:04 - INFO - stdout - {'loss': 0.9922, 'grad_norm': 0.8054966330528259, 'learning_rate': 1.973183439946087e-05, 'epoch': 0.31} +2025-05-10 15:24:04 - ERROR - stderr - 10%|█ | 913/8973 [2:49:13<24:05:40, 10.76s/it] +2025-05-10 15:24:17 - ERROR - stderr - 10%|█ | 914/8973 [2:49:26<25:33:12, 11.41s/it] +2025-05-10 15:24:17 - ERROR - stderr - +2025-05-10 15:24:17 - ERROR - stderr - +2025-05-10 15:24:17 - INFO - stdout - {'loss': 0.9428, 'grad_norm': 0.8574280142784119, 'learning_rate': 1.9731003405934116e-05, 'epoch': 0.31} +2025-05-10 15:24:17 - ERROR - stderr - 10%|█ | 914/8973 [2:49:26<25:33:12, 11.41s/it] +2025-05-10 15:24:28 - ERROR - stderr - 10%|█ | 915/8973 [2:49:37<25:25:44, 11.36s/it] +2025-05-10 15:24:28 - ERROR - stderr - +2025-05-10 15:24:28 - ERROR - stderr - +2025-05-10 15:24:28 - INFO - stdout - {'loss': 0.982, 'grad_norm': 0.9882853627204895, 'learning_rate': 1.9730171144406934e-05, 'epoch': 0.31} +2025-05-10 15:24:28 - ERROR - stderr - 10%|█ | 915/8973 [2:49:37<25:25:44, 11.36s/it] +2025-05-10 15:24:38 - ERROR - stderr - 10%|█ | 916/8973 [2:49:47<24:43:52, 11.05s/it] +2025-05-10 15:24:38 - ERROR - stderr - +2025-05-10 15:24:38 - ERROR - stderr - +2025-05-10 15:24:38 - INFO - stdout - {'loss': 1.0437, 'grad_norm': 0.9476802349090576, 'learning_rate': 1.9729337614987782e-05, 'epoch': 0.31} +2025-05-10 15:24:38 - ERROR - stderr - 10%|█ | 916/8973 [2:49:47<24:43:52, 11.05s/it] +2025-05-10 15:24:49 - ERROR - stderr - 10%|█ | 917/8973 [2:49:58<24:23:44, 10.90s/it] +2025-05-10 15:24:49 - ERROR - stderr - +2025-05-10 15:24:49 - ERROR - stderr - +2025-05-10 15:24:49 - INFO - stdout - {'loss': 0.9801, 'grad_norm': 0.8160154223442078, 'learning_rate': 1.972850281778527e-05, 'epoch': 0.31} +2025-05-10 15:24:49 - ERROR - stderr - 10%|█ | 917/8973 [2:49:58<24:23:44, 10.90s/it] +2025-05-10 15:25:01 - ERROR - stderr - 10%|█ | 918/8973 [2:50:10<25:07:15, 11.23s/it] +2025-05-10 15:25:01 - ERROR - stderr - +2025-05-10 15:25:01 - ERROR - stderr - +2025-05-10 15:25:01 - INFO - stdout - {'loss': 0.9897, 'grad_norm': 0.8500818610191345, 'learning_rate': 1.9727666752908174e-05, 'epoch': 0.31} +2025-05-10 15:25:01 - ERROR - stderr - 10%|█ | 918/8973 [2:50:10<25:07:15, 11.23s/it] +2025-05-10 15:25:14 - ERROR - stderr - 10%|█ | 919/8973 [2:50:23<26:13:35, 11.72s/it] +2025-05-10 15:25:14 - ERROR - stderr - +2025-05-10 15:25:14 - ERROR - stderr - +2025-05-10 15:25:14 - INFO - stdout - {'loss': 0.9737, 'grad_norm': 0.839537501335144, 'learning_rate': 1.972682942046544e-05, 'epoch': 0.31} +2025-05-10 15:25:14 - ERROR - stderr - 10%|█ | 919/8973 [2:50:23<26:13:35, 11.72s/it] +2025-05-10 15:25:25 - ERROR - stderr - 10%|█ | 920/8973 [2:50:34<25:40:28, 11.48s/it] +2025-05-10 15:25:25 - ERROR - stderr - +2025-05-10 15:25:25 - ERROR - stderr - +2025-05-10 15:25:25 - INFO - stdout - {'loss': 0.9383, 'grad_norm': 0.8144556283950806, 'learning_rate': 1.9725990820566173e-05, 'epoch': 0.31} +2025-05-10 15:25:25 - ERROR - stderr - 10%|█ | 920/8973 [2:50:34<25:40:28, 11.48s/it] +2025-05-10 15:25:35 - ERROR - stderr - 10%|█ | 921/8973 [2:50:44<24:59:55, 11.18s/it] +2025-05-10 15:25:35 - ERROR - stderr - +2025-05-10 15:25:35 - ERROR - stderr - +2025-05-10 15:25:35 - INFO - stdout - {'loss': 0.9702, 'grad_norm': 0.8753960132598877, 'learning_rate': 1.9725150953319653e-05, 'epoch': 0.31} +2025-05-10 15:25:35 - ERROR - stderr - 10%|█ | 921/8973 [2:50:44<24:59:55, 11.18s/it] +2025-05-10 15:25:45 - ERROR - stderr - 10%|█ | 922/8973 [2:50:54<24:21:37, 10.89s/it] +2025-05-10 15:25:45 - ERROR - stderr - +2025-05-10 15:25:45 - ERROR - stderr - +2025-05-10 15:25:45 - INFO - stdout - {'loss': 1.0463, 'grad_norm': 0.9546355605125427, 'learning_rate': 1.9724309818835313e-05, 'epoch': 0.31} +2025-05-10 15:25:45 - ERROR - stderr - 10%|█ | 922/8973 [2:50:54<24:21:37, 10.89s/it] +2025-05-10 15:25:58 - ERROR - stderr - 10%|█ | 923/8973 [2:51:07<25:40:36, 11.48s/it] +2025-05-10 15:25:58 - ERROR - stderr - +2025-05-10 15:25:58 - ERROR - stderr - +2025-05-10 15:25:58 - INFO - stdout - {'loss': 0.992, 'grad_norm': 0.8249627351760864, 'learning_rate': 1.9723467417222763e-05, 'epoch': 0.31} +2025-05-10 15:25:58 - ERROR - stderr - 10%|█ | 923/8973 [2:51:07<25:40:36, 11.48s/it] +2025-05-10 15:26:10 - ERROR - stderr - 10%|█ | 924/8973 [2:51:18<25:37:56, 11.46s/it] +2025-05-10 15:26:10 - ERROR - stderr - +2025-05-10 15:26:10 - ERROR - stderr - +2025-05-10 15:26:10 - INFO - stdout - {'loss': 1.0593, 'grad_norm': 0.9125091433525085, 'learning_rate': 1.9722623748591764e-05, 'epoch': 0.31} +2025-05-10 15:26:10 - ERROR - stderr - 10%|█ | 924/8973 [2:51:19<25:37:56, 11.46s/it] +2025-05-10 15:26:20 - ERROR - stderr - 10%|█ | 925/8973 [2:51:29<25:04:28, 11.22s/it] +2025-05-10 15:26:20 - ERROR - stderr - +2025-05-10 15:26:20 - ERROR - stderr - +2025-05-10 15:26:20 - INFO - stdout - {'loss': 0.9784, 'grad_norm': 0.8537651896476746, 'learning_rate': 1.972177881305226e-05, 'epoch': 0.31} +2025-05-10 15:26:20 - ERROR - stderr - 10%|█ | 925/8973 [2:51:29<25:04:28, 11.22s/it] +2025-05-10 15:26:31 - ERROR - stderr - 10%|█ | 926/8973 [2:51:39<24:27:44, 10.94s/it] +2025-05-10 15:26:31 - ERROR - stderr - +2025-05-10 15:26:31 - ERROR - stderr - +2025-05-10 15:26:31 - INFO - stdout - {'loss': 0.9978, 'grad_norm': 0.8404205441474915, 'learning_rate': 1.9720932610714343e-05, 'epoch': 0.31} +2025-05-10 15:26:31 - ERROR - stderr - 10%|█ | 926/8973 [2:51:39<24:27:44, 10.94s/it] +2025-05-10 15:26:41 - ERROR - stderr - 10%|█ | 927/8973 [2:51:50<24:02:43, 10.76s/it] +2025-05-10 15:26:41 - ERROR - stderr - +2025-05-10 15:26:41 - ERROR - stderr - +2025-05-10 15:26:41 - INFO - stdout - {'loss': 1.0267, 'grad_norm': 0.8633952140808105, 'learning_rate': 1.9720085141688287e-05, 'epoch': 0.31} +2025-05-10 15:26:41 - ERROR - stderr - 10%|█ | 927/8973 [2:51:50<24:02:43, 10.76s/it] +2025-05-10 15:26:54 - ERROR - stderr - 10%|█ | 928/8973 [2:52:03<25:48:11, 11.55s/it] +2025-05-10 15:26:54 - ERROR - stderr - +2025-05-10 15:26:54 - ERROR - stderr - +2025-05-10 15:26:54 - INFO - stdout - {'loss': 1.0582, 'grad_norm': 0.955235481262207, 'learning_rate': 1.971923640608451e-05, 'epoch': 0.31} +2025-05-10 15:26:54 - ERROR - stderr - 10%|█ | 928/8973 [2:52:03<25:48:11, 11.55s/it] +2025-05-10 15:27:05 - ERROR - stderr - 10%|█ | 929/8973 [2:52:14<25:18:26, 11.33s/it] +2025-05-10 15:27:05 - ERROR - stderr - +2025-05-10 15:27:05 - ERROR - stderr - +2025-05-10 15:27:05 - INFO - stdout - {'loss': 0.992, 'grad_norm': 0.9115554094314575, 'learning_rate': 1.9718386404013614e-05, 'epoch': 0.31} +2025-05-10 15:27:05 - ERROR - stderr - 10%|█ | 929/8973 [2:52:14<25:18:26, 11.33s/it] +2025-05-10 15:27:16 - ERROR - stderr - 10%|█ | 930/8973 [2:52:24<24:46:16, 11.09s/it] +2025-05-10 15:27:16 - ERROR - stderr - +2025-05-10 15:27:16 - ERROR - stderr - +2025-05-10 15:27:16 - INFO - stdout - {'loss': 0.9501, 'grad_norm': 0.8799459338188171, 'learning_rate': 1.9717535135586354e-05, 'epoch': 0.31} +2025-05-10 15:27:16 - ERROR - stderr - 10%|█ | 930/8973 [2:52:25<24:46:16, 11.09s/it] +2025-05-10 15:27:27 - ERROR - stderr - 10%|█ | 931/8973 [2:52:36<24:46:15, 11.09s/it] +2025-05-10 15:27:27 - ERROR - stderr - +2025-05-10 15:27:27 - ERROR - stderr - +2025-05-10 15:27:27 - INFO - stdout - {'loss': 1.0335, 'grad_norm': 0.8597149848937988, 'learning_rate': 1.971668260091366e-05, 'epoch': 0.31} +2025-05-10 15:27:27 - ERROR - stderr - 10%|█ | 931/8973 [2:52:36<24:46:15, 11.09s/it] +2025-05-10 15:27:39 - ERROR - stderr - 10%|█ | 932/8973 [2:52:48<25:46:14, 11.54s/it] +2025-05-10 15:27:39 - ERROR - stderr - +2025-05-10 15:27:39 - ERROR - stderr - +2025-05-10 15:27:39 - INFO - stdout - {'loss': 0.9997, 'grad_norm': 0.8992306590080261, 'learning_rate': 1.971582880010662e-05, 'epoch': 0.31} +2025-05-10 15:27:39 - ERROR - stderr - 10%|█ | 932/8973 [2:52:48<25:46:14, 11.54s/it] +2025-05-10 15:27:52 - ERROR - stderr - 10%|█ | 933/8973 [2:53:01<26:41:20, 11.95s/it] +2025-05-10 15:27:52 - ERROR - stderr - +2025-05-10 15:27:52 - ERROR - stderr - +2025-05-10 15:27:52 - INFO - stdout - {'loss': 1.0799, 'grad_norm': 0.9135088920593262, 'learning_rate': 1.9714973733276486e-05, 'epoch': 0.31} +2025-05-10 15:27:52 - ERROR - stderr - 10%|█ | 933/8973 [2:53:01<26:41:20, 11.95s/it] +2025-05-10 15:28:03 - ERROR - stderr - 10%|█ | 934/8973 [2:53:11<25:38:11, 11.48s/it] +2025-05-10 15:28:03 - ERROR - stderr - +2025-05-10 15:28:03 - ERROR - stderr - +2025-05-10 15:28:03 - INFO - stdout - {'loss': 0.9595, 'grad_norm': 0.8543103933334351, 'learning_rate': 1.971411740053468e-05, 'epoch': 0.31} +2025-05-10 15:28:03 - ERROR - stderr - 10%|█ | 934/8973 [2:53:11<25:38:11, 11.48s/it] +2025-05-10 15:28:13 - ERROR - stderr - 10%|█ | 935/8973 [2:53:22<24:58:37, 11.19s/it] +2025-05-10 15:28:13 - ERROR - stderr - +2025-05-10 15:28:13 - ERROR - stderr - +2025-05-10 15:28:13 - INFO - stdout - {'loss': 0.982, 'grad_norm': 0.9316294193267822, 'learning_rate': 1.9713259801992784e-05, 'epoch': 0.31} +2025-05-10 15:28:13 - ERROR - stderr - 10%|█ | 935/8973 [2:53:22<24:58:37, 11.19s/it] +2025-05-10 15:28:13 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 15:28:13 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 15:28:23 - ERROR - stderr - 10%|█ | 936/8973 [2:53:32<24:20:43, 10.91s/it] +2025-05-10 15:28:23 - ERROR - stderr - +2025-05-10 15:28:23 - ERROR - stderr - +2025-05-10 15:28:23 - INFO - stdout - {'loss': 1.0204, 'grad_norm': 0.8781898021697998, 'learning_rate': 1.9712400937762554e-05, 'epoch': 0.31} +2025-05-10 15:28:23 - ERROR - stderr - 10%|█ | 936/8973 [2:53:32<24:20:43, 10.91s/it] +2025-05-10 15:28:39 - ERROR - stderr - 10%|█ | 937/8973 [2:53:47<27:14:31, 12.20s/it] +2025-05-10 15:28:39 - ERROR - stderr - +2025-05-10 15:28:39 - ERROR - stderr - +2025-05-10 15:28:39 - INFO - stdout - {'loss': 0.9369, 'grad_norm': 0.7610090970993042, 'learning_rate': 1.9711540807955897e-05, 'epoch': 0.31} +2025-05-10 15:28:39 - ERROR - stderr - 10%|█ | 937/8973 [2:53:47<27:14:31, 12.20s/it] +2025-05-10 15:28:50 - ERROR - stderr - 10%|█ | 938/8973 [2:53:58<26:25:22, 11.84s/it] +2025-05-10 15:28:50 - ERROR - stderr - +2025-05-10 15:28:50 - ERROR - stderr - +2025-05-10 15:28:50 - INFO - stdout - {'loss': 1.0069, 'grad_norm': 0.9061853289604187, 'learning_rate': 1.9710679412684898e-05, 'epoch': 0.31} +2025-05-10 15:28:50 - ERROR - stderr - 10%|█ | 938/8973 [2:53:58<26:25:22, 11.84s/it] +2025-05-10 15:29:00 - ERROR - stderr - 10%|█ | 939/8973 [2:54:09<25:19:20, 11.35s/it] +2025-05-10 15:29:00 - ERROR - stderr - +2025-05-10 15:29:00 - ERROR - stderr - +2025-05-10 15:29:00 - INFO - stdout - {'loss': 0.9574, 'grad_norm': 0.8868436217308044, 'learning_rate': 1.9709816752061797e-05, 'epoch': 0.31} +2025-05-10 15:29:00 - ERROR - stderr - 10%|█ | 939/8973 [2:54:09<25:19:20, 11.35s/it] +2025-05-10 15:29:10 - ERROR - stderr - 10%|█ | 940/8973 [2:54:19<24:46:27, 11.10s/it] +2025-05-10 15:29:10 - ERROR - stderr - +2025-05-10 15:29:10 - ERROR - stderr - +2025-05-10 15:29:10 - INFO - stdout - {'loss': 0.9291, 'grad_norm': 0.8408994078636169, 'learning_rate': 1.9708952826199002e-05, 'epoch': 0.31} +2025-05-10 15:29:10 - ERROR - stderr - 10%|█ | 940/8973 [2:54:19<24:46:27, 11.10s/it] +2025-05-10 15:29:21 - ERROR - stderr - 10%|█ | 941/8973 [2:54:30<24:19:53, 10.91s/it] +2025-05-10 15:29:21 - ERROR - stderr - +2025-05-10 15:29:21 - ERROR - stderr - +2025-05-10 15:29:21 - INFO - stdout - {'loss': 1.0372, 'grad_norm': 0.8759669065475464, 'learning_rate': 1.9708087635209096e-05, 'epoch': 0.31} +2025-05-10 15:29:21 - ERROR - stderr - 10%|█ | 941/8973 [2:54:30<24:19:53, 10.91s/it] +2025-05-10 15:29:34 - ERROR - stderr - 10%|█ | 942/8973 [2:54:42<25:30:19, 11.43s/it] +2025-05-10 15:29:34 - ERROR - stderr - +2025-05-10 15:29:34 - ERROR - stderr - +2025-05-10 15:29:34 - INFO - stdout - {'loss': 0.9928, 'grad_norm': 0.8475032448768616, 'learning_rate': 1.970722117920481e-05, 'epoch': 0.31} +2025-05-10 15:29:34 - ERROR - stderr - 10%|█ | 942/8973 [2:54:42<25:30:19, 11.43s/it] +2025-05-10 15:29:44 - ERROR - stderr - 11%|█ | 943/8973 [2:54:53<24:56:41, 11.18s/it] +2025-05-10 15:29:44 - ERROR - stderr - +2025-05-10 15:29:44 - ERROR - stderr - +2025-05-10 15:29:44 - INFO - stdout - {'loss': 1.0934, 'grad_norm': 0.898529052734375, 'learning_rate': 1.9706353458299048e-05, 'epoch': 0.32} +2025-05-10 15:29:44 - ERROR - stderr - 11%|█ | 943/8973 [2:54:53<24:56:41, 11.18s/it] +2025-05-10 15:29:55 - ERROR - stderr - 11%|█ | 944/8973 [2:55:03<24:28:02, 10.97s/it] +2025-05-10 15:29:55 - ERROR - stderr - +2025-05-10 15:29:55 - ERROR - stderr - +2025-05-10 15:29:55 - INFO - stdout - {'loss': 0.9508, 'grad_norm': 0.8242591023445129, 'learning_rate': 1.970548447260488e-05, 'epoch': 0.32} +2025-05-10 15:29:55 - ERROR - stderr - 11%|█ | 944/8973 [2:55:03<24:28:02, 10.97s/it] +2025-05-10 15:30:05 - ERROR - stderr - 11%|█ | 945/8973 [2:55:14<23:55:37, 10.73s/it] +2025-05-10 15:30:05 - ERROR - stderr - +2025-05-10 15:30:05 - ERROR - stderr - +2025-05-10 15:30:05 - INFO - stdout - {'loss': 0.9571, 'grad_norm': 0.8828511238098145, 'learning_rate': 1.9704614222235542e-05, 'epoch': 0.32} +2025-05-10 15:30:05 - ERROR - stderr - 11%|█ | 945/8973 [2:55:14<23:55:37, 10.73s/it] +2025-05-10 15:30:17 - ERROR - stderr - 11%|█ | 946/8973 [2:55:26<25:11:43, 11.30s/it] +2025-05-10 15:30:17 - ERROR - stderr - +2025-05-10 15:30:17 - ERROR - stderr - +2025-05-10 15:30:17 - INFO - stdout - {'loss': 0.9948, 'grad_norm': 0.8768924474716187, 'learning_rate': 1.9703742707304428e-05, 'epoch': 0.32} +2025-05-10 15:30:17 - ERROR - stderr - 11%|█ | 946/8973 [2:55:26<25:11:43, 11.30s/it] +2025-05-10 15:30:30 - ERROR - stderr - 11%|█ | 947/8973 [2:55:38<25:52:16, 11.60s/it] +2025-05-10 15:30:30 - ERROR - stderr - +2025-05-10 15:30:30 - ERROR - stderr - +2025-05-10 15:30:30 - INFO - stdout - {'loss': 1.012, 'grad_norm': 0.8347226977348328, 'learning_rate': 1.9702869927925105e-05, 'epoch': 0.32} +2025-05-10 15:30:30 - ERROR - stderr - 11%|█ | 947/8973 [2:55:38<25:52:16, 11.60s/it] +2025-05-10 15:30:40 - ERROR - stderr - 11%|█ | 948/8973 [2:55:49<25:11:44, 11.30s/it] +2025-05-10 15:30:40 - ERROR - stderr - +2025-05-10 15:30:40 - ERROR - stderr - +2025-05-10 15:30:40 - INFO - stdout - {'loss': 1.0894, 'grad_norm': 0.8674705028533936, 'learning_rate': 1.9701995884211295e-05, 'epoch': 0.32} +2025-05-10 15:30:40 - ERROR - stderr - 11%|█ | 948/8973 [2:55:49<25:11:44, 11.30s/it] +2025-05-10 15:30:51 - ERROR - stderr - 11%|█ | 949/8973 [2:56:00<24:44:42, 11.10s/it] +2025-05-10 15:30:51 - ERROR - stderr - +2025-05-10 15:30:51 - ERROR - stderr - +2025-05-10 15:30:51 - INFO - stdout - {'loss': 0.9666, 'grad_norm': 0.8951334953308105, 'learning_rate': 1.97011205762769e-05, 'epoch': 0.32} +2025-05-10 15:30:51 - ERROR - stderr - 11%|█ | 949/8973 [2:56:00<24:44:42, 11.10s/it] +2025-05-10 15:31:01 - ERROR - stderr - 11%|█ | 950/8973 [2:56:10<24:11:26, 10.85s/it] +2025-05-10 15:31:01 - ERROR - stderr - +2025-05-10 15:31:01 - ERROR - stderr - +2025-05-10 15:31:01 - INFO - stdout - {'loss': 1.0108, 'grad_norm': 0.8717060089111328, 'learning_rate': 1.9700244004235963e-05, 'epoch': 0.32} +2025-05-10 15:31:01 - ERROR - stderr - 11%|█ | 950/8973 [2:56:10<24:11:26, 10.85s/it] +2025-05-10 15:31:14 - ERROR - stderr - 11%|█ | 951/8973 [2:56:23<25:41:11, 11.53s/it] +2025-05-10 15:31:14 - ERROR - stderr - +2025-05-10 15:31:14 - ERROR - stderr - +2025-05-10 15:31:14 - INFO - stdout - {'loss': 0.9131, 'grad_norm': 0.83205246925354, 'learning_rate': 1.9699366168202715e-05, 'epoch': 0.32} +2025-05-10 15:31:14 - ERROR - stderr - 11%|█ | 951/8973 [2:56:23<25:41:11, 11.53s/it] +2025-05-10 15:31:27 - ERROR - stderr - 11%|█ | 952/8973 [2:56:36<26:20:41, 11.82s/it] +2025-05-10 15:31:27 - ERROR - stderr - +2025-05-10 15:31:27 - ERROR - stderr - +2025-05-10 15:31:27 - INFO - stdout - {'loss': 1.0555, 'grad_norm': 0.8620781302452087, 'learning_rate': 1.9698487068291546e-05, 'epoch': 0.32} +2025-05-10 15:31:27 - ERROR - stderr - 11%|█ | 952/8973 [2:56:36<26:20:41, 11.82s/it] +2025-05-10 15:31:37 - ERROR - stderr - 11%|█ | 953/8973 [2:56:46<25:12:07, 11.31s/it] +2025-05-10 15:31:37 - ERROR - stderr - +2025-05-10 15:31:37 - ERROR - stderr - +2025-05-10 15:31:37 - INFO - stdout - {'loss': 1.0079, 'grad_norm': 0.8906900882720947, 'learning_rate': 1.9697606704617e-05, 'epoch': 0.32} +2025-05-10 15:31:37 - ERROR - stderr - 11%|█ | 953/8973 [2:56:46<25:12:07, 11.31s/it] +2025-05-10 15:31:47 - ERROR - stderr - 11%|█ | 954/8973 [2:56:56<24:31:24, 11.01s/it] +2025-05-10 15:31:47 - ERROR - stderr - +2025-05-10 15:31:47 - ERROR - stderr - +2025-05-10 15:31:47 - INFO - stdout - {'loss': 1.0294, 'grad_norm': 0.8870159387588501, 'learning_rate': 1.9696725077293795e-05, 'epoch': 0.32} +2025-05-10 15:31:47 - ERROR - stderr - 11%|█ | 954/8973 [2:56:56<24:31:24, 11.01s/it] +2025-05-10 15:31:58 - ERROR - stderr - 11%|█ | 955/8973 [2:57:07<24:14:54, 10.89s/it] +2025-05-10 15:31:58 - ERROR - stderr - +2025-05-10 15:31:58 - ERROR - stderr - +2025-05-10 15:31:58 - INFO - stdout - {'loss': 0.9591, 'grad_norm': 0.8525518774986267, 'learning_rate': 1.9695842186436817e-05, 'epoch': 0.32} +2025-05-10 15:31:58 - ERROR - stderr - 11%|█ | 955/8973 [2:57:07<24:14:54, 10.89s/it] +2025-05-10 15:32:11 - ERROR - stderr - 11%|█ | 956/8973 [2:57:20<25:37:17, 11.51s/it] +2025-05-10 15:32:11 - ERROR - stderr - +2025-05-10 15:32:11 - ERROR - stderr - +2025-05-10 15:32:11 - INFO - stdout - {'loss': 1.0237, 'grad_norm': 0.8714250326156616, 'learning_rate': 1.9694958032161105e-05, 'epoch': 0.32} +2025-05-10 15:32:11 - ERROR - stderr - 11%|█ | 956/8973 [2:57:20<25:37:17, 11.51s/it] +2025-05-10 15:32:22 - ERROR - stderr - 11%|█ | 957/8973 [2:57:30<25:05:40, 11.27s/it] +2025-05-10 15:32:22 - ERROR - stderr - +2025-05-10 15:32:22 - ERROR - stderr - +2025-05-10 15:32:22 - INFO - stdout - {'loss': 1.0272, 'grad_norm': 0.8914247751235962, 'learning_rate': 1.9694072614581867e-05, 'epoch': 0.32} +2025-05-10 15:32:22 - ERROR - stderr - 11%|█ | 957/8973 [2:57:30<25:05:40, 11.27s/it] +2025-05-10 15:32:32 - ERROR - stderr - 11%|█ | 958/8973 [2:57:41<24:46:11, 11.13s/it] +2025-05-10 15:32:32 - ERROR - stderr - +2025-05-10 15:32:32 - ERROR - stderr - +2025-05-10 15:32:32 - INFO - stdout - {'loss': 0.9926, 'grad_norm': 0.8356149196624756, 'learning_rate': 1.9693185933814483e-05, 'epoch': 0.32} +2025-05-10 15:32:32 - ERROR - stderr - 11%|█ | 958/8973 [2:57:41<24:46:11, 11.13s/it] +2025-05-10 15:32:42 - ERROR - stderr - 11%|█ | 959/8973 [2:57:51<24:07:12, 10.84s/it] +2025-05-10 15:32:42 - ERROR - stderr - +2025-05-10 15:32:42 - ERROR - stderr - +2025-05-10 15:32:42 - INFO - stdout - {'loss': 0.9419, 'grad_norm': 0.8588907718658447, 'learning_rate': 1.969229798997449e-05, 'epoch': 0.32} +2025-05-10 15:32:42 - ERROR - stderr - 11%|█ | 959/8973 [2:57:51<24:07:12, 10.84s/it] +2025-05-10 15:32:55 - ERROR - stderr - 11%|█ | 960/8973 [2:58:04<25:08:52, 11.30s/it] +2025-05-10 15:32:55 - ERROR - stderr - +2025-05-10 15:32:55 - ERROR - stderr - +2025-05-10 15:32:55 - INFO - stdout - {'loss': 0.9901, 'grad_norm': 0.8566515445709229, 'learning_rate': 1.9691408783177594e-05, 'epoch': 0.32} +2025-05-10 15:32:55 - ERROR - stderr - 11%|█ | 960/8973 [2:58:04<25:08:52, 11.30s/it] +2025-05-10 15:33:07 - ERROR - stderr - 11%|█ | 961/8973 [2:58:16<25:33:03, 11.48s/it] +2025-05-10 15:33:07 - ERROR - stderr - +2025-05-10 15:33:07 - ERROR - stderr - +2025-05-10 15:33:07 - INFO - stdout - {'loss': 1.0142, 'grad_norm': 0.8818102478981018, 'learning_rate': 1.969051831353966e-05, 'epoch': 0.32} +2025-05-10 15:33:07 - ERROR - stderr - 11%|█ | 961/8973 [2:58:16<25:33:03, 11.48s/it] +2025-05-10 15:33:17 - ERROR - stderr - 11%|█ | 962/8973 [2:58:26<24:54:03, 11.19s/it] +2025-05-10 15:33:17 - ERROR - stderr - +2025-05-10 15:33:17 - ERROR - stderr - +2025-05-10 15:33:17 - INFO - stdout - {'loss': 0.9809, 'grad_norm': 0.8546748161315918, 'learning_rate': 1.9689626581176725e-05, 'epoch': 0.32} +2025-05-10 15:33:17 - ERROR - stderr - 11%|█ | 962/8973 [2:58:26<24:54:03, 11.19s/it] +2025-05-10 15:33:28 - ERROR - stderr - 11%|█ | 963/8973 [2:58:36<24:19:30, 10.93s/it] +2025-05-10 15:33:28 - ERROR - stderr - +2025-05-10 15:33:28 - ERROR - stderr - +2025-05-10 15:33:28 - INFO - stdout - {'loss': 0.903, 'grad_norm': 0.8557462692260742, 'learning_rate': 1.9688733586204977e-05, 'epoch': 0.32} +2025-05-10 15:33:28 - ERROR - stderr - 11%|█ | 963/8973 [2:58:36<24:19:30, 10.93s/it] +2025-05-10 15:33:38 - ERROR - stderr - 11%|█ | 964/8973 [2:58:47<23:47:44, 10.70s/it] +2025-05-10 15:33:38 - ERROR - stderr - +2025-05-10 15:33:38 - ERROR - stderr - +2025-05-10 15:33:38 - INFO - stdout - {'loss': 1.013, 'grad_norm': 0.8894135355949402, 'learning_rate': 1.9687839328740786e-05, 'epoch': 0.32} +2025-05-10 15:33:38 - ERROR - stderr - 11%|█ | 964/8973 [2:58:47<23:47:44, 10.70s/it] +2025-05-10 15:33:51 - ERROR - stderr - 11%|█ | 965/8973 [2:58:59<25:16:26, 11.36s/it] +2025-05-10 15:33:51 - ERROR - stderr - +2025-05-10 15:33:51 - ERROR - stderr - +2025-05-10 15:33:51 - INFO - stdout - {'loss': 1.0221, 'grad_norm': 0.892736554145813, 'learning_rate': 1.9686943808900683e-05, 'epoch': 0.32} +2025-05-10 15:33:51 - ERROR - stderr - 11%|█ | 965/8973 [2:58:59<25:16:26, 11.36s/it] +2025-05-10 15:34:03 - ERROR - stderr - 11%|█ | 966/8973 [2:59:11<25:36:19, 11.51s/it] +2025-05-10 15:34:03 - ERROR - stderr - +2025-05-10 15:34:03 - ERROR - stderr - +2025-05-10 15:34:03 - INFO - stdout - {'loss': 1.0037, 'grad_norm': 0.8614006638526917, 'learning_rate': 1.968604702680135e-05, 'epoch': 0.32} +2025-05-10 15:34:03 - ERROR - stderr - 11%|█ | 966/8973 [2:59:11<25:36:19, 11.51s/it] +2025-05-10 15:34:13 - ERROR - stderr - 11%|█ | 967/8973 [2:59:22<24:53:06, 11.19s/it] +2025-05-10 15:34:13 - ERROR - stderr - +2025-05-10 15:34:13 - ERROR - stderr - +2025-05-10 15:34:13 - INFO - stdout - {'loss': 0.968, 'grad_norm': 0.9357677102088928, 'learning_rate': 1.968514898255964e-05, 'epoch': 0.32} +2025-05-10 15:34:13 - ERROR - stderr - 11%|█ | 967/8973 [2:59:22<24:53:06, 11.19s/it] +2025-05-10 15:34:23 - ERROR - stderr - 11%|█ | 968/8973 [2:59:32<24:14:40, 10.90s/it] +2025-05-10 15:34:23 - ERROR - stderr - +2025-05-10 15:34:23 - ERROR - stderr - +2025-05-10 15:34:23 - INFO - stdout - {'loss': 0.9872, 'grad_norm': 0.8847966194152832, 'learning_rate': 1.968424967629258e-05, 'epoch': 0.32} +2025-05-10 15:34:23 - ERROR - stderr - 11%|█ | 968/8973 [2:59:32<24:14:40, 10.90s/it] +2025-05-10 15:34:34 - ERROR - stderr - 11%|█ | 969/8973 [2:59:42<23:54:41, 10.75s/it] +2025-05-10 15:34:34 - ERROR - stderr - +2025-05-10 15:34:34 - ERROR - stderr - +2025-05-10 15:34:34 - INFO - stdout - {'loss': 1.0339, 'grad_norm': 0.9192578196525574, 'learning_rate': 1.9683349108117355e-05, 'epoch': 0.32} +2025-05-10 15:34:34 - ERROR - stderr - 11%|█ | 969/8973 [2:59:42<23:54:41, 10.75s/it] +2025-05-10 15:34:46 - ERROR - stderr - 11%|█ | 970/8973 [2:59:55<25:02:50, 11.27s/it] +2025-05-10 15:34:46 - ERROR - stderr - +2025-05-10 15:34:46 - ERROR - stderr - +2025-05-10 15:34:46 - INFO - stdout - {'loss': 0.947, 'grad_norm': 0.8805443048477173, 'learning_rate': 1.968244727815131e-05, 'epoch': 0.32} +2025-05-10 15:34:46 - ERROR - stderr - 11%|█ | 970/8973 [2:59:55<25:02:50, 11.27s/it] +2025-05-10 15:34:56 - ERROR - stderr - 11%|█ | 971/8973 [3:00:05<24:20:06, 10.95s/it] +2025-05-10 15:34:56 - ERROR - stderr - +2025-05-10 15:34:56 - ERROR - stderr - +2025-05-10 15:34:56 - INFO - stdout - {'loss': 0.9037, 'grad_norm': 0.8353927731513977, 'learning_rate': 1.9681544186511957e-05, 'epoch': 0.32} +2025-05-10 15:34:56 - ERROR - stderr - 11%|█ | 971/8973 [3:00:05<24:20:06, 10.95s/it] +2025-05-10 15:35:07 - ERROR - stderr - 11%|█ | 972/8973 [3:00:15<23:59:49, 10.80s/it] +2025-05-10 15:35:07 - ERROR - stderr - +2025-05-10 15:35:07 - ERROR - stderr - +2025-05-10 15:35:07 - INFO - stdout - {'loss': 0.981, 'grad_norm': 0.864736020565033, 'learning_rate': 1.9680639833316976e-05, 'epoch': 0.32} +2025-05-10 15:35:07 - ERROR - stderr - 11%|█ | 972/8973 [3:00:15<23:59:49, 10.80s/it] +2025-05-10 15:35:17 - ERROR - stderr - 11%|█ | 973/8973 [3:00:26<23:40:28, 10.65s/it] +2025-05-10 15:35:17 - ERROR - stderr - +2025-05-10 15:35:17 - ERROR - stderr - +2025-05-10 15:35:17 - INFO - stdout - {'loss': 1.0105, 'grad_norm': 0.8510645031929016, 'learning_rate': 1.967973421868421e-05, 'epoch': 0.33} +2025-05-10 15:35:17 - ERROR - stderr - 11%|█ | 973/8973 [3:00:26<23:40:28, 10.65s/it] +2025-05-10 15:35:30 - ERROR - stderr - 11%|█ | 974/8973 [3:00:38<24:55:58, 11.22s/it] +2025-05-10 15:35:30 - ERROR - stderr - +2025-05-10 15:35:30 - ERROR - stderr - +2025-05-10 15:35:30 - INFO - stdout - {'loss': 0.9449, 'grad_norm': 0.8305413722991943, 'learning_rate': 1.967882734273166e-05, 'epoch': 0.33} +2025-05-10 15:35:30 - ERROR - stderr - 11%|█ | 974/8973 [3:00:38<24:55:58, 11.22s/it] +2025-05-10 15:35:42 - ERROR - stderr - 11%|█ | 975/8973 [3:00:51<25:48:19, 11.62s/it] +2025-05-10 15:35:42 - ERROR - stderr - +2025-05-10 15:35:42 - ERROR - stderr - +2025-05-10 15:35:42 - INFO - stdout - {'loss': 1.0578, 'grad_norm': 0.8541348576545715, 'learning_rate': 1.9677919205577504e-05, 'epoch': 0.33} +2025-05-10 15:35:42 - ERROR - stderr - 11%|█ | 975/8973 [3:00:51<25:48:19, 11.62s/it] +2025-05-10 15:35:52 - ERROR - stderr - 11%|█ | 976/8973 [3:01:01<24:53:25, 11.20s/it] +2025-05-10 15:35:52 - ERROR - stderr - +2025-05-10 15:35:52 - ERROR - stderr - +2025-05-10 15:35:52 - INFO - stdout - {'loss': 0.9804, 'grad_norm': 0.8733422160148621, 'learning_rate': 1.967700980734007e-05, 'epoch': 0.33} +2025-05-10 15:35:52 - ERROR - stderr - 11%|█ | 976/8973 [3:01:01<24:53:25, 11.20s/it] +2025-05-10 15:36:03 - ERROR - stderr - 11%|█ | 977/8973 [3:01:11<24:16:31, 10.93s/it] +2025-05-10 15:36:03 - ERROR - stderr - +2025-05-10 15:36:03 - ERROR - stderr - +2025-05-10 15:36:03 - INFO - stdout - {'loss': 0.9616, 'grad_norm': 0.8839080929756165, 'learning_rate': 1.9676099148137867e-05, 'epoch': 0.33} +2025-05-10 15:36:03 - ERROR - stderr - 11%|█ | 977/8973 [3:01:11<24:16:31, 10.93s/it] +2025-05-10 15:36:13 - ERROR - stderr - 11%|█ | 978/8973 [3:01:22<23:50:35, 10.74s/it] +2025-05-10 15:36:13 - ERROR - stderr - +2025-05-10 15:36:13 - ERROR - stderr - +2025-05-10 15:36:13 - INFO - stdout - {'loss': 0.998, 'grad_norm': 0.857496976852417, 'learning_rate': 1.9675187228089546e-05, 'epoch': 0.33} +2025-05-10 15:36:13 - ERROR - stderr - 11%|█ | 978/8973 [3:01:22<23:50:35, 10.74s/it] +2025-05-10 15:36:26 - ERROR - stderr - 11%|█ | 979/8973 [3:01:35<25:16:11, 11.38s/it] +2025-05-10 15:36:26 - ERROR - stderr - +2025-05-10 15:36:26 - ERROR - stderr - +2025-05-10 15:36:26 - INFO - stdout - {'loss': 0.9982, 'grad_norm': 0.9267235398292542, 'learning_rate': 1.9674274047313947e-05, 'epoch': 0.33} +2025-05-10 15:36:26 - ERROR - stderr - 11%|█ | 979/8973 [3:01:35<25:16:11, 11.38s/it] +2025-05-10 15:36:38 - ERROR - stderr - 11%|█ | 980/8973 [3:01:47<25:38:29, 11.55s/it] +2025-05-10 15:36:38 - ERROR - stderr - +2025-05-10 15:36:38 - ERROR - stderr - +2025-05-10 15:36:38 - INFO - stdout - {'loss': 1.0316, 'grad_norm': 0.8455819487571716, 'learning_rate': 1.9673359605930055e-05, 'epoch': 0.33} +2025-05-10 15:36:38 - ERROR - stderr - 11%|█ | 980/8973 [3:01:47<25:38:29, 11.55s/it] +2025-05-10 15:36:48 - ERROR - stderr - 11%|█ | 981/8973 [3:01:57<24:54:33, 11.22s/it] +2025-05-10 15:36:48 - ERROR - stderr - +2025-05-10 15:36:48 - ERROR - stderr - +2025-05-10 15:36:48 - INFO - stdout - {'loss': 1.0231, 'grad_norm': 0.8412938117980957, 'learning_rate': 1.9672443904057027e-05, 'epoch': 0.33} +2025-05-10 15:36:48 - ERROR - stderr - 11%|█ | 981/8973 [3:01:57<24:54:33, 11.22s/it] +2025-05-10 15:36:59 - ERROR - stderr - 11%|█ | 982/8973 [3:02:07<24:26:00, 11.01s/it] +2025-05-10 15:36:59 - ERROR - stderr - +2025-05-10 15:36:59 - ERROR - stderr - +2025-05-10 15:36:59 - INFO - stdout - {'loss': 1.1215, 'grad_norm': 0.9199548959732056, 'learning_rate': 1.9671526941814183e-05, 'epoch': 0.33} +2025-05-10 15:36:59 - ERROR - stderr - 11%|█ | 982/8973 [3:02:08<24:26:00, 11.01s/it] +2025-05-10 15:37:09 - ERROR - stderr - 11%|█ | 983/8973 [3:02:18<23:58:52, 10.81s/it] +2025-05-10 15:37:09 - ERROR - stderr - +2025-05-10 15:37:09 - ERROR - stderr - +2025-05-10 15:37:09 - INFO - stdout - {'loss': 0.9297, 'grad_norm': 0.9361135959625244, 'learning_rate': 1.9670608719321012e-05, 'epoch': 0.33} +2025-05-10 15:37:09 - ERROR - stderr - 11%|█ | 983/8973 [3:02:18<23:58:52, 10.81s/it] +2025-05-10 15:37:22 - ERROR - stderr - 11%|█ | 984/8973 [3:02:31<25:23:04, 11.44s/it] +2025-05-10 15:37:22 - ERROR - stderr - +2025-05-10 15:37:22 - ERROR - stderr - +2025-05-10 15:37:22 - INFO - stdout - {'loss': 1.0663, 'grad_norm': 0.9916256070137024, 'learning_rate': 1.966968923669716e-05, 'epoch': 0.33} +2025-05-10 15:37:22 - ERROR - stderr - 11%|█ | 984/8973 [3:02:31<25:23:04, 11.44s/it] +2025-05-10 15:37:33 - ERROR - stderr - 11%|█ | 985/8973 [3:02:42<25:02:47, 11.29s/it] +2025-05-10 15:37:33 - ERROR - stderr - +2025-05-10 15:37:33 - ERROR - stderr - +2025-05-10 15:37:33 - INFO - stdout - {'loss': 0.9593, 'grad_norm': 0.8354857563972473, 'learning_rate': 1.9668768494062442e-05, 'epoch': 0.33} +2025-05-10 15:37:33 - ERROR - stderr - 11%|█ | 985/8973 [3:02:42<25:02:47, 11.29s/it] +2025-05-10 15:37:46 - ERROR - stderr - 11%|█ | 986/8973 [3:02:54<26:01:07, 11.73s/it] +2025-05-10 15:37:46 - ERROR - stderr - +2025-05-10 15:37:46 - ERROR - stderr - +2025-05-10 15:37:46 - INFO - stdout - {'loss': 0.9799, 'grad_norm': 0.8451768755912781, 'learning_rate': 1.9667846491536832e-05, 'epoch': 0.33} +2025-05-10 15:37:46 - ERROR - stderr - 11%|█ | 986/8973 [3:02:54<26:01:07, 11.73s/it] +2025-05-10 15:37:56 - ERROR - stderr - 11%|█ | 987/8973 [3:03:05<25:01:59, 11.28s/it] +2025-05-10 15:37:56 - ERROR - stderr - +2025-05-10 15:37:56 - ERROR - stderr - +2025-05-10 15:37:56 - INFO - stdout - {'loss': 0.9448, 'grad_norm': 0.8025147318840027, 'learning_rate': 1.9666923229240478e-05, 'epoch': 0.33} +2025-05-10 15:37:56 - ERROR - stderr - 11%|█ | 987/8973 [3:03:05<25:01:59, 11.28s/it] +2025-05-10 15:38:09 - ERROR - stderr - 11%|█ | 988/8973 [3:03:18<26:11:49, 11.81s/it] +2025-05-10 15:38:09 - ERROR - stderr - +2025-05-10 15:38:09 - ERROR - stderr - +2025-05-10 15:38:09 - INFO - stdout - {'loss': 0.9887, 'grad_norm': 0.8592548370361328, 'learning_rate': 1.9665998707293682e-05, 'epoch': 0.33} +2025-05-10 15:38:09 - ERROR - stderr - 11%|█ | 988/8973 [3:03:18<26:11:49, 11.81s/it] +2025-05-10 15:38:21 - ERROR - stderr - 11%|█ | 989/8973 [3:03:30<26:38:41, 12.01s/it] +2025-05-10 15:38:21 - ERROR - stderr - +2025-05-10 15:38:21 - ERROR - stderr - +2025-05-10 15:38:21 - INFO - stdout - {'loss': 1.041, 'grad_norm': 0.8501874208450317, 'learning_rate': 1.9665072925816914e-05, 'epoch': 0.33} +2025-05-10 15:38:21 - ERROR - stderr - 11%|█ | 989/8973 [3:03:30<26:38:41, 12.01s/it] +2025-05-10 15:38:32 - ERROR - stderr - 11%|█ | 990/8973 [3:03:40<25:22:25, 11.44s/it] +2025-05-10 15:38:32 - ERROR - stderr - +2025-05-10 15:38:32 - ERROR - stderr - +2025-05-10 15:38:32 - INFO - stdout - {'loss': 1.0537, 'grad_norm': 1.0236964225769043, 'learning_rate': 1.9664145884930807e-05, 'epoch': 0.33} +2025-05-10 15:38:32 - ERROR - stderr - 11%|█ | 990/8973 [3:03:40<25:22:25, 11.44s/it] +2025-05-10 15:38:42 - ERROR - stderr - 11%|█ | 991/8973 [3:03:51<24:45:25, 11.17s/it] +2025-05-10 15:38:42 - ERROR - stderr - +2025-05-10 15:38:42 - ERROR - stderr - +2025-05-10 15:38:42 - INFO - stdout - {'loss': 0.9707, 'grad_norm': 0.8923915028572083, 'learning_rate': 1.9663217584756163e-05, 'epoch': 0.33} +2025-05-10 15:38:42 - ERROR - stderr - 11%|█ | 991/8973 [3:03:51<24:45:25, 11.17s/it] +2025-05-10 15:38:52 - ERROR - stderr - 11%|█ | 992/8973 [3:04:01<24:16:16, 10.95s/it] +2025-05-10 15:38:53 - ERROR - stderr - +2025-05-10 15:38:53 - ERROR - stderr - +2025-05-10 15:38:53 - INFO - stdout - {'loss': 0.9995, 'grad_norm': 0.9196142554283142, 'learning_rate': 1.966228802541394e-05, 'epoch': 0.33} +2025-05-10 15:38:53 - ERROR - stderr - 11%|█ | 992/8973 [3:04:01<24:16:16, 10.95s/it] +2025-05-10 15:39:06 - ERROR - stderr - 11%|█ | 993/8973 [3:04:15<25:51:16, 11.66s/it] +2025-05-10 15:39:06 - ERROR - stderr - +2025-05-10 15:39:06 - ERROR - stderr - +2025-05-10 15:39:06 - INFO - stdout - {'loss': 1.0653, 'grad_norm': 0.8898453712463379, 'learning_rate': 1.9661357207025268e-05, 'epoch': 0.33} +2025-05-10 15:39:06 - ERROR - stderr - 11%|█ | 993/8973 [3:04:15<25:51:16, 11.66s/it] +2025-05-10 15:39:17 - ERROR - stderr - 11%|█ | 994/8973 [3:04:25<25:15:54, 11.40s/it] +2025-05-10 15:39:17 - ERROR - stderr - +2025-05-10 15:39:17 - ERROR - stderr - +2025-05-10 15:39:17 - INFO - stdout - {'loss': 1.041, 'grad_norm': 0.8257574439048767, 'learning_rate': 1.9660425129711438e-05, 'epoch': 0.33} +2025-05-10 15:39:17 - ERROR - stderr - 11%|█ | 994/8973 [3:04:25<25:15:54, 11.40s/it] +2025-05-10 15:39:27 - ERROR - stderr - 11%|█ | 995/8973 [3:04:36<24:28:35, 11.04s/it] +2025-05-10 15:39:27 - ERROR - stderr - +2025-05-10 15:39:27 - ERROR - stderr - +2025-05-10 15:39:27 - INFO - stdout - {'loss': 0.8882, 'grad_norm': 0.8650773763656616, 'learning_rate': 1.96594917935939e-05, 'epoch': 0.33} +2025-05-10 15:39:27 - ERROR - stderr - 11%|█ | 995/8973 [3:04:36<24:28:35, 11.04s/it] +2025-05-10 15:39:37 - ERROR - stderr - 11%|█ | 996/8973 [3:04:46<23:58:05, 10.82s/it] +2025-05-10 15:39:37 - ERROR - stderr - +2025-05-10 15:39:37 - ERROR - stderr - +2025-05-10 15:39:37 - INFO - stdout - {'loss': 0.9987, 'grad_norm': 0.8299674391746521, 'learning_rate': 1.9658557198794273e-05, 'epoch': 0.33} +2025-05-10 15:39:37 - ERROR - stderr - 11%|█ | 996/8973 [3:04:46<23:58:05, 10.82s/it] +2025-05-10 15:39:50 - ERROR - stderr - 11%|█ | 997/8973 [3:04:58<25:06:11, 11.33s/it] +2025-05-10 15:39:50 - ERROR - stderr - +2025-05-10 15:39:50 - ERROR - stderr - +2025-05-10 15:39:50 - INFO - stdout - {'loss': 0.9814, 'grad_norm': 0.8490276336669922, 'learning_rate': 1.9657621345434344e-05, 'epoch': 0.33} +2025-05-10 15:39:50 - ERROR - stderr - 11%|█ | 997/8973 [3:04:58<25:06:11, 11.33s/it] +2025-05-10 15:40:03 - ERROR - stderr - 11%|█ | 998/8973 [3:05:12<26:17:08, 11.87s/it] +2025-05-10 15:40:03 - ERROR - stderr - +2025-05-10 15:40:03 - ERROR - stderr - +2025-05-10 15:40:03 - INFO - stdout - {'loss': 0.9348, 'grad_norm': 0.813496470451355, 'learning_rate': 1.965668423363606e-05, 'epoch': 0.33} +2025-05-10 15:40:03 - ERROR - stderr - 11%|█ | 998/8973 [3:05:12<26:17:08, 11.87s/it] +2025-05-10 15:40:13 - ERROR - stderr - 11%|█ | 999/8973 [3:05:22<25:11:25, 11.37s/it] +2025-05-10 15:40:13 - ERROR - stderr - +2025-05-10 15:40:13 - ERROR - stderr - +2025-05-10 15:40:13 - INFO - stdout - {'loss': 0.9933, 'grad_norm': 0.9239543080329895, 'learning_rate': 1.965574586352153e-05, 'epoch': 0.33} +2025-05-10 15:40:13 - ERROR - stderr - 11%|█ | 999/8973 [3:05:22<25:11:25, 11.37s/it] +2025-05-10 15:40:23 - ERROR - stderr - 11%|█ | 1000/8973 [3:05:32<24:33:40, 11.09s/it] +2025-05-10 15:40:23 - ERROR - stderr - +2025-05-10 15:40:23 - ERROR - stderr - +2025-05-10 15:40:23 - INFO - stdout - {'loss': 0.9827, 'grad_norm': 0.8784409761428833, 'learning_rate': 1.9654806235213023e-05, 'epoch': 0.33} +2025-05-10 15:40:23 - ERROR - stderr - 11%|█ | 1000/8973 [3:05:32<24:33:40, 11.09s/it] +2025-05-10 15:40:34 - ERROR - stderr - 11%|█ | 1001/8973 [3:05:43<24:05:37, 10.88s/it] +2025-05-10 15:40:34 - ERROR - stderr - +2025-05-10 15:40:34 - ERROR - stderr - +2025-05-10 15:40:34 - INFO - stdout - {'loss': 1.0435, 'grad_norm': 0.9770960807800293, 'learning_rate': 1.9653865348832985e-05, 'epoch': 0.33} +2025-05-10 15:40:34 - ERROR - stderr - 11%|█ | 1001/8973 [3:05:43<24:05:37, 10.88s/it] +2025-05-10 15:40:47 - ERROR - stderr - 11%|█ | 1002/8973 [3:05:56<25:28:22, 11.50s/it] +2025-05-10 15:40:47 - ERROR - stderr - +2025-05-10 15:40:47 - ERROR - stderr - +2025-05-10 15:40:47 - INFO - stdout - {'loss': 0.882, 'grad_norm': 0.7964608669281006, 'learning_rate': 1.9652923204504012e-05, 'epoch': 0.34} +2025-05-10 15:40:47 - ERROR - stderr - 11%|█ | 1002/8973 [3:05:56<25:28:22, 11.50s/it] +2025-05-10 15:40:59 - ERROR - stderr - 11%|█ | 1003/8973 [3:06:08<26:03:14, 11.77s/it] +2025-05-10 15:40:59 - ERROR - stderr - +2025-05-10 15:40:59 - ERROR - stderr - +2025-05-10 15:40:59 - INFO - stdout - {'loss': 1.0082, 'grad_norm': 0.8961206078529358, 'learning_rate': 1.9651979802348876e-05, 'epoch': 0.34} +2025-05-10 15:40:59 - ERROR - stderr - 11%|█ | 1003/8973 [3:06:08<26:03:14, 11.77s/it] +2025-05-10 15:41:09 - ERROR - stderr - 11%|█ | 1004/8973 [3:06:18<24:59:14, 11.29s/it] +2025-05-10 15:41:09 - ERROR - stderr - +2025-05-10 15:41:09 - ERROR - stderr - +2025-05-10 15:41:09 - INFO - stdout - {'loss': 1.0036, 'grad_norm': 0.8220065832138062, 'learning_rate': 1.9651035142490503e-05, 'epoch': 0.34} +2025-05-10 15:41:09 - ERROR - stderr - 11%|█ | 1004/8973 [3:06:18<24:59:14, 11.29s/it] +2025-05-10 15:41:20 - ERROR - stderr - 11%|█ | 1005/8973 [3:06:28<24:20:42, 11.00s/it] +2025-05-10 15:41:20 - ERROR - stderr - +2025-05-10 15:41:20 - ERROR - stderr - +2025-05-10 15:41:20 - INFO - stdout - {'loss': 0.9847, 'grad_norm': 0.900251030921936, 'learning_rate': 1.965008922505199e-05, 'epoch': 0.34} +2025-05-10 15:41:20 - ERROR - stderr - 11%|█ | 1005/8973 [3:06:28<24:20:42, 11.00s/it] +2025-05-10 15:41:30 - ERROR - stderr - 11%|█ | 1006/8973 [3:06:39<24:02:05, 10.86s/it] +2025-05-10 15:41:30 - ERROR - stderr - +2025-05-10 15:41:30 - ERROR - stderr - +2025-05-10 15:41:30 - INFO - stdout - {'loss': 0.9952, 'grad_norm': 0.9145344495773315, 'learning_rate': 1.964914205015659e-05, 'epoch': 0.34} +2025-05-10 15:41:30 - ERROR - stderr - 11%|█ | 1006/8973 [3:06:39<24:02:05, 10.86s/it] +2025-05-10 15:41:43 - ERROR - stderr - 11%|█ | 1007/8973 [3:06:52<25:37:55, 11.58s/it] +2025-05-10 15:41:43 - ERROR - stderr - +2025-05-10 15:41:43 - ERROR - stderr - +2025-05-10 15:41:43 - INFO - stdout - {'loss': 0.9785, 'grad_norm': 0.9133958220481873, 'learning_rate': 1.9648193617927733e-05, 'epoch': 0.34} +2025-05-10 15:41:43 - ERROR - stderr - 11%|█ | 1007/8973 [3:06:52<25:37:55, 11.58s/it] +2025-05-10 15:41:54 - ERROR - stderr - 11%|█ | 1008/8973 [3:07:03<24:47:24, 11.20s/it] +2025-05-10 15:41:54 - ERROR - stderr - +2025-05-10 15:41:54 - ERROR - stderr - +2025-05-10 15:41:54 - INFO - stdout - {'loss': 1.0277, 'grad_norm': 0.9503998756408691, 'learning_rate': 1.9647243928489e-05, 'epoch': 0.34} +2025-05-10 15:41:54 - ERROR - stderr - 11%|█ | 1008/8973 [3:07:03<24:47:24, 11.20s/it] +2025-05-10 15:42:04 - ERROR - stderr - 11%|█ | 1009/8973 [3:07:13<24:16:57, 10.98s/it] +2025-05-10 15:42:04 - ERROR - stderr - +2025-05-10 15:42:04 - ERROR - stderr - +2025-05-10 15:42:04 - INFO - stdout - {'loss': 0.9621, 'grad_norm': 0.8254138231277466, 'learning_rate': 1.964629298196414e-05, 'epoch': 0.34} +2025-05-10 15:42:04 - ERROR - stderr - 11%|█ | 1009/8973 [3:07:13<24:16:57, 10.98s/it] +2025-05-10 15:42:15 - ERROR - stderr - 11%|█▏ | 1010/8973 [3:07:23<23:58:04, 10.84s/it] +2025-05-10 15:42:15 - ERROR - stderr - +2025-05-10 15:42:15 - ERROR - stderr - +2025-05-10 15:42:15 - INFO - stdout - {'loss': 0.9086, 'grad_norm': 0.8727753162384033, 'learning_rate': 1.9645340778477068e-05, 'epoch': 0.34} +2025-05-10 15:42:15 - ERROR - stderr - 11%|█▏ | 1010/8973 [3:07:24<23:58:04, 10.84s/it] +2025-05-10 15:42:28 - ERROR - stderr - 11%|█▏ | 1011/8973 [3:07:37<25:30:59, 11.54s/it] +2025-05-10 15:42:28 - ERROR - stderr - +2025-05-10 15:42:28 - ERROR - stderr - +2025-05-10 15:42:28 - INFO - stdout - {'loss': 1.0416, 'grad_norm': 0.8787934184074402, 'learning_rate': 1.964438731815186e-05, 'epoch': 0.34} +2025-05-10 15:42:28 - ERROR - stderr - 11%|█▏ | 1011/8973 [3:07:37<25:30:59, 11.54s/it] +2025-05-10 15:42:41 - ERROR - stderr - 11%|█▏ | 1012/8973 [3:07:50<26:40:22, 12.06s/it] +2025-05-10 15:42:41 - ERROR - stderr - +2025-05-10 15:42:41 - ERROR - stderr - +2025-05-10 15:42:41 - INFO - stdout - {'loss': 0.9468, 'grad_norm': 0.8534150123596191, 'learning_rate': 1.9643432601112757e-05, 'epoch': 0.34} +2025-05-10 15:42:41 - ERROR - stderr - 11%|█▏ | 1012/8973 [3:07:50<26:40:22, 12.06s/it] +2025-05-10 15:42:52 - ERROR - stderr - 11%|█▏ | 1013/8973 [3:08:01<25:45:04, 11.65s/it] +2025-05-10 15:42:52 - ERROR - stderr - +2025-05-10 15:42:52 - ERROR - stderr - +2025-05-10 15:42:52 - INFO - stdout - {'loss': 0.9876, 'grad_norm': 0.8104965090751648, 'learning_rate': 1.964247662748416e-05, 'epoch': 0.34} +2025-05-10 15:42:52 - ERROR - stderr - 11%|█▏ | 1013/8973 [3:08:01<25:45:04, 11.65s/it] +2025-05-10 15:43:02 - ERROR - stderr - 11%|█▏ | 1014/8973 [3:08:11<24:50:18, 11.23s/it] +2025-05-10 15:43:02 - ERROR - stderr - +2025-05-10 15:43:02 - ERROR - stderr - +2025-05-10 15:43:02 - INFO - stdout - {'loss': 1.0674, 'grad_norm': 0.9486556649208069, 'learning_rate': 1.9641519397390647e-05, 'epoch': 0.34} +2025-05-10 15:43:02 - ERROR - stderr - 11%|█▏ | 1014/8973 [3:08:11<24:50:18, 11.23s/it] +2025-05-10 15:43:12 - ERROR - stderr - 11%|█▏ | 1015/8973 [3:08:21<24:15:35, 10.97s/it] +2025-05-10 15:43:13 - ERROR - stderr - +2025-05-10 15:43:13 - ERROR - stderr - +2025-05-10 15:43:13 - INFO - stdout - {'loss': 0.9382, 'grad_norm': 0.8317921161651611, 'learning_rate': 1.9640560910956942e-05, 'epoch': 0.34} +2025-05-10 15:43:13 - ERROR - stderr - 11%|█▏ | 1015/8973 [3:08:21<24:15:35, 10.97s/it] +2025-05-10 15:43:26 - ERROR - stderr - 11%|█▏ | 1016/8973 [3:08:34<25:41:56, 11.63s/it] +2025-05-10 15:43:26 - ERROR - stderr - +2025-05-10 15:43:26 - ERROR - stderr - +2025-05-10 15:43:26 - INFO - stdout - {'loss': 1.0013, 'grad_norm': 0.8840559124946594, 'learning_rate': 1.963960116830794e-05, 'epoch': 0.34} +2025-05-10 15:43:26 - ERROR - stderr - 11%|█▏ | 1016/8973 [3:08:34<25:41:56, 11.63s/it] +2025-05-10 15:43:37 - ERROR - stderr - 11%|█▏ | 1017/8973 [3:08:46<25:26:58, 11.52s/it] +2025-05-10 15:43:37 - ERROR - stderr - +2025-05-10 15:43:37 - ERROR - stderr - +2025-05-10 15:43:37 - INFO - stdout - {'loss': 0.9901, 'grad_norm': 0.8375911116600037, 'learning_rate': 1.9638640169568703e-05, 'epoch': 0.34} +2025-05-10 15:43:37 - ERROR - stderr - 11%|█▏ | 1017/8973 [3:08:46<25:26:58, 11.52s/it] +2025-05-10 15:43:47 - ERROR - stderr - 11%|█▏ | 1018/8973 [3:08:56<24:47:29, 11.22s/it] +2025-05-10 15:43:47 - ERROR - stderr - +2025-05-10 15:43:47 - ERROR - stderr - +2025-05-10 15:43:47 - INFO - stdout - {'loss': 1.0235, 'grad_norm': 0.8694691061973572, 'learning_rate': 1.963767791486446e-05, 'epoch': 0.34} +2025-05-10 15:43:47 - ERROR - stderr - 11%|█▏ | 1018/8973 [3:08:56<24:47:29, 11.22s/it] +2025-05-10 15:43:58 - ERROR - stderr - 11%|█▏ | 1019/8973 [3:09:07<24:19:04, 11.01s/it] +2025-05-10 15:43:58 - ERROR - stderr - +2025-05-10 15:43:58 - ERROR - stderr - +2025-05-10 15:43:58 - INFO - stdout - {'loss': 0.9292, 'grad_norm': 0.8370477557182312, 'learning_rate': 1.9636714404320586e-05, 'epoch': 0.34} +2025-05-10 15:43:58 - ERROR - stderr - 11%|█▏ | 1019/8973 [3:09:07<24:19:04, 11.01s/it] +2025-05-10 15:44:09 - ERROR - stderr - 11%|█▏ | 1020/8973 [3:09:18<24:10:52, 10.95s/it] +2025-05-10 15:44:09 - ERROR - stderr - +2025-05-10 15:44:09 - ERROR - stderr - +2025-05-10 15:44:09 - INFO - stdout - {'loss': 1.0092, 'grad_norm': 0.8774938583374023, 'learning_rate': 1.963574963806264e-05, 'epoch': 0.34} +2025-05-10 15:44:09 - ERROR - stderr - 11%|█▏ | 1020/8973 [3:09:18<24:10:52, 10.95s/it] +2025-05-10 15:44:22 - ERROR - stderr - 11%|█▏ | 1021/8973 [3:09:30<25:30:19, 11.55s/it] +2025-05-10 15:44:22 - ERROR - stderr - +2025-05-10 15:44:22 - ERROR - stderr - +2025-05-10 15:44:22 - INFO - stdout - {'loss': 0.9372, 'grad_norm': 0.8457070589065552, 'learning_rate': 1.9634783616216334e-05, 'epoch': 0.34} +2025-05-10 15:44:22 - ERROR - stderr - 11%|█▏ | 1021/8973 [3:09:30<25:30:19, 11.55s/it] +2025-05-10 15:44:32 - ERROR - stderr - 11%|█▏ | 1022/8973 [3:09:41<24:47:19, 11.22s/it] +2025-05-10 15:44:32 - ERROR - stderr - +2025-05-10 15:44:32 - ERROR - stderr - +2025-05-10 15:44:32 - INFO - stdout - {'loss': 0.9232, 'grad_norm': 0.8335723876953125, 'learning_rate': 1.9633816338907547e-05, 'epoch': 0.34} +2025-05-10 15:44:32 - ERROR - stderr - 11%|█▏ | 1022/8973 [3:09:41<24:47:19, 11.22s/it] +2025-05-10 15:44:43 - ERROR - stderr - 11%|█▏ | 1023/8973 [3:09:51<24:15:35, 10.99s/it] +2025-05-10 15:44:43 - ERROR - stderr - +2025-05-10 15:44:43 - ERROR - stderr - +2025-05-10 15:44:43 - INFO - stdout - {'loss': 0.926, 'grad_norm': 0.8248522281646729, 'learning_rate': 1.9632847806262315e-05, 'epoch': 0.34} +2025-05-10 15:44:43 - ERROR - stderr - 11%|█▏ | 1023/8973 [3:09:51<24:15:35, 10.99s/it] +2025-05-10 15:44:53 - ERROR - stderr - 11%|█▏ | 1024/8973 [3:10:02<24:12:14, 10.96s/it] +2025-05-10 15:44:54 - ERROR - stderr - +2025-05-10 15:44:54 - ERROR - stderr - +2025-05-10 15:44:54 - INFO - stdout - {'loss': 1.0535, 'grad_norm': 1.0262664556503296, 'learning_rate': 1.963187801840685e-05, 'epoch': 0.34} +2025-05-10 15:44:54 - ERROR - stderr - 11%|█▏ | 1024/8973 [3:10:02<24:12:14, 10.96s/it] +2025-05-10 15:45:07 - ERROR - stderr - 11%|█▏ | 1025/8973 [3:10:15<25:40:56, 11.63s/it] +2025-05-10 15:45:07 - ERROR - stderr - +2025-05-10 15:45:07 - ERROR - stderr - +2025-05-10 15:45:07 - INFO - stdout - {'loss': 1.0011, 'grad_norm': 0.8251902461051941, 'learning_rate': 1.9630906975467518e-05, 'epoch': 0.34} +2025-05-10 15:45:07 - ERROR - stderr - 11%|█▏ | 1025/8973 [3:10:15<25:40:56, 11.63s/it] +2025-05-10 15:45:19 - ERROR - stderr - 11%|█▏ | 1026/8973 [3:10:27<25:51:48, 11.72s/it] +2025-05-10 15:45:19 - ERROR - stderr - +2025-05-10 15:45:19 - ERROR - stderr - +2025-05-10 15:45:19 - INFO - stdout - {'loss': 0.9679, 'grad_norm': 0.8192188739776611, 'learning_rate': 1.9629934677570846e-05, 'epoch': 0.34} +2025-05-10 15:45:19 - ERROR - stderr - 11%|█▏ | 1026/8973 [3:10:27<25:51:48, 11.72s/it] +2025-05-10 15:45:29 - ERROR - stderr - 11%|█▏ | 1027/8973 [3:10:38<25:00:07, 11.33s/it] +2025-05-10 15:45:29 - ERROR - stderr - +2025-05-10 15:45:29 - ERROR - stderr - +2025-05-10 15:45:29 - INFO - stdout - {'loss': 0.9822, 'grad_norm': 1.114517092704773, 'learning_rate': 1.9628961124843537e-05, 'epoch': 0.34} +2025-05-10 15:45:29 - ERROR - stderr - 11%|█▏ | 1027/8973 [3:10:38<25:00:07, 11.33s/it] +2025-05-10 15:45:39 - ERROR - stderr - 11%|█▏ | 1028/8973 [3:10:48<24:21:49, 11.04s/it] +2025-05-10 15:45:39 - ERROR - stderr - +2025-05-10 15:45:39 - ERROR - stderr - +2025-05-10 15:45:39 - INFO - stdout - {'loss': 0.9178, 'grad_norm': 0.8703671097755432, 'learning_rate': 1.9627986317412442e-05, 'epoch': 0.34} +2025-05-10 15:45:39 - ERROR - stderr - 11%|█▏ | 1028/8973 [3:10:48<24:21:49, 11.04s/it] +2025-05-10 15:45:50 - ERROR - stderr - 11%|█▏ | 1029/8973 [3:10:58<23:49:50, 10.80s/it] +2025-05-10 15:45:50 - ERROR - stderr - +2025-05-10 15:45:50 - ERROR - stderr - +2025-05-10 15:45:50 - INFO - stdout - {'loss': 1.0677, 'grad_norm': 0.9106542468070984, 'learning_rate': 1.9627010255404592e-05, 'epoch': 0.34} +2025-05-10 15:45:50 - ERROR - stderr - 11%|█▏ | 1029/8973 [3:10:58<23:49:50, 10.80s/it] +2025-05-10 15:46:02 - ERROR - stderr - 11%|█▏ | 1030/8973 [3:11:11<25:00:33, 11.34s/it] +2025-05-10 15:46:02 - ERROR - stderr - +2025-05-10 15:46:02 - ERROR - stderr - +2025-05-10 15:46:02 - INFO - stdout - {'loss': 0.9866, 'grad_norm': 0.9060853123664856, 'learning_rate': 1.9626032938947165e-05, 'epoch': 0.34} +2025-05-10 15:46:02 - ERROR - stderr - 11%|█▏ | 1030/8973 [3:11:11<25:00:33, 11.34s/it] +2025-05-10 15:46:13 - ERROR - stderr - 11%|█▏ | 1031/8973 [3:11:22<24:48:44, 11.25s/it] +2025-05-10 15:46:13 - ERROR - stderr - +2025-05-10 15:46:13 - ERROR - stderr - +2025-05-10 15:46:13 - INFO - stdout - {'loss': 1.0224, 'grad_norm': 0.8627523183822632, 'learning_rate': 1.9625054368167516e-05, 'epoch': 0.34} +2025-05-10 15:46:13 - ERROR - stderr - 11%|█▏ | 1031/8973 [3:11:22<24:48:44, 11.25s/it] +2025-05-10 15:46:24 - ERROR - stderr - 12%|█▏ | 1032/8973 [3:11:32<24:15:14, 11.00s/it] +2025-05-10 15:46:24 - ERROR - stderr - +2025-05-10 15:46:24 - ERROR - stderr - +2025-05-10 15:46:24 - INFO - stdout - {'loss': 0.9974, 'grad_norm': 0.8379828929901123, 'learning_rate': 1.9624074543193155e-05, 'epoch': 0.35} +2025-05-10 15:46:24 - ERROR - stderr - 12%|█▏ | 1032/8973 [3:11:32<24:15:14, 11.00s/it] +2025-05-10 15:46:34 - ERROR - stderr - 12%|█▏ | 1033/8973 [3:11:43<23:56:42, 10.86s/it] +2025-05-10 15:46:34 - ERROR - stderr - +2025-05-10 15:46:34 - ERROR - stderr - +2025-05-10 15:46:34 - INFO - stdout - {'loss': 0.9355, 'grad_norm': 0.8282051682472229, 'learning_rate': 1.962309346415176e-05, 'epoch': 0.35} +2025-05-10 15:46:34 - ERROR - stderr - 12%|█▏ | 1033/8973 [3:11:43<23:56:42, 10.86s/it] +2025-05-10 15:46:46 - ERROR - stderr - 12%|█▏ | 1034/8973 [3:11:55<24:25:56, 11.08s/it] +2025-05-10 15:46:46 - ERROR - stderr - +2025-05-10 15:46:46 - ERROR - stderr - +2025-05-10 15:46:46 - INFO - stdout - {'loss': 0.9915, 'grad_norm': 0.9110119938850403, 'learning_rate': 1.962211113117117e-05, 'epoch': 0.35} +2025-05-10 15:46:46 - ERROR - stderr - 12%|█▏ | 1034/8973 [3:11:55<24:25:56, 11.08s/it] +2025-05-10 15:46:59 - ERROR - stderr - 12%|█▏ | 1035/8973 [3:12:08<26:01:25, 11.80s/it] +2025-05-10 15:46:59 - ERROR - stderr - +2025-05-10 15:46:59 - ERROR - stderr - +2025-05-10 15:46:59 - INFO - stdout - {'loss': 1.0578, 'grad_norm': 0.8920918107032776, 'learning_rate': 1.9621127544379393e-05, 'epoch': 0.35} +2025-05-10 15:46:59 - ERROR - stderr - 12%|█▏ | 1035/8973 [3:12:08<26:01:25, 11.80s/it] +2025-05-10 15:47:10 - ERROR - stderr - 12%|█▏ | 1036/8973 [3:12:18<25:06:54, 11.39s/it] +2025-05-10 15:47:10 - ERROR - stderr - +2025-05-10 15:47:10 - ERROR - stderr - +2025-05-10 15:47:10 - INFO - stdout - {'loss': 1.0026, 'grad_norm': 0.932476282119751, 'learning_rate': 1.9620142703904586e-05, 'epoch': 0.35} +2025-05-10 15:47:10 - ERROR - stderr - 12%|█▏ | 1036/8973 [3:12:19<25:06:54, 11.39s/it] +2025-05-10 15:47:20 - ERROR - stderr - 12%|█▏ | 1037/8973 [3:12:29<24:25:34, 11.08s/it] +2025-05-10 15:47:20 - ERROR - stderr - +2025-05-10 15:47:20 - ERROR - stderr - +2025-05-10 15:47:20 - INFO - stdout - {'loss': 0.9467, 'grad_norm': 0.8864158987998962, 'learning_rate': 1.9619156609875082e-05, 'epoch': 0.35} +2025-05-10 15:47:20 - ERROR - stderr - 12%|█▏ | 1037/8973 [3:12:29<24:25:34, 11.08s/it] +2025-05-10 15:47:31 - ERROR - stderr - 12%|█▏ | 1038/8973 [3:12:39<24:00:30, 10.89s/it] +2025-05-10 15:47:31 - ERROR - stderr - +2025-05-10 15:47:31 - ERROR - stderr - +2025-05-10 15:47:31 - INFO - stdout - {'loss': 0.9654, 'grad_norm': 0.8529897928237915, 'learning_rate': 1.961816926241938e-05, 'epoch': 0.35} +2025-05-10 15:47:31 - ERROR - stderr - 12%|█▏ | 1038/8973 [3:12:39<24:00:30, 10.89s/it] +2025-05-10 15:47:44 - ERROR - stderr - 12%|█▏ | 1039/8973 [3:12:53<25:39:56, 11.65s/it] +2025-05-10 15:47:44 - ERROR - stderr - +2025-05-10 15:47:44 - ERROR - stderr - +2025-05-10 15:47:44 - INFO - stdout - {'loss': 1.0345, 'grad_norm': 0.9214997291564941, 'learning_rate': 1.9617180661666127e-05, 'epoch': 0.35} +2025-05-10 15:47:44 - ERROR - stderr - 12%|█▏ | 1039/8973 [3:12:53<25:39:56, 11.65s/it] +2025-05-10 15:47:56 - ERROR - stderr - 12%|█▏ | 1040/8973 [3:13:05<26:07:54, 11.86s/it] +2025-05-10 15:47:56 - ERROR - stderr - +2025-05-10 15:47:56 - ERROR - stderr - +2025-05-10 15:47:56 - INFO - stdout - {'loss': 0.9677, 'grad_norm': 0.8337312936782837, 'learning_rate': 1.961619080774415e-05, 'epoch': 0.35} +2025-05-10 15:47:56 - ERROR - stderr - 12%|█▏ | 1040/8973 [3:13:05<26:07:54, 11.86s/it] +2025-05-10 15:48:07 - ERROR - stderr - 12%|█▏ | 1041/8973 [3:13:16<25:18:49, 11.49s/it] +2025-05-10 15:48:07 - ERROR - stderr - +2025-05-10 15:48:07 - ERROR - stderr - +2025-05-10 15:48:07 - INFO - stdout - {'loss': 1.0307, 'grad_norm': 0.8849655985832214, 'learning_rate': 1.961519970078243e-05, 'epoch': 0.35} +2025-05-10 15:48:07 - ERROR - stderr - 12%|█▏ | 1041/8973 [3:13:16<25:18:49, 11.49s/it] +2025-05-10 15:48:18 - ERROR - stderr - 12%|█▏ | 1042/8973 [3:13:26<24:46:31, 11.25s/it] +2025-05-10 15:48:18 - ERROR - stderr - +2025-05-10 15:48:18 - ERROR - stderr - +2025-05-10 15:48:18 - INFO - stdout - {'loss': 1.0812, 'grad_norm': 0.9163777232170105, 'learning_rate': 1.9614207340910116e-05, 'epoch': 0.35} +2025-05-10 15:48:18 - ERROR - stderr - 12%|█▏ | 1042/8973 [3:13:26<24:46:31, 11.25s/it] +2025-05-10 15:48:28 - ERROR - stderr - 12%|█▏ | 1043/8973 [3:13:37<24:08:55, 10.96s/it] +2025-05-10 15:48:28 - ERROR - stderr - +2025-05-10 15:48:28 - ERROR - stderr - +2025-05-10 15:48:28 - INFO - stdout - {'loss': 0.9944, 'grad_norm': 0.8824440836906433, 'learning_rate': 1.9613213728256515e-05, 'epoch': 0.35} +2025-05-10 15:48:28 - ERROR - stderr - 12%|█▏ | 1043/8973 [3:13:37<24:08:55, 10.96s/it] +2025-05-10 15:48:41 - ERROR - stderr - 12%|█▏ | 1044/8973 [3:13:50<25:51:21, 11.74s/it] +2025-05-10 15:48:41 - ERROR - stderr - +2025-05-10 15:48:41 - ERROR - stderr - +2025-05-10 15:48:41 - INFO - stdout - {'loss': 0.9722, 'grad_norm': 0.8374669551849365, 'learning_rate': 1.96122188629511e-05, 'epoch': 0.35} +2025-05-10 15:48:41 - ERROR - stderr - 12%|█▏ | 1044/8973 [3:13:50<25:51:21, 11.74s/it] +2025-05-10 15:48:52 - ERROR - stderr - 12%|█▏ | 1045/8973 [3:14:01<24:58:11, 11.34s/it] +2025-05-10 15:48:52 - ERROR - stderr - +2025-05-10 15:48:52 - ERROR - stderr - +2025-05-10 15:48:52 - INFO - stdout - {'loss': 0.9408, 'grad_norm': 0.858892560005188, 'learning_rate': 1.9611222745123503e-05, 'epoch': 0.35} +2025-05-10 15:48:52 - ERROR - stderr - 12%|█▏ | 1045/8973 [3:14:01<24:58:11, 11.34s/it] +2025-05-10 15:49:03 - ERROR - stderr - 12%|█▏ | 1046/8973 [3:14:12<24:42:14, 11.22s/it] +2025-05-10 15:49:03 - ERROR - stderr - +2025-05-10 15:49:03 - ERROR - stderr - +2025-05-10 15:49:03 - INFO - stdout - {'loss': 0.9804, 'grad_norm': 0.8040887117385864, 'learning_rate': 1.9610225374903534e-05, 'epoch': 0.35} +2025-05-10 15:49:03 - ERROR - stderr - 12%|█▏ | 1046/8973 [3:14:12<24:42:14, 11.22s/it] +2025-05-10 15:49:13 - ERROR - stderr - 12%|█▏ | 1047/8973 [3:14:22<24:12:30, 11.00s/it] +2025-05-10 15:49:13 - ERROR - stderr - +2025-05-10 15:49:13 - ERROR - stderr - +2025-05-10 15:49:13 - INFO - stdout - {'loss': 1.0595, 'grad_norm': 0.935232400894165, 'learning_rate': 1.9609226752421145e-05, 'epoch': 0.35} +2025-05-10 15:49:13 - ERROR - stderr - 12%|█▏ | 1047/8973 [3:14:22<24:12:30, 11.00s/it] +2025-05-10 15:49:27 - ERROR - stderr - 12%|█▏ | 1048/8973 [3:14:35<25:42:45, 11.68s/it] +2025-05-10 15:49:27 - ERROR - stderr - +2025-05-10 15:49:27 - ERROR - stderr - +2025-05-10 15:49:27 - INFO - stdout - {'loss': 0.9757, 'grad_norm': 0.837334156036377, 'learning_rate': 1.9608226877806468e-05, 'epoch': 0.35} +2025-05-10 15:49:27 - ERROR - stderr - 12%|█▏ | 1048/8973 [3:14:35<25:42:45, 11.68s/it] +2025-05-10 15:49:39 - ERROR - stderr - 12%|█▏ | 1049/8973 [3:14:48<26:08:00, 11.87s/it] +2025-05-10 15:49:39 - ERROR - stderr - +2025-05-10 15:49:39 - ERROR - stderr - +2025-05-10 15:49:39 - INFO - stdout - {'loss': 0.9083, 'grad_norm': 0.8342932462692261, 'learning_rate': 1.960722575118979e-05, 'epoch': 0.35} +2025-05-10 15:49:39 - ERROR - stderr - 12%|█▏ | 1049/8973 [3:14:48<26:08:00, 11.87s/it] +2025-05-10 15:49:49 - ERROR - stderr - 12%|█▏ | 1050/8973 [3:14:58<25:07:49, 11.42s/it] +2025-05-10 15:49:49 - ERROR - stderr - +2025-05-10 15:49:49 - ERROR - stderr - +2025-05-10 15:49:49 - INFO - stdout - {'loss': 0.9447, 'grad_norm': 0.8199008703231812, 'learning_rate': 1.960622337270156e-05, 'epoch': 0.35} +2025-05-10 15:49:49 - ERROR - stderr - 12%|█▏ | 1050/8973 [3:14:58<25:07:49, 11.42s/it] +2025-05-10 15:49:59 - ERROR - stderr - 12%|█▏ | 1051/8973 [3:15:08<24:18:24, 11.05s/it] +2025-05-10 15:49:59 - ERROR - stderr - +2025-05-10 15:49:59 - ERROR - stderr - +2025-05-10 15:49:59 - INFO - stdout - {'loss': 1.032, 'grad_norm': 0.8448853492736816, 'learning_rate': 1.9605219742472393e-05, 'epoch': 0.35} +2025-05-10 15:49:59 - ERROR - stderr - 12%|█▏ | 1051/8973 [3:15:08<24:18:24, 11.05s/it] +2025-05-10 15:50:10 - ERROR - stderr - 12%|█▏ | 1052/8973 [3:15:18<23:43:34, 10.78s/it] +2025-05-10 15:50:10 - ERROR - stderr - +2025-05-10 15:50:10 - ERROR - stderr - +2025-05-10 15:50:10 - INFO - stdout - {'loss': 0.9421, 'grad_norm': 0.7916757464408875, 'learning_rate': 1.9604214860633077e-05, 'epoch': 0.35} +2025-05-10 15:50:10 - ERROR - stderr - 12%|█▏ | 1052/8973 [3:15:18<23:43:34, 10.78s/it] +2025-05-10 15:50:23 - ERROR - stderr - 12%|█▏ | 1053/8973 [3:15:32<25:20:06, 11.52s/it] +2025-05-10 15:50:23 - ERROR - stderr - +2025-05-10 15:50:23 - ERROR - stderr - +2025-05-10 15:50:23 - INFO - stdout - {'loss': 1.0412, 'grad_norm': 0.9093542098999023, 'learning_rate': 1.960320872731454e-05, 'epoch': 0.35} +2025-05-10 15:50:23 - ERROR - stderr - 12%|█▏ | 1053/8973 [3:15:32<25:20:06, 11.52s/it] +2025-05-10 15:50:34 - ERROR - stderr - 12%|█▏ | 1054/8973 [3:15:43<25:00:28, 11.37s/it] +2025-05-10 15:50:34 - ERROR - stderr - +2025-05-10 15:50:34 - ERROR - stderr - +2025-05-10 15:50:34 - INFO - stdout - {'loss': 0.9395, 'grad_norm': 0.8604532480239868, 'learning_rate': 1.96022013426479e-05, 'epoch': 0.35} +2025-05-10 15:50:34 - ERROR - stderr - 12%|█▏ | 1054/8973 [3:15:43<25:00:28, 11.37s/it] +2025-05-10 15:50:44 - ERROR - stderr - 12%|█▏ | 1055/8973 [3:15:53<24:15:57, 11.03s/it] +2025-05-10 15:50:44 - ERROR - stderr - +2025-05-10 15:50:44 - ERROR - stderr - +2025-05-10 15:50:44 - INFO - stdout - {'loss': 0.9309, 'grad_norm': 0.8024330735206604, 'learning_rate': 1.9601192706764413e-05, 'epoch': 0.35} +2025-05-10 15:50:44 - ERROR - stderr - 12%|█▏ | 1055/8973 [3:15:53<24:15:57, 11.03s/it] +2025-05-10 15:50:54 - ERROR - stderr - 12%|█▏ | 1056/8973 [3:16:03<23:44:22, 10.79s/it] +2025-05-10 15:50:54 - ERROR - stderr - +2025-05-10 15:50:54 - ERROR - stderr - +2025-05-10 15:50:54 - INFO - stdout - {'loss': 0.9555, 'grad_norm': 0.8191692233085632, 'learning_rate': 1.9600182819795515e-05, 'epoch': 0.35} +2025-05-10 15:50:54 - ERROR - stderr - 12%|█▏ | 1056/8973 [3:16:03<23:44:22, 10.79s/it] +2025-05-10 15:51:05 - ERROR - stderr - 12%|█▏ | 1057/8973 [3:16:14<23:55:39, 10.88s/it] +2025-05-10 15:51:05 - ERROR - stderr - +2025-05-10 15:51:05 - ERROR - stderr - +2025-05-10 15:51:05 - INFO - stdout - {'loss': 0.9688, 'grad_norm': 0.7913084030151367, 'learning_rate': 1.9599171681872796e-05, 'epoch': 0.35} +2025-05-10 15:51:05 - ERROR - stderr - 12%|█▏ | 1057/8973 [3:16:14<23:55:39, 10.88s/it] +2025-05-10 15:57:55 - INFO - llana.model.llana - Using nf2vec. +2025-05-10 15:57:55 - INFO - stdout - Loading nf2vec config from /leonardo_scratch/fast/IscrC_V2Text/dev/LLaNA_objanerf/llana/model/nf2vec/nf2vec_2layer.yaml. +2025-05-10 15:57:55 - INFO - llana.model.llana - nerf2vec output dim: 516. +2025-05-10 15:57:55 - INFO - llana.model.llana - Use 1 projection hiddent layers. +2025-05-10 15:57:55 - INFO - llana.model.llana - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Vec projector output dim: 5120. +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== VEC PROJ PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Vec projector architecture: Sequential( + (0): Linear(in_features=516, out_features=2048, bias=True) + (1): GELU(approximate='none') + (2): Linear(in_features=2048, out_features=5120, bias=True) +) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 11,549,696 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LLANA TOKENIZER ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Tokenizer: Embedding(32003, 5120, padding_idx=0) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LM HEAD PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - lm_head architecture: Linear(in_features=5120, out_features=32003, bias=False) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - Using nf2vec. +2025-05-10 15:57:55 - INFO - stdout - Loading nf2vec config from /leonardo_scratch/fast/IscrC_V2Text/dev/LLaNA_objanerf/llana/model/nf2vec/nf2vec_2layer.yaml. +2025-05-10 15:57:55 - INFO - llana.model.llana - nerf2vec output dim: 516. +2025-05-10 15:57:55 - INFO - llana.model.llana - Use 1 projection hiddent layers. +2025-05-10 15:57:55 - INFO - llana.model.llana - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Vec projector output dim: 5120. +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== VEC PROJ PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Vec projector architecture: Sequential( + (0): Linear(in_features=516, out_features=2048, bias=True) + (1): GELU(approximate='none') + (2): Linear(in_features=2048, out_features=5120, bias=True) +) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 11,549,696 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LLANA TOKENIZER ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Tokenizer: Embedding(32003, 5120, padding_idx=0) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LM HEAD PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - lm_head architecture: Linear(in_features=5120, out_features=32003, bias=False) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - Using nf2vec. +2025-05-10 15:57:55 - INFO - llana.model.llana - Using nf2vec. +2025-05-10 15:57:55 - INFO - stdout - Loading nf2vec config from /leonardo_scratch/fast/IscrC_V2Text/dev/LLaNA_objanerf/llana/model/nf2vec/nf2vec_2layer.yaml. +2025-05-10 15:57:55 - INFO - llana.model.llana - Using nf2vec. +2025-05-10 15:57:55 - INFO - stdout - Loading nf2vec config from /leonardo_scratch/fast/IscrC_V2Text/dev/LLaNA_objanerf/llana/model/nf2vec/nf2vec_2layer.yaml. +2025-05-10 15:57:55 - INFO - stdout - Loading nf2vec config from /leonardo_scratch/fast/IscrC_V2Text/dev/LLaNA_objanerf/llana/model/nf2vec/nf2vec_2layer.yaml. +2025-05-10 15:57:55 - INFO - llana.model.llana - nerf2vec output dim: 516. +2025-05-10 15:57:55 - INFO - llana.model.llana - Use 1 projection hiddent layers. +2025-05-10 15:57:55 - INFO - llana.model.llana - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Vec projector output dim: 5120. +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== VEC PROJ PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Vec projector architecture: Sequential( + (0): Linear(in_features=516, out_features=2048, bias=True) + (1): GELU(approximate='none') + (2): Linear(in_features=2048, out_features=5120, bias=True) +) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 11,549,696 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LLANA TOKENIZER ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Tokenizer: Embedding(32003, 5120, padding_idx=0) +2025-05-10 15:57:55 - INFO - llana.model.llana - nerf2vec output dim: 516. +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - nerf2vec output dim: 516. +2025-05-10 15:57:55 - INFO - llana.model.llana - Use 1 projection hiddent layers. +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - Use 1 projection hiddent layers. +2025-05-10 15:57:55 - INFO - llana.model.llana - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LM HEAD PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - llana.model.llana - lm_head architecture: Linear(in_features=5120, out_features=32003, bias=False) +2025-05-10 15:57:55 - INFO - stdout - Each layer with [2048] hidden units. +2025-05-10 15:57:55 - INFO - stdout - Vec projector output dim: 5120. +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - stdout - Vec projector output dim: 5120. +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== VEC PROJ PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== VEC PROJ PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Vec projector architecture: Sequential( + (0): Linear(in_features=516, out_features=2048, bias=True) + (1): GELU(approximate='none') + (2): Linear(in_features=2048, out_features=5120, bias=True) +) +2025-05-10 15:57:55 - INFO - llana.model.llana - Vec projector architecture: Sequential( + (0): Linear(in_features=516, out_features=2048, bias=True) + (1): GELU(approximate='none') + (2): Linear(in_features=2048, out_features=5120, bias=True) +) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 11,549,696 +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 11,549,696 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LLANA TOKENIZER ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LLANA TOKENIZER ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - Tokenizer: Embedding(32003, 5120, padding_idx=0) +2025-05-10 15:57:55 - INFO - llana.model.llana - Tokenizer: Embedding(32003, 5120, padding_idx=0) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LM HEAD PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - =========== LM HEAD PARAMETERS ============= +2025-05-10 15:57:55 - INFO - llana.model.llana - lm_head architecture: Linear(in_features=5120, out_features=32003, bias=False) +2025-05-10 15:57:55 - INFO - llana.model.llana - lm_head architecture: Linear(in_features=5120, out_features=32003, bias=False) +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - Total number of parameters: 163,855,360 +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - INFO - llana.model.llana - ============================================= +2025-05-10 15:57:55 - ERROR - stderr - Loading checkpoint shards: 0%| | 0/11 [00:00', 'DEFAULT_POINT_PATCH_TOKEN': '', 'DEFAULT_POINT_START_TOKEN': '', 'mm_use_point_start_end': True, 'model_type': 'llana', 'nf2vec_config_name': 'nf2vec_2layer', 'point_backbone': 'nf2vec', 'point_backbone_ckpt': '', 'use_color': True, 'output_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/runs/May10_15-57-55_lrdn1300.leonardo.local', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'no', 'save_steps': 32860.0, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'llana_objanerf_13b_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': ['full_shard', 'auto_wrap'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'transformer_layer_cls_to_wrap': ['LlamaDecoderLayer'], 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': 'LlamaDecoderLayer', 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'cache_dir': None, 'model_max_length': 4096, 'model_debug': False, 'fix_llm': False, 'force_fsdp': False, 'tune_mm_mlp_adapter': True, 'stage_2': True, 'pretrained_mm_mlp_adapter': None, 'detatch_point_token': ''} +2025-05-10 16:02:10 - ERROR - stderr - 0%| | 0/7479 [00:00', 'DEFAULT_POINT_PATCH_TOKEN': '', 'DEFAULT_POINT_START_TOKEN': '', 'mm_use_point_start_end': True, 'model_type': 'llana', 'nf2vec_config_name': 'nf2vec_2layer', 'point_backbone': 'nf2vec', 'point_backbone_ckpt': '', 'use_color': True, 'output_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/runs/May10_16-51-17_lrdn1027.leonardo.local', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'no', 'save_steps': 32860.0, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'llana_objanerf_13b_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': ['full_shard', 'auto_wrap'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'transformer_layer_cls_to_wrap': ['LlamaDecoderLayer'], 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': 'LlamaDecoderLayer', 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'cache_dir': None, 'model_max_length': 4096, 'model_debug': False, 'fix_llm': False, 'force_fsdp': False, 'tune_mm_mlp_adapter': True, 'stage_2': True, 'pretrained_mm_mlp_adapter': None, 'detatch_point_token': ''} +2025-05-10 17:06:13 - ERROR - stderr - 0%| | 0/5610 [00:00', 'DEFAULT_POINT_PATCH_TOKEN': '', 'DEFAULT_POINT_START_TOKEN': '', 'mm_use_point_start_end': True, 'model_type': 'llana', 'nf2vec_config_name': 'nf2vec_2layer', 'point_backbone': 'nf2vec', 'point_backbone_ckpt': '', 'use_color': True, 'output_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'evaluation_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'learning_rate': 2e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.03, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/runs/May10_17-20-05_lrdn1040.leonardo.local', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1.0, 'logging_nan_inf_filter': True, 'save_strategy': 'no', 'save_steps': 32860.0, 'save_total_limit': 1, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'llana_objanerf_13b_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': ['full_shard', 'auto_wrap'], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'transformer_layer_cls_to_wrap': ['LlamaDecoderLayer'], 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': 'LlamaDecoderLayer', 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': False, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'cache_dir': None, 'model_max_length': 4096, 'model_debug': False, 'fix_llm': False, 'force_fsdp': False, 'tune_mm_mlp_adapter': True, 'stage_2': True, 'pretrained_mm_mlp_adapter': None, 'detatch_point_token': ''} +2025-05-10 17:25:53 - ERROR - stderr - 0%| | 0/3741 [00:00 4096). Running this sequence through the model will result in indexing errors +2025-05-10 19:45:24 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5986 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 19:45:43 - ERROR - stderr - 10%|█ | 375/3741 [2:19:50<18:57:48, 20.28s/it] +2025-05-10 19:45:43 - ERROR - stderr - +2025-05-10 19:45:43 - ERROR - stderr - +2025-05-10 19:45:43 - INFO - stdout - {'loss': 0.9267, 'grad_norm': 0.7359748482704163, 'learning_rate': 1.97437439331107e-05, 'epoch': 0.3} +2025-05-10 19:45:43 - ERROR - stderr - 10%|█ | 375/3741 [2:19:50<18:57:48, 20.28s/it] +2025-05-10 19:46:10 - ERROR - stderr - 10%|█ | 376/3741 [2:20:16<20:38:37, 22.09s/it] +2025-05-10 19:46:10 - ERROR - stderr - +2025-05-10 19:46:10 - ERROR - stderr - +2025-05-10 19:46:10 - INFO - stdout - {'loss': 0.967, 'grad_norm': 0.7002710103988647, 'learning_rate': 1.97417925248128e-05, 'epoch': 0.3} +2025-05-10 19:46:10 - ERROR - stderr - 10%|█ | 376/3741 [2:20:16<20:38:37, 22.09s/it] +2025-05-10 19:46:29 - ERROR - stderr - 10%|█ | 377/3741 [2:20:36<19:56:35, 21.34s/it] +2025-05-10 19:46:29 - ERROR - stderr - +2025-05-10 19:46:29 - ERROR - stderr - +2025-05-10 19:46:29 - INFO - stdout - {'loss': 0.9648, 'grad_norm': 0.7859190702438354, 'learning_rate': 1.9739833811787097e-05, 'epoch': 0.3} +2025-05-10 19:46:29 - ERROR - stderr - 10%|█ | 377/3741 [2:20:36<19:56:35, 21.34s/it] +2025-05-10 19:46:51 - ERROR - stderr - 10%|█ | 378/3741 [2:20:57<19:57:09, 21.36s/it] +2025-05-10 19:46:51 - ERROR - stderr - +2025-05-10 19:46:51 - ERROR - stderr - +2025-05-10 19:46:51 - INFO - stdout - {'loss': 0.9899, 'grad_norm': 0.7745339870452881, 'learning_rate': 1.9737867795502298e-05, 'epoch': 0.3} +2025-05-10 19:46:51 - ERROR - stderr - 10%|█ | 378/3741 [2:20:57<19:57:09, 21.36s/it] +2025-05-10 19:47:10 - ERROR - stderr - 10%|█ | 379/3741 [2:21:16<19:25:29, 20.80s/it] +2025-05-10 19:47:10 - ERROR - stderr - +2025-05-10 19:47:10 - ERROR - stderr - +2025-05-10 19:47:10 - INFO - stdout - {'loss': 0.9445, 'grad_norm': 0.6724187731742859, 'learning_rate': 1.973589447743259e-05, 'epoch': 0.3} +2025-05-10 19:47:10 - ERROR - stderr - 10%|█ | 379/3741 [2:21:16<19:25:29, 20.80s/it] +2025-05-10 19:47:34 - ERROR - stderr - 10%|█ | 380/3741 [2:21:40<20:11:56, 21.64s/it] +2025-05-10 19:47:34 - ERROR - stderr - +2025-05-10 19:47:34 - ERROR - stderr - +2025-05-10 19:47:34 - INFO - stdout - {'loss': 0.9603, 'grad_norm': 0.7949398159980774, 'learning_rate': 1.9733913859057637e-05, 'epoch': 0.3} +2025-05-10 19:47:34 - ERROR - stderr - 10%|█ | 380/3741 [2:21:40<20:11:56, 21.64s/it] +2025-05-10 19:47:53 - ERROR - stderr - 10%|█ | 381/3741 [2:22:00<19:35:54, 21.00s/it] +2025-05-10 19:47:53 - ERROR - stderr - +2025-05-10 19:47:53 - ERROR - stderr - +2025-05-10 19:47:53 - INFO - stdout - {'loss': 0.9573, 'grad_norm': 0.7890524864196777, 'learning_rate': 1.9731925941862573e-05, 'epoch': 0.31} +2025-05-10 19:47:53 - ERROR - stderr - 10%|█ | 381/3741 [2:22:00<19:35:54, 21.00s/it] +2025-05-10 19:48:17 - ERROR - stderr - 10%|█ | 382/3741 [2:22:23<20:20:42, 21.80s/it] +2025-05-10 19:48:17 - ERROR - stderr - +2025-05-10 19:48:17 - ERROR - stderr - +2025-05-10 19:48:17 - INFO - stdout - {'loss': 1.0042, 'grad_norm': 0.6943206787109375, 'learning_rate': 1.9729930727338004e-05, 'epoch': 0.31} +2025-05-10 19:48:17 - ERROR - stderr - 10%|█ | 382/3741 [2:22:23<20:20:42, 21.80s/it] +2025-05-10 19:48:37 - ERROR - stderr - 10%|█ | 383/3741 [2:22:43<19:44:23, 21.16s/it] +2025-05-10 19:48:37 - ERROR - stderr - +2025-05-10 19:48:37 - ERROR - stderr - +2025-05-10 19:48:37 - INFO - stdout - {'loss': 0.9527, 'grad_norm': 0.6898202300071716, 'learning_rate': 1.972792821698001e-05, 'epoch': 0.31} +2025-05-10 19:48:37 - ERROR - stderr - 10%|█ | 383/3741 [2:22:43<19:44:23, 21.16s/it] +2025-05-10 19:49:02 - ERROR - stderr - 10%|█ | 384/3741 [2:23:08<20:58:10, 22.49s/it] +2025-05-10 19:49:02 - ERROR - stderr - +2025-05-10 19:49:02 - ERROR - stderr - +2025-05-10 19:49:02 - INFO - stdout - {'loss': 0.9755, 'grad_norm': 0.7329158782958984, 'learning_rate': 1.9725918412290142e-05, 'epoch': 0.31} +2025-05-10 19:49:02 - ERROR - stderr - 10%|█ | 384/3741 [2:23:08<20:58:10, 22.49s/it] +2025-05-10 19:49:23 - ERROR - stderr - 10%|█ | 385/3741 [2:23:29<20:28:12, 21.96s/it] +2025-05-10 19:49:23 - ERROR - stderr - +2025-05-10 19:49:23 - ERROR - stderr - +2025-05-10 19:49:23 - INFO - stdout - {'loss': 1.0134, 'grad_norm': 0.7760060429573059, 'learning_rate': 1.9723901314775423e-05, 'epoch': 0.31} +2025-05-10 19:49:23 - ERROR - stderr - 10%|█ | 385/3741 [2:23:29<20:28:12, 21.96s/it] +2025-05-10 19:49:47 - ERROR - stderr - 10%|█ | 386/3741 [2:23:53<20:56:51, 22.48s/it] +2025-05-10 19:49:47 - ERROR - stderr - +2025-05-10 19:49:47 - ERROR - stderr - +2025-05-10 19:49:47 - INFO - stdout - {'loss': 0.973, 'grad_norm': 0.6890391111373901, 'learning_rate': 1.9721876925948336e-05, 'epoch': 0.31} +2025-05-10 19:49:47 - ERROR - stderr - 10%|█ | 386/3741 [2:23:53<20:56:51, 22.48s/it] +2025-05-10 19:50:06 - ERROR - stderr - 10%|█ | 387/3741 [2:24:12<20:08:04, 21.61s/it] +2025-05-10 19:50:06 - ERROR - stderr - +2025-05-10 19:50:06 - ERROR - stderr - +2025-05-10 19:50:06 - INFO - stdout - {'loss': 1.0187, 'grad_norm': 0.6739248037338257, 'learning_rate': 1.971984524732684e-05, 'epoch': 0.31} +2025-05-10 19:50:06 - ERROR - stderr - 10%|█ | 387/3741 [2:24:12<20:08:04, 21.61s/it] +2025-05-10 19:50:30 - ERROR - stderr - 10%|█ | 388/3741 [2:24:37<20:51:13, 22.39s/it] +2025-05-10 19:50:30 - ERROR - stderr - +2025-05-10 19:50:30 - ERROR - stderr - +2025-05-10 19:50:30 - INFO - stdout - {'loss': 0.9873, 'grad_norm': 0.729393482208252, 'learning_rate': 1.971780628043436e-05, 'epoch': 0.31} +2025-05-10 19:50:30 - ERROR - stderr - 10%|█ | 388/3741 [2:24:37<20:51:13, 22.39s/it] +2025-05-10 19:50:50 - ERROR - stderr - 10%|█ | 389/3741 [2:24:56<20:06:01, 21.59s/it] +2025-05-10 19:50:50 - ERROR - stderr - +2025-05-10 19:50:50 - ERROR - stderr - +2025-05-10 19:50:50 - INFO - stdout - {'loss': 0.9992, 'grad_norm': 0.7092537879943848, 'learning_rate': 1.9715760026799776e-05, 'epoch': 0.31} +2025-05-10 19:50:50 - ERROR - stderr - 10%|█ | 389/3741 [2:24:56<20:06:01, 21.59s/it] +2025-05-10 19:50:51 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 19:50:51 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 19:51:13 - ERROR - stderr - 10%|█ | 390/3741 [2:25:20<20:36:46, 22.14s/it] +2025-05-10 19:51:14 - ERROR - stderr - +2025-05-10 19:51:14 - ERROR - stderr - +2025-05-10 19:51:14 - INFO - stdout - {'loss': 0.9793, 'grad_norm': 0.657490074634552, 'learning_rate': 1.971370648795744e-05, 'epoch': 0.31} +2025-05-10 19:51:14 - ERROR - stderr - 10%|█ | 390/3741 [2:25:20<20:36:46, 22.14s/it] +2025-05-10 19:51:38 - ERROR - stderr - 10%|█ | 391/3741 [2:25:44<21:11:52, 22.78s/it] +2025-05-10 19:51:38 - ERROR - stderr - +2025-05-10 19:51:38 - ERROR - stderr - +2025-05-10 19:51:38 - INFO - stdout - {'loss': 0.9412, 'grad_norm': 0.7291781902313232, 'learning_rate': 1.971164566544717e-05, 'epoch': 0.31} +2025-05-10 19:51:38 - ERROR - stderr - 10%|█ | 391/3741 [2:25:44<21:11:52, 22.78s/it] +2025-05-10 19:52:03 - ERROR - stderr - 10%|█ | 392/3741 [2:26:10<21:59:19, 23.64s/it] +2025-05-10 19:52:03 - ERROR - stderr - +2025-05-10 19:52:03 - ERROR - stderr - +2025-05-10 19:52:03 - INFO - stdout - {'loss': 0.994, 'grad_norm': 0.8005679845809937, 'learning_rate': 1.970957756081424e-05, 'epoch': 0.31} +2025-05-10 19:52:03 - ERROR - stderr - 10%|█ | 392/3741 [2:26:10<21:59:19, 23.64s/it] +2025-05-10 19:52:23 - ERROR - stderr - 11%|█ | 393/3741 [2:26:30<20:55:59, 22.51s/it] +2025-05-10 19:52:23 - ERROR - stderr - +2025-05-10 19:52:23 - ERROR - stderr - +2025-05-10 19:52:23 - INFO - stdout - {'loss': 1.0069, 'grad_norm': 0.6840459704399109, 'learning_rate': 1.9707502175609377e-05, 'epoch': 0.32} +2025-05-10 19:52:23 - ERROR - stderr - 11%|█ | 393/3741 [2:26:30<20:55:59, 22.51s/it] +2025-05-10 19:52:49 - ERROR - stderr - 11%|█ | 394/3741 [2:26:55<21:46:35, 23.42s/it] +2025-05-10 19:52:49 - ERROR - stderr - +2025-05-10 19:52:49 - ERROR - stderr - +2025-05-10 19:52:49 - INFO - stdout - {'loss': 0.949, 'grad_norm': 0.6707590222358704, 'learning_rate': 1.9705419511388784e-05, 'epoch': 0.32} +2025-05-10 19:52:49 - ERROR - stderr - 11%|█ | 394/3741 [2:26:55<21:46:35, 23.42s/it] +2025-05-10 19:53:09 - ERROR - stderr - 11%|█ | 395/3741 [2:27:15<20:44:01, 22.31s/it] +2025-05-10 19:53:09 - ERROR - stderr - +2025-05-10 19:53:09 - ERROR - stderr - +2025-05-10 19:53:09 - INFO - stdout - {'loss': 1.0329, 'grad_norm': 0.66581130027771, 'learning_rate': 1.9703329569714114e-05, 'epoch': 0.32} +2025-05-10 19:53:09 - ERROR - stderr - 11%|█ | 395/3741 [2:27:15<20:44:01, 22.31s/it] +2025-05-10 19:53:33 - ERROR - stderr - 11%|█ | 396/3741 [2:27:40<21:27:06, 23.09s/it] +2025-05-10 19:53:33 - ERROR - stderr - +2025-05-10 19:53:33 - ERROR - stderr - +2025-05-10 19:53:33 - INFO - stdout - {'loss': 0.9578, 'grad_norm': 0.7102177739143372, 'learning_rate': 1.9701232352152472e-05, 'epoch': 0.32} +2025-05-10 19:53:33 - ERROR - stderr - 11%|█ | 396/3741 [2:27:40<21:27:06, 23.09s/it] +2025-05-10 19:53:53 - ERROR - stderr - 11%|█ | 397/3741 [2:28:00<20:33:32, 22.13s/it] +2025-05-10 19:53:53 - ERROR - stderr - +2025-05-10 19:53:53 - ERROR - stderr - +2025-05-10 19:53:53 - INFO - stdout - {'loss': 0.989, 'grad_norm': 0.7258641719818115, 'learning_rate': 1.9699127860276426e-05, 'epoch': 0.32} +2025-05-10 19:53:53 - ERROR - stderr - 11%|█ | 397/3741 [2:28:00<20:33:32, 22.13s/it] +2025-05-10 19:54:17 - ERROR - stderr - 11%|█ | 398/3741 [2:28:24<21:07:05, 22.74s/it] +2025-05-10 19:54:18 - ERROR - stderr - +2025-05-10 19:54:18 - ERROR - stderr - +2025-05-10 19:54:18 - INFO - stdout - {'loss': 0.9845, 'grad_norm': 0.646165668964386, 'learning_rate': 1.969701609566399e-05, 'epoch': 0.32} +2025-05-10 19:54:18 - ERROR - stderr - 11%|█ | 398/3741 [2:28:24<21:07:05, 22.74s/it] +2025-05-10 19:54:38 - ERROR - stderr - 11%|█ | 399/3741 [2:28:44<20:27:54, 22.04s/it] +2025-05-10 19:54:38 - ERROR - stderr - +2025-05-10 19:54:38 - ERROR - stderr - +2025-05-10 19:54:38 - INFO - stdout - {'loss': 1.009, 'grad_norm': 0.6825928688049316, 'learning_rate': 1.9694897059898648e-05, 'epoch': 0.32} +2025-05-10 19:54:38 - ERROR - stderr - 11%|█ | 399/3741 [2:28:44<20:27:54, 22.04s/it] +2025-05-10 19:55:01 - ERROR - stderr - 11%|█ | 400/3741 [2:29:07<20:40:11, 22.27s/it] +2025-05-10 19:55:01 - ERROR - stderr - +2025-05-10 19:55:01 - ERROR - stderr - +2025-05-10 19:55:01 - INFO - stdout - {'loss': 0.966, 'grad_norm': 0.7049286365509033, 'learning_rate': 1.9692770754569316e-05, 'epoch': 0.32} +2025-05-10 19:55:01 - ERROR - stderr - 11%|█ | 400/3741 [2:29:07<20:40:11, 22.27s/it] +2025-05-10 19:55:20 - ERROR - stderr - 11%|█ | 401/3741 [2:29:27<19:56:36, 21.50s/it] +2025-05-10 19:55:20 - ERROR - stderr - +2025-05-10 19:55:20 - ERROR - stderr - +2025-05-10 19:55:20 - INFO - stdout - {'loss': 0.9642, 'grad_norm': 0.6968984603881836, 'learning_rate': 1.9690637181270372e-05, 'epoch': 0.32} +2025-05-10 19:55:20 - ERROR - stderr - 11%|█ | 401/3741 [2:29:27<19:56:36, 21.50s/it] +2025-05-10 19:55:40 - ERROR - stderr - 11%|█ | 402/3741 [2:29:46<19:22:56, 20.90s/it] +2025-05-10 19:55:40 - ERROR - stderr - +2025-05-10 19:55:40 - ERROR - stderr - +2025-05-10 19:55:40 - INFO - stdout - {'loss': 0.9723, 'grad_norm': 0.7258249521255493, 'learning_rate': 1.9688496341601647e-05, 'epoch': 0.32} +2025-05-10 19:55:40 - ERROR - stderr - 11%|█ | 402/3741 [2:29:46<19:22:56, 20.90s/it] +2025-05-10 19:56:00 - ERROR - stderr - 11%|█ | 403/3741 [2:30:06<19:02:48, 20.54s/it] +2025-05-10 19:56:00 - ERROR - stderr - +2025-05-10 19:56:00 - ERROR - stderr - +2025-05-10 19:56:00 - INFO - stdout - {'loss': 0.9803, 'grad_norm': 0.6794790029525757, 'learning_rate': 1.9686348237168408e-05, 'epoch': 0.32} +2025-05-10 19:56:00 - ERROR - stderr - 11%|█ | 403/3741 [2:30:06<19:02:48, 20.54s/it] +2025-05-10 19:56:19 - ERROR - stderr - 11%|█ | 404/3741 [2:30:26<18:46:43, 20.26s/it] +2025-05-10 19:56:19 - ERROR - stderr - +2025-05-10 19:56:19 - ERROR - stderr - +2025-05-10 19:56:19 - INFO - stdout - {'loss': 0.9987, 'grad_norm': 0.666203498840332, 'learning_rate': 1.9684192869581376e-05, 'epoch': 0.32} +2025-05-10 19:56:19 - ERROR - stderr - 11%|█ | 404/3741 [2:30:26<18:46:43, 20.26s/it] +2025-05-10 19:56:42 - ERROR - stderr - 11%|█ | 405/3741 [2:30:48<19:23:48, 20.93s/it] +2025-05-10 19:56:42 - ERROR - stderr - +2025-05-10 19:56:42 - ERROR - stderr - +2025-05-10 19:56:42 - INFO - stdout - {'loss': 0.9196, 'grad_norm': 0.6866022348403931, 'learning_rate': 1.968203024045673e-05, 'epoch': 0.32} +2025-05-10 19:56:42 - ERROR - stderr - 11%|█ | 405/3741 [2:30:48<19:23:48, 20.93s/it] +2025-05-10 19:57:01 - ERROR - stderr - 11%|█ | 406/3741 [2:31:08<19:00:13, 20.51s/it] +2025-05-10 19:57:01 - ERROR - stderr - +2025-05-10 19:57:01 - ERROR - stderr - +2025-05-10 19:57:01 - INFO - stdout - {'loss': 0.9677, 'grad_norm': 0.6703433990478516, 'learning_rate': 1.9679860351416075e-05, 'epoch': 0.33} +2025-05-10 19:57:01 - ERROR - stderr - 11%|█ | 406/3741 [2:31:08<19:00:13, 20.51s/it] +2025-05-10 19:57:27 - ERROR - stderr - 11%|█ | 407/3741 [2:31:34<20:30:27, 22.14s/it] +2025-05-10 19:57:27 - ERROR - stderr - +2025-05-10 19:57:27 - ERROR - stderr - +2025-05-10 19:57:27 - INFO - stdout - {'loss': 0.9897, 'grad_norm': 0.7272356152534485, 'learning_rate': 1.967768320408647e-05, 'epoch': 0.33} +2025-05-10 19:57:27 - ERROR - stderr - 11%|█ | 407/3741 [2:31:34<20:30:27, 22.14s/it] +2025-05-10 19:57:47 - ERROR - stderr - 11%|█ | 408/3741 [2:31:53<19:47:16, 21.37s/it] +2025-05-10 19:57:47 - ERROR - stderr - +2025-05-10 19:57:47 - ERROR - stderr - +2025-05-10 19:57:47 - INFO - stdout - {'loss': 0.9674, 'grad_norm': 0.7687215805053711, 'learning_rate': 1.967549880010041e-05, 'epoch': 0.33} +2025-05-10 19:57:47 - ERROR - stderr - 11%|█ | 408/3741 [2:31:53<19:47:16, 21.37s/it] +2025-05-10 19:58:13 - ERROR - stderr - 11%|█ | 409/3741 [2:32:19<21:01:58, 22.72s/it] +2025-05-10 19:58:13 - ERROR - stderr - +2025-05-10 19:58:13 - ERROR - stderr - +2025-05-10 19:58:13 - INFO - stdout - {'loss': 1.0214, 'grad_norm': 0.6770045757293701, 'learning_rate': 1.967330714109584e-05, 'epoch': 0.33} +2025-05-10 19:58:13 - ERROR - stderr - 11%|█ | 409/3741 [2:32:19<21:01:58, 22.72s/it] +2025-05-10 19:58:14 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5949 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 19:58:14 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5949 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 19:58:32 - ERROR - stderr - 11%|█ | 410/3741 [2:32:39<20:10:40, 21.81s/it] +2025-05-10 19:58:32 - ERROR - stderr - +2025-05-10 19:58:32 - ERROR - stderr - +2025-05-10 19:58:32 - INFO - stdout - {'loss': 1.0004, 'grad_norm': 0.7255867123603821, 'learning_rate': 1.9671108228716142e-05, 'epoch': 0.33} +2025-05-10 19:58:32 - ERROR - stderr - 11%|█ | 410/3741 [2:32:39<20:10:40, 21.81s/it] +2025-05-10 19:59:01 - ERROR - stderr - 11%|█ | 411/3741 [2:33:08<22:11:33, 23.99s/it] +2025-05-10 19:59:01 - ERROR - stderr - +2025-05-10 19:59:01 - ERROR - stderr - +2025-05-10 19:59:01 - INFO - stdout - {'loss': 0.9386, 'grad_norm': 0.6760930418968201, 'learning_rate': 1.9668902064610128e-05, 'epoch': 0.33} +2025-05-10 19:59:01 - ERROR - stderr - 11%|█ | 411/3741 [2:33:08<22:11:33, 23.99s/it] +2025-05-10 19:59:21 - ERROR - stderr - 11%|█ | 412/3741 [2:33:27<20:57:33, 22.67s/it] +2025-05-10 19:59:21 - ERROR - stderr - +2025-05-10 19:59:21 - ERROR - stderr - +2025-05-10 19:59:21 - INFO - stdout - {'loss': 0.9547, 'grad_norm': 0.708378255367279, 'learning_rate': 1.9666688650432063e-05, 'epoch': 0.33} +2025-05-10 19:59:21 - ERROR - stderr - 11%|█ | 412/3741 [2:33:27<20:57:33, 22.67s/it] +2025-05-10 19:59:45 - ERROR - stderr - 11%|█ | 413/3741 [2:33:51<21:17:04, 23.02s/it] +2025-05-10 19:59:45 - ERROR - stderr - +2025-05-10 19:59:45 - ERROR - stderr - +2025-05-10 19:59:45 - INFO - stdout - {'loss': 0.9848, 'grad_norm': 0.7338705658912659, 'learning_rate': 1.9664467987841632e-05, 'epoch': 0.33} +2025-05-10 19:59:45 - ERROR - stderr - 11%|█ | 413/3741 [2:33:51<21:17:04, 23.02s/it] +2025-05-10 20:00:05 - ERROR - stderr - 11%|█ | 414/3741 [2:34:11<20:26:49, 22.12s/it] +2025-05-10 20:00:05 - ERROR - stderr - +2025-05-10 20:00:05 - ERROR - stderr - +2025-05-10 20:00:05 - INFO - stdout - {'loss': 1.0165, 'grad_norm': 0.6751573085784912, 'learning_rate': 1.9662240078503975e-05, 'epoch': 0.33} +2025-05-10 20:00:05 - ERROR - stderr - 11%|█ | 414/3741 [2:34:11<20:26:49, 22.12s/it] +2025-05-10 20:00:30 - ERROR - stderr - 11%|█ | 415/3741 [2:34:36<21:12:07, 22.95s/it] +2025-05-10 20:00:30 - ERROR - stderr - +2025-05-10 20:00:30 - ERROR - stderr - +2025-05-10 20:00:30 - INFO - stdout - {'loss': 0.9326, 'grad_norm': 0.6512075066566467, 'learning_rate': 1.9660004924089644e-05, 'epoch': 0.33} +2025-05-10 20:00:30 - ERROR - stderr - 11%|█ | 415/3741 [2:34:36<21:12:07, 22.95s/it] +2025-05-10 20:00:49 - ERROR - stderr - 11%|█ | 416/3741 [2:34:56<20:15:21, 21.93s/it] +2025-05-10 20:00:49 - ERROR - stderr - +2025-05-10 20:00:49 - ERROR - stderr - +2025-05-10 20:00:49 - INFO - stdout - {'loss': 0.9494, 'grad_norm': 0.6776466369628906, 'learning_rate': 1.965776252627464e-05, 'epoch': 0.33} +2025-05-10 20:00:49 - ERROR - stderr - 11%|█ | 416/3741 [2:34:56<20:15:21, 21.93s/it] +2025-05-10 20:01:15 - ERROR - stderr - 11%|█ | 417/3741 [2:35:22<21:23:49, 23.17s/it] +2025-05-10 20:01:15 - ERROR - stderr - +2025-05-10 20:01:15 - ERROR - stderr - +2025-05-10 20:01:15 - INFO - stdout - {'loss': 0.9866, 'grad_norm': 0.7271110415458679, 'learning_rate': 1.9655512886740383e-05, 'epoch': 0.33} +2025-05-10 20:01:15 - ERROR - stderr - 11%|█ | 417/3741 [2:35:22<21:23:49, 23.17s/it] +2025-05-10 20:01:35 - ERROR - stderr - 11%|█ | 418/3741 [2:35:41<20:24:43, 22.11s/it] +2025-05-10 20:01:35 - ERROR - stderr - +2025-05-10 20:01:35 - ERROR - stderr - +2025-05-10 20:01:35 - INFO - stdout - {'loss': 0.9433, 'grad_norm': 0.6701963543891907, 'learning_rate': 1.9653256007173735e-05, 'epoch': 0.34} +2025-05-10 20:01:35 - ERROR - stderr - 11%|█ | 418/3741 [2:35:41<20:24:43, 22.11s/it] +2025-05-10 20:02:01 - ERROR - stderr - 11%|█ | 419/3741 [2:36:07<21:31:34, 23.33s/it] +2025-05-10 20:02:01 - ERROR - stderr - +2025-05-10 20:02:01 - ERROR - stderr - +2025-05-10 20:02:01 - INFO - stdout - {'loss': 1.0014, 'grad_norm': 0.7227078080177307, 'learning_rate': 1.965099188926698e-05, 'epoch': 0.34} +2025-05-10 20:02:01 - ERROR - stderr - 11%|█ | 419/3741 [2:36:08<21:31:34, 23.33s/it] +2025-05-10 20:02:21 - ERROR - stderr - 11%|█ | 420/3741 [2:36:27<20:25:03, 22.13s/it] +2025-05-10 20:02:21 - ERROR - stderr - +2025-05-10 20:02:21 - ERROR - stderr - +2025-05-10 20:02:21 - INFO - stdout - {'loss': 0.9595, 'grad_norm': 0.6832866668701172, 'learning_rate': 1.964872053471783e-05, 'epoch': 0.34} +2025-05-10 20:02:21 - ERROR - stderr - 11%|█ | 420/3741 [2:36:27<20:25:03, 22.13s/it] +2025-05-10 20:02:46 - ERROR - stderr - 11%|█▏ | 421/3741 [2:36:53<21:24:20, 23.21s/it] +2025-05-10 20:02:46 - ERROR - stderr - +2025-05-10 20:02:46 - ERROR - stderr - +2025-05-10 20:02:46 - INFO - stdout - {'loss': 0.9574, 'grad_norm': 0.6797085404396057, 'learning_rate': 1.9646441945229424e-05, 'epoch': 0.34} +2025-05-10 20:02:46 - ERROR - stderr - 11%|█▏ | 421/3741 [2:36:53<21:24:20, 23.21s/it] +2025-05-10 20:03:06 - ERROR - stderr - 11%|█▏ | 422/3741 [2:37:12<20:25:08, 22.15s/it] +2025-05-10 20:03:06 - ERROR - stderr - +2025-05-10 20:03:06 - ERROR - stderr - +2025-05-10 20:03:06 - INFO - stdout - {'loss': 0.98, 'grad_norm': 0.6965078711509705, 'learning_rate': 1.9644156122510326e-05, 'epoch': 0.34} +2025-05-10 20:03:06 - ERROR - stderr - 11%|█▏ | 422/3741 [2:37:12<20:25:08, 22.15s/it] +2025-05-10 20:03:32 - ERROR - stderr - 11%|█▏ | 423/3741 [2:37:39<21:33:53, 23.40s/it] +2025-05-10 20:03:32 - ERROR - stderr - +2025-05-10 20:03:32 - ERROR - stderr - +2025-05-10 20:03:32 - INFO - stdout - {'loss': 0.9619, 'grad_norm': 0.6841316223144531, 'learning_rate': 1.9641863068274523e-05, 'epoch': 0.34} +2025-05-10 20:03:32 - ERROR - stderr - 11%|█▏ | 423/3741 [2:37:39<21:33:53, 23.40s/it] +2025-05-10 20:03:52 - ERROR - stderr - 11%|█▏ | 424/3741 [2:37:58<20:28:21, 22.22s/it] +2025-05-10 20:03:52 - ERROR - stderr - +2025-05-10 20:03:52 - ERROR - stderr - +2025-05-10 20:03:52 - INFO - stdout - {'loss': 0.979, 'grad_norm': 0.7349586486816406, 'learning_rate': 1.9639562784241426e-05, 'epoch': 0.34} +2025-05-10 20:03:52 - ERROR - stderr - 11%|█▏ | 424/3741 [2:37:58<20:28:21, 22.22s/it] +2025-05-10 20:04:13 - ERROR - stderr - 11%|█▏ | 425/3741 [2:38:19<20:10:07, 21.90s/it] +2025-05-10 20:04:13 - ERROR - stderr - +2025-05-10 20:04:13 - ERROR - stderr - +2025-05-10 20:04:13 - INFO - stdout - {'loss': 0.9508, 'grad_norm': 0.6306387782096863, 'learning_rate': 1.9637255272135863e-05, 'epoch': 0.34} +2025-05-10 20:04:13 - ERROR - stderr - 11%|█▏ | 425/3741 [2:38:19<20:10:07, 21.90s/it] +2025-05-10 20:04:33 - ERROR - stderr - 11%|█▏ | 426/3741 [2:38:39<19:34:19, 21.25s/it] +2025-05-10 20:04:33 - ERROR - stderr - +2025-05-10 20:04:33 - ERROR - stderr - +2025-05-10 20:04:33 - INFO - stdout - {'loss': 0.9172, 'grad_norm': 0.6524477601051331, 'learning_rate': 1.9634940533688094e-05, 'epoch': 0.34} +2025-05-10 20:04:33 - ERROR - stderr - 11%|█▏ | 426/3741 [2:38:39<19:34:19, 21.25s/it] +2025-05-10 20:04:52 - ERROR - stderr - 11%|█▏ | 427/3741 [2:38:59<19:08:26, 20.79s/it] +2025-05-10 20:04:52 - ERROR - stderr - +2025-05-10 20:04:52 - ERROR - stderr - +2025-05-10 20:04:52 - INFO - stdout - {'loss': 0.986, 'grad_norm': 0.6789130568504333, 'learning_rate': 1.9632618570633782e-05, 'epoch': 0.34} +2025-05-10 20:04:52 - ERROR - stderr - 11%|█▏ | 427/3741 [2:38:59<19:08:26, 20.79s/it] +2025-05-10 20:05:15 - ERROR - stderr - 11%|█▏ | 428/3741 [2:39:21<19:35:56, 21.30s/it] +2025-05-10 20:05:15 - ERROR - stderr - +2025-05-10 20:05:15 - ERROR - stderr - +2025-05-10 20:05:15 - INFO - stdout - {'loss': 0.9511, 'grad_norm': 0.6674696803092957, 'learning_rate': 1.9630289384714014e-05, 'epoch': 0.34} +2025-05-10 20:05:15 - ERROR - stderr - 11%|█▏ | 428/3741 [2:39:21<19:35:56, 21.30s/it] +2025-05-10 20:05:34 - ERROR - stderr - 11%|█▏ | 429/3741 [2:39:41<19:06:21, 20.77s/it] +2025-05-10 20:05:34 - ERROR - stderr - +2025-05-10 20:05:34 - ERROR - stderr - +2025-05-10 20:05:34 - INFO - stdout - {'loss': 0.9889, 'grad_norm': 0.703323245048523, 'learning_rate': 1.9627952977675292e-05, 'epoch': 0.34} +2025-05-10 20:05:34 - ERROR - stderr - 11%|█▏ | 429/3741 [2:39:41<19:06:21, 20.77s/it] +2025-05-10 20:05:59 - ERROR - stderr - 11%|█▏ | 430/3741 [2:40:06<20:14:11, 22.00s/it] +2025-05-10 20:05:59 - ERROR - stderr - +2025-05-10 20:05:59 - ERROR - stderr - +2025-05-10 20:05:59 - INFO - stdout - {'loss': 0.9957, 'grad_norm': 0.670559823513031, 'learning_rate': 1.962560935126954e-05, 'epoch': 0.34} +2025-05-10 20:05:59 - ERROR - stderr - 11%|█▏ | 430/3741 [2:40:06<20:14:11, 22.00s/it] +2025-05-10 20:06:19 - ERROR - stderr - 12%|█▏ | 431/3741 [2:40:25<19:30:34, 21.22s/it] +2025-05-10 20:06:19 - ERROR - stderr - +2025-05-10 20:06:19 - ERROR - stderr - +2025-05-10 20:06:19 - INFO - stdout - {'loss': 0.9794, 'grad_norm': 0.6248535513877869, 'learning_rate': 1.962325850725408e-05, 'epoch': 0.35} +2025-05-10 20:06:19 - ERROR - stderr - 12%|█▏ | 431/3741 [2:40:25<19:30:34, 21.22s/it] +2025-05-10 20:06:43 - ERROR - stderr - 12%|█▏ | 432/3741 [2:40:49<20:22:05, 22.16s/it] +2025-05-10 20:06:43 - ERROR - stderr - +2025-05-10 20:06:43 - ERROR - stderr - +2025-05-10 20:06:43 - INFO - stdout - {'loss': 0.951, 'grad_norm': 0.6565462946891785, 'learning_rate': 1.9620900447391663e-05, 'epoch': 0.35} +2025-05-10 20:06:43 - ERROR - stderr - 12%|█▏ | 432/3741 [2:40:49<20:22:05, 22.16s/it] +2025-05-10 20:07:03 - ERROR - stderr - 12%|█▏ | 433/3741 [2:41:09<19:44:48, 21.49s/it] +2025-05-10 20:07:03 - ERROR - stderr - +2025-05-10 20:07:03 - ERROR - stderr - +2025-05-10 20:07:03 - INFO - stdout - {'loss': 1.0089, 'grad_norm': 0.6794214844703674, 'learning_rate': 1.9618535173450434e-05, 'epoch': 0.35} +2025-05-10 20:07:03 - ERROR - stderr - 12%|█▏ | 433/3741 [2:41:09<19:44:48, 21.49s/it] +2025-05-10 20:07:28 - ERROR - stderr - 12%|█▏ | 434/3741 [2:41:35<20:51:34, 22.71s/it] +2025-05-10 20:07:28 - ERROR - stderr - +2025-05-10 20:07:28 - ERROR - stderr - +2025-05-10 20:07:28 - INFO - stdout - {'loss': 1.0291, 'grad_norm': 0.6835659742355347, 'learning_rate': 1.9616162687203966e-05, 'epoch': 0.35} +2025-05-10 20:07:28 - ERROR - stderr - 12%|█▏ | 434/3741 [2:41:35<20:51:34, 22.71s/it] +2025-05-10 20:07:48 - ERROR - stderr - 12%|█▏ | 435/3741 [2:41:54<19:57:34, 21.73s/it] +2025-05-10 20:07:48 - ERROR - stderr - +2025-05-10 20:07:48 - ERROR - stderr - +2025-05-10 20:07:48 - INFO - stdout - {'loss': 0.9599, 'grad_norm': 0.6657028794288635, 'learning_rate': 1.9613782990431223e-05, 'epoch': 0.35} +2025-05-10 20:07:48 - ERROR - stderr - 12%|█▏ | 435/3741 [2:41:54<19:57:34, 21.73s/it] +2025-05-10 20:08:11 - ERROR - stderr - 12%|█▏ | 436/3741 [2:42:18<20:23:55, 22.22s/it] +2025-05-10 20:08:11 - ERROR - stderr - +2025-05-10 20:08:11 - ERROR - stderr - +2025-05-10 20:08:11 - INFO - stdout - {'loss': 0.9688, 'grad_norm': 0.6608729362487793, 'learning_rate': 1.9611396084916587e-05, 'epoch': 0.35} +2025-05-10 20:08:11 - ERROR - stderr - 12%|█▏ | 436/3741 [2:42:18<20:23:55, 22.22s/it] +2025-05-10 20:08:31 - ERROR - stderr - 12%|█▏ | 437/3741 [2:42:38<19:48:03, 21.57s/it] +2025-05-10 20:08:31 - ERROR - stderr - +2025-05-10 20:08:31 - ERROR - stderr - +2025-05-10 20:08:31 - INFO - stdout - {'loss': 0.9353, 'grad_norm': 0.6747941374778748, 'learning_rate': 1.9609001972449834e-05, 'epoch': 0.35} +2025-05-10 20:08:31 - ERROR - stderr - 12%|█▏ | 437/3741 [2:42:38<19:48:03, 21.57s/it] +2025-05-10 20:08:52 - ERROR - stderr - 12%|█▏ | 438/3741 [2:42:58<19:25:36, 21.17s/it] +2025-05-10 20:08:52 - ERROR - stderr - +2025-05-10 20:08:52 - ERROR - stderr - +2025-05-10 20:08:52 - INFO - stdout - {'loss': 0.9698, 'grad_norm': 0.6459916234016418, 'learning_rate': 1.960660065482616e-05, 'epoch': 0.35} +2025-05-10 20:08:52 - ERROR - stderr - 12%|█▏ | 438/3741 [2:42:58<19:25:36, 21.17s/it] +2025-05-10 20:09:11 - ERROR - stderr - 12%|█▏ | 439/3741 [2:43:18<19:00:01, 20.72s/it] +2025-05-10 20:09:11 - ERROR - stderr - +2025-05-10 20:09:11 - ERROR - stderr - +2025-05-10 20:09:11 - INFO - stdout - {'loss': 0.9672, 'grad_norm': 0.6540493369102478, 'learning_rate': 1.9604192133846147e-05, 'epoch': 0.35} +2025-05-10 20:09:11 - ERROR - stderr - 12%|█▏ | 439/3741 [2:43:18<19:00:01, 20.72s/it] +2025-05-10 20:09:31 - ERROR - stderr - 12%|█▏ | 440/3741 [2:43:37<18:39:18, 20.34s/it] +2025-05-10 20:09:31 - ERROR - stderr - +2025-05-10 20:09:31 - ERROR - stderr - +2025-05-10 20:09:31 - INFO - stdout - {'loss': 0.9312, 'grad_norm': 0.6698585152626038, 'learning_rate': 1.960177641131579e-05, 'epoch': 0.35} +2025-05-10 20:09:31 - ERROR - stderr - 12%|█▏ | 440/3741 [2:43:37<18:39:18, 20.34s/it] +2025-05-10 20:09:51 - ERROR - stderr - 12%|█▏ | 441/3741 [2:43:57<18:36:42, 20.30s/it] +2025-05-10 20:09:51 - ERROR - stderr - +2025-05-10 20:09:51 - ERROR - stderr - +2025-05-10 20:09:51 - INFO - stdout - {'loss': 0.939, 'grad_norm': 0.6463971734046936, 'learning_rate': 1.959935348904648e-05, 'epoch': 0.35} +2025-05-10 20:09:51 - ERROR - stderr - 12%|█▏ | 441/3741 [2:43:57<18:36:42, 20.30s/it] +2025-05-10 20:10:10 - ERROR - stderr - 12%|█▏ | 442/3741 [2:44:17<18:22:39, 20.05s/it] +2025-05-10 20:10:10 - ERROR - stderr - +2025-05-10 20:10:10 - ERROR - stderr - +2025-05-10 20:10:10 - INFO - stdout - {'loss': 0.9928, 'grad_norm': 0.7033871412277222, 'learning_rate': 1.9596923368855006e-05, 'epoch': 0.35} +2025-05-10 20:10:10 - ERROR - stderr - 12%|█▏ | 442/3741 [2:44:17<18:22:39, 20.05s/it] +2025-05-10 20:10:34 - ERROR - stderr - 12%|█▏ | 443/3741 [2:44:40<19:21:11, 21.13s/it] +2025-05-10 20:10:34 - ERROR - stderr - +2025-05-10 20:10:34 - ERROR - stderr - +2025-05-10 20:10:34 - INFO - stdout - {'loss': 0.8954, 'grad_norm': 0.6210078597068787, 'learning_rate': 1.9594486052563556e-05, 'epoch': 0.36} +2025-05-10 20:10:34 - ERROR - stderr - 12%|█▏ | 443/3741 [2:44:40<19:21:11, 21.13s/it] +2025-05-10 20:10:54 - ERROR - stderr - 12%|█▏ | 444/3741 [2:45:00<18:56:50, 20.69s/it] +2025-05-10 20:10:54 - ERROR - stderr - +2025-05-10 20:10:54 - ERROR - stderr - +2025-05-10 20:10:54 - INFO - stdout - {'loss': 1.0045, 'grad_norm': 0.658398449420929, 'learning_rate': 1.959204154199971e-05, 'epoch': 0.36} +2025-05-10 20:10:54 - ERROR - stderr - 12%|█▏ | 444/3741 [2:45:00<18:56:50, 20.69s/it] +2025-05-10 20:11:18 - ERROR - stderr - 12%|█▏ | 445/3741 [2:45:24<19:55:02, 21.75s/it] +2025-05-10 20:11:18 - ERROR - stderr - +2025-05-10 20:11:18 - ERROR - stderr - +2025-05-10 20:11:18 - INFO - stdout - {'loss': 0.9873, 'grad_norm': 0.6751113533973694, 'learning_rate': 1.958958983899645e-05, 'epoch': 0.36} +2025-05-10 20:11:18 - ERROR - stderr - 12%|█▏ | 445/3741 [2:45:24<19:55:02, 21.75s/it] +2025-05-10 20:11:38 - ERROR - stderr - 12%|█▏ | 446/3741 [2:45:44<19:22:48, 21.17s/it] +2025-05-10 20:11:38 - ERROR - stderr - +2025-05-10 20:11:38 - ERROR - stderr - +2025-05-10 20:11:38 - INFO - stdout - {'loss': 0.9433, 'grad_norm': 0.6434077024459839, 'learning_rate': 1.958713094539214e-05, 'epoch': 0.36} +2025-05-10 20:11:38 - ERROR - stderr - 12%|█▏ | 446/3741 [2:45:44<19:22:48, 21.17s/it] +2025-05-10 20:12:02 - ERROR - stderr - 12%|█▏ | 447/3741 [2:46:08<20:15:02, 22.13s/it] +2025-05-10 20:12:02 - ERROR - stderr - +2025-05-10 20:12:02 - ERROR - stderr - +2025-05-10 20:12:02 - INFO - stdout - {'loss': 0.9705, 'grad_norm': 0.7159045338630676, 'learning_rate': 1.958466486303055e-05, 'epoch': 0.36} +2025-05-10 20:12:02 - ERROR - stderr - 12%|█▏ | 447/3741 [2:46:08<20:15:02, 22.13s/it] +2025-05-10 20:12:22 - ERROR - stderr - 12%|█▏ | 448/3741 [2:46:28<19:32:59, 21.37s/it] +2025-05-10 20:12:22 - ERROR - stderr - +2025-05-10 20:12:22 - ERROR - stderr - +2025-05-10 20:12:22 - INFO - stdout - {'loss': 0.9326, 'grad_norm': 0.6778410077095032, 'learning_rate': 1.9582191593760825e-05, 'epoch': 0.36} +2025-05-10 20:12:22 - ERROR - stderr - 12%|█▏ | 448/3741 [2:46:28<19:32:59, 21.37s/it] +2025-05-10 20:12:46 - ERROR - stderr - 12%|█▏ | 449/3741 [2:46:53<20:28:07, 22.38s/it] +2025-05-10 20:12:46 - ERROR - stderr - +2025-05-10 20:12:46 - ERROR - stderr - +2025-05-10 20:12:46 - INFO - stdout - {'loss': 0.9582, 'grad_norm': 0.6995593905448914, 'learning_rate': 1.957971113943751e-05, 'epoch': 0.36} +2025-05-10 20:12:46 - ERROR - stderr - 12%|█▏ | 449/3741 [2:46:53<20:28:07, 22.38s/it] +2025-05-10 20:13:06 - ERROR - stderr - 12%|█▏ | 450/3741 [2:47:13<19:46:24, 21.63s/it] +2025-05-10 20:13:06 - ERROR - stderr - +2025-05-10 20:13:06 - ERROR - stderr - +2025-05-10 20:13:06 - INFO - stdout - {'loss': 0.9635, 'grad_norm': 0.6641433835029602, 'learning_rate': 1.9577223501920532e-05, 'epoch': 0.36} +2025-05-10 20:13:06 - ERROR - stderr - 12%|█▏ | 450/3741 [2:47:13<19:46:24, 21.63s/it] +2025-05-10 20:13:31 - ERROR - stderr - 12%|█▏ | 451/3741 [2:47:38<20:40:30, 22.62s/it] +2025-05-10 20:13:31 - ERROR - stderr - +2025-05-10 20:13:31 - ERROR - stderr - +2025-05-10 20:13:31 - INFO - stdout - {'loss': 1.0151, 'grad_norm': 0.6719247698783875, 'learning_rate': 1.957472868307521e-05, 'epoch': 0.36} +2025-05-10 20:13:31 - ERROR - stderr - 12%|█▏ | 451/3741 [2:47:38<20:40:30, 22.62s/it] +2025-05-10 20:13:51 - ERROR - stderr - 12%|█▏ | 452/3741 [2:47:57<19:48:23, 21.68s/it] +2025-05-10 20:13:51 - ERROR - stderr - +2025-05-10 20:13:51 - ERROR - stderr - +2025-05-10 20:13:51 - INFO - stdout - {'loss': 0.9371, 'grad_norm': 0.6560412049293518, 'learning_rate': 1.9572226684772243e-05, 'epoch': 0.36} +2025-05-10 20:13:51 - ERROR - stderr - 12%|█▏ | 452/3741 [2:47:57<19:48:23, 21.68s/it] +2025-05-10 20:14:16 - ERROR - stderr - 12%|█▏ | 453/3741 [2:48:22<20:39:27, 22.62s/it] +2025-05-10 20:14:16 - ERROR - stderr - +2025-05-10 20:14:16 - ERROR - stderr - +2025-05-10 20:14:16 - INFO - stdout - {'loss': 0.9462, 'grad_norm': 0.6818994879722595, 'learning_rate': 1.956971750888771e-05, 'epoch': 0.36} +2025-05-10 20:14:16 - ERROR - stderr - 12%|█▏ | 453/3741 [2:48:22<20:39:27, 22.62s/it] +2025-05-10 20:14:35 - ERROR - stderr - 12%|█▏ | 454/3741 [2:48:41<19:48:25, 21.69s/it] +2025-05-10 20:14:35 - ERROR - stderr - +2025-05-10 20:14:35 - ERROR - stderr - +2025-05-10 20:14:35 - INFO - stdout - {'loss': 0.9549, 'grad_norm': 0.7130508422851562, 'learning_rate': 1.9567201157303086e-05, 'epoch': 0.36} +2025-05-10 20:14:35 - ERROR - stderr - 12%|█▏ | 454/3741 [2:48:41<19:48:25, 21.69s/it] +2025-05-10 20:15:00 - ERROR - stderr - 12%|█▏ | 455/3741 [2:49:06<20:35:04, 22.55s/it] +2025-05-10 20:15:00 - ERROR - stderr - +2025-05-10 20:15:00 - ERROR - stderr - +2025-05-10 20:15:00 - INFO - stdout - {'loss': 0.9852, 'grad_norm': 0.6851775050163269, 'learning_rate': 1.956467763190521e-05, 'epoch': 0.36} +2025-05-10 20:15:00 - ERROR - stderr - 12%|█▏ | 455/3741 [2:49:06<20:35:04, 22.55s/it] +2025-05-10 20:15:19 - ERROR - stderr - 12%|█▏ | 456/3741 [2:49:25<19:44:31, 21.64s/it] +2025-05-10 20:15:19 - ERROR - stderr - +2025-05-10 20:15:19 - ERROR - stderr - +2025-05-10 20:15:19 - INFO - stdout - {'loss': 0.9623, 'grad_norm': 0.6840097308158875, 'learning_rate': 1.9562146934586307e-05, 'epoch': 0.37} +2025-05-10 20:15:19 - ERROR - stderr - 12%|█▏ | 456/3741 [2:49:25<19:44:31, 21.64s/it] +2025-05-10 20:15:44 - ERROR - stderr - 12%|█▏ | 457/3741 [2:49:50<20:37:58, 22.62s/it] +2025-05-10 20:15:44 - ERROR - stderr - +2025-05-10 20:15:44 - ERROR - stderr - +2025-05-10 20:15:44 - INFO - stdout - {'loss': 0.9375, 'grad_norm': 0.6426949501037598, 'learning_rate': 1.955960906724398e-05, 'epoch': 0.37} +2025-05-10 20:15:44 - ERROR - stderr - 12%|█▏ | 457/3741 [2:49:50<20:37:58, 22.62s/it] +2025-05-10 20:16:04 - ERROR - stderr - 12%|█▏ | 458/3741 [2:50:10<19:49:05, 21.73s/it] +2025-05-10 20:16:04 - ERROR - stderr - +2025-05-10 20:16:04 - ERROR - stderr - +2025-05-10 20:16:04 - INFO - stdout - {'loss': 0.9336, 'grad_norm': 0.6146557927131653, 'learning_rate': 1.9557064031781216e-05, 'epoch': 0.37} +2025-05-10 20:16:04 - ERROR - stderr - 12%|█▏ | 458/3741 [2:50:10<19:49:05, 21.73s/it] +2025-05-10 20:16:28 - ERROR - stderr - 12%|█▏ | 459/3741 [2:50:35<20:35:12, 22.58s/it] +2025-05-10 20:16:28 - ERROR - stderr - +2025-05-10 20:16:28 - ERROR - stderr - +2025-05-10 20:16:28 - INFO - stdout - {'loss': 0.95, 'grad_norm': 0.6573811769485474, 'learning_rate': 1.9554511830106356e-05, 'epoch': 0.37} +2025-05-10 20:16:28 - ERROR - stderr - 12%|█▏ | 459/3741 [2:50:35<20:35:12, 22.58s/it] +2025-05-10 20:16:48 - ERROR - stderr - 12%|█▏ | 460/3741 [2:50:54<19:43:17, 21.64s/it] +2025-05-10 20:16:48 - ERROR - stderr - +2025-05-10 20:16:48 - ERROR - stderr - +2025-05-10 20:16:48 - INFO - stdout - {'loss': 0.946, 'grad_norm': 0.6667237877845764, 'learning_rate': 1.955195246413314e-05, 'epoch': 0.37} +2025-05-10 20:16:48 - ERROR - stderr - 12%|█▏ | 460/3741 [2:50:54<19:43:17, 21.64s/it] +2025-05-10 20:17:11 - ERROR - stderr - 12%|█▏ | 461/3741 [2:51:18<20:14:37, 22.22s/it] +2025-05-10 20:17:11 - ERROR - stderr - +2025-05-10 20:17:11 - ERROR - stderr - +2025-05-10 20:17:11 - INFO - stdout - {'loss': 0.9359, 'grad_norm': 0.6584280729293823, 'learning_rate': 1.9549385935780664e-05, 'epoch': 0.37} +2025-05-10 20:17:11 - ERROR - stderr - 12%|█▏ | 461/3741 [2:51:18<20:14:37, 22.22s/it] +2025-05-10 20:17:31 - ERROR - stderr - 12%|█▏ | 462/3741 [2:51:37<19:32:55, 21.46s/it] +2025-05-10 20:17:31 - ERROR - stderr - +2025-05-10 20:17:31 - ERROR - stderr - +2025-05-10 20:17:31 - INFO - stdout - {'loss': 0.9396, 'grad_norm': 0.6643354296684265, 'learning_rate': 1.9546812246973395e-05, 'epoch': 0.37} +2025-05-10 20:17:31 - ERROR - stderr - 12%|█▏ | 462/3741 [2:51:37<19:32:55, 21.46s/it] +2025-05-10 20:17:57 - ERROR - stderr - 12%|█▏ | 463/3741 [2:52:03<20:43:11, 22.76s/it] +2025-05-10 20:17:57 - ERROR - stderr - +2025-05-10 20:17:57 - ERROR - stderr - +2025-05-10 20:17:57 - INFO - stdout - {'loss': 0.9443, 'grad_norm': 0.6772244572639465, 'learning_rate': 1.9544231399641176e-05, 'epoch': 0.37} +2025-05-10 20:17:57 - ERROR - stderr - 12%|█▏ | 463/3741 [2:52:03<20:43:11, 22.76s/it] +2025-05-10 20:18:16 - ERROR - stderr - 12%|█▏ | 464/3741 [2:52:23<19:51:58, 21.82s/it] +2025-05-10 20:18:16 - ERROR - stderr - +2025-05-10 20:18:16 - ERROR - stderr - +2025-05-10 20:18:16 - INFO - stdout - {'loss': 0.9429, 'grad_norm': 0.6740161180496216, 'learning_rate': 1.954164339571921e-05, 'epoch': 0.37} +2025-05-10 20:18:16 - ERROR - stderr - 12%|█▏ | 464/3741 [2:52:23<19:51:58, 21.82s/it] +2025-05-10 20:18:40 - ERROR - stderr - 12%|█▏ | 465/3741 [2:52:46<20:14:40, 22.25s/it] +2025-05-10 20:18:40 - ERROR - stderr - +2025-05-10 20:18:40 - ERROR - stderr - +2025-05-10 20:18:40 - INFO - stdout - {'loss': 0.923, 'grad_norm': 0.70747309923172, 'learning_rate': 1.9539048237148078e-05, 'epoch': 0.37} +2025-05-10 20:18:40 - ERROR - stderr - 12%|█▏ | 465/3741 [2:52:46<20:14:40, 22.25s/it] +2025-05-10 20:18:59 - ERROR - stderr - 12%|█▏ | 466/3741 [2:53:06<19:31:14, 21.46s/it] +2025-05-10 20:18:59 - ERROR - stderr - +2025-05-10 20:18:59 - ERROR - stderr - +2025-05-10 20:18:59 - INFO - stdout - {'loss': 0.9421, 'grad_norm': 0.6899964809417725, 'learning_rate': 1.953644592587371e-05, 'epoch': 0.37} +2025-05-10 20:18:59 - ERROR - stderr - 12%|█▏ | 466/3741 [2:53:06<19:31:14, 21.46s/it] +2025-05-10 20:19:24 - ERROR - stderr - 12%|█▏ | 467/3741 [2:53:30<20:19:19, 22.35s/it] +2025-05-10 20:19:24 - ERROR - stderr - +2025-05-10 20:19:24 - ERROR - stderr - +2025-05-10 20:19:24 - INFO - stdout - {'loss': 0.9893, 'grad_norm': 0.6563026905059814, 'learning_rate': 1.953383646384741e-05, 'epoch': 0.37} +2025-05-10 20:19:24 - ERROR - stderr - 12%|█▏ | 467/3741 [2:53:30<20:19:19, 22.35s/it] +2025-05-10 20:19:44 - ERROR - stderr - 13%|█▎ | 468/3741 [2:53:50<19:43:54, 21.70s/it] +2025-05-10 20:19:44 - ERROR - stderr - +2025-05-10 20:19:44 - ERROR - stderr - +2025-05-10 20:19:44 - INFO - stdout - {'loss': 0.902, 'grad_norm': 0.624575674533844, 'learning_rate': 1.953121985302585e-05, 'epoch': 0.38} +2025-05-10 20:19:44 - ERROR - stderr - 13%|█▎ | 468/3741 [2:53:50<19:43:54, 21.70s/it] +2025-05-10 20:20:07 - ERROR - stderr - 13%|█▎ | 469/3741 [2:54:14<20:12:33, 22.24s/it] +2025-05-10 20:20:07 - ERROR - stderr - +2025-05-10 20:20:07 - ERROR - stderr - +2025-05-10 20:20:07 - INFO - stdout - {'loss': 0.9884, 'grad_norm': 0.6469770669937134, 'learning_rate': 1.952859609537104e-05, 'epoch': 0.38} +2025-05-10 20:20:07 - ERROR - stderr - 13%|█▎ | 469/3741 [2:54:14<20:12:33, 22.24s/it] +2025-05-10 20:20:27 - ERROR - stderr - 13%|█▎ | 470/3741 [2:54:33<19:28:25, 21.43s/it] +2025-05-10 20:20:27 - ERROR - stderr - +2025-05-10 20:20:27 - ERROR - stderr - +2025-05-10 20:20:27 - INFO - stdout - {'loss': 0.9554, 'grad_norm': 0.6481389999389648, 'learning_rate': 1.952596519285037e-05, 'epoch': 0.38} +2025-05-10 20:20:27 - ERROR - stderr - 13%|█▎ | 470/3741 [2:54:33<19:28:25, 21.43s/it] +2025-05-10 20:20:50 - ERROR - stderr - 13%|█▎ | 471/3741 [2:54:56<19:48:30, 21.81s/it] +2025-05-10 20:20:50 - ERROR - stderr - +2025-05-10 20:20:50 - ERROR - stderr - +2025-05-10 20:20:50 - INFO - stdout - {'loss': 0.9758, 'grad_norm': 0.65255206823349, 'learning_rate': 1.9523327147436585e-05, 'epoch': 0.38} +2025-05-10 20:20:50 - ERROR - stderr - 13%|█▎ | 471/3741 [2:54:56<19:48:30, 21.81s/it] +2025-05-10 20:21:09 - ERROR - stderr - 13%|█▎ | 472/3741 [2:55:15<19:11:09, 21.13s/it] +2025-05-10 20:21:09 - ERROR - stderr - +2025-05-10 20:21:09 - ERROR - stderr - +2025-05-10 20:21:09 - INFO - stdout - {'loss': 0.9768, 'grad_norm': 0.6691866517066956, 'learning_rate': 1.9520681961107772e-05, 'epoch': 0.38} +2025-05-10 20:21:09 - ERROR - stderr - 13%|█▎ | 472/3741 [2:55:15<19:11:09, 21.13s/it] +2025-05-10 20:21:31 - ERROR - stderr - 13%|█▎ | 473/3741 [2:55:37<19:19:24, 21.29s/it] +2025-05-10 20:21:31 - ERROR - stderr - +2025-05-10 20:21:31 - ERROR - stderr - +2025-05-10 20:21:31 - INFO - stdout - {'loss': 0.9436, 'grad_norm': 0.6792327165603638, 'learning_rate': 1.9518029635847387e-05, 'epoch': 0.38} +2025-05-10 20:21:31 - ERROR - stderr - 13%|█▎ | 473/3741 [2:55:37<19:19:24, 21.29s/it] +2025-05-10 20:21:50 - ERROR - stderr - 13%|█▎ | 474/3741 [2:55:57<18:50:51, 20.77s/it] +2025-05-10 20:21:50 - ERROR - stderr - +2025-05-10 20:21:50 - ERROR - stderr - +2025-05-10 20:21:50 - INFO - stdout - {'loss': 0.9722, 'grad_norm': 0.6820612549781799, 'learning_rate': 1.9515370173644235e-05, 'epoch': 0.38} +2025-05-10 20:21:50 - ERROR - stderr - 13%|█▎ | 474/3741 [2:55:57<18:50:51, 20.77s/it] +2025-05-10 20:22:13 - ERROR - stderr - 13%|█▎ | 475/3741 [2:56:19<19:20:01, 21.31s/it] +2025-05-10 20:22:13 - ERROR - stderr - +2025-05-10 20:22:13 - ERROR - stderr - +2025-05-10 20:22:13 - INFO - stdout - {'loss': 1.0122, 'grad_norm': 0.6797659397125244, 'learning_rate': 1.9512703576492466e-05, 'epoch': 0.38} +2025-05-10 20:22:13 - ERROR - stderr - 13%|█▎ | 475/3741 [2:56:19<19:20:01, 21.31s/it] +2025-05-10 20:22:33 - ERROR - stderr - 13%|█▎ | 476/3741 [2:56:39<18:57:23, 20.90s/it] +2025-05-10 20:22:33 - ERROR - stderr - +2025-05-10 20:22:33 - ERROR - stderr - +2025-05-10 20:22:33 - INFO - stdout - {'loss': 0.954, 'grad_norm': 0.6471715569496155, 'learning_rate': 1.9510029846391588e-05, 'epoch': 0.38} +2025-05-10 20:22:33 - ERROR - stderr - 13%|█▎ | 476/3741 [2:56:39<18:57:23, 20.90s/it] +2025-05-10 20:22:56 - ERROR - stderr - 13%|█▎ | 477/3741 [2:57:02<19:27:01, 21.45s/it] +2025-05-10 20:22:56 - ERROR - stderr - +2025-05-10 20:22:56 - ERROR - stderr - +2025-05-10 20:22:56 - INFO - stdout - {'loss': 0.9461, 'grad_norm': 0.7427453398704529, 'learning_rate': 1.9507348985346458e-05, 'epoch': 0.38} +2025-05-10 20:22:56 - ERROR - stderr - 13%|█▎ | 477/3741 [2:57:02<19:27:01, 21.45s/it] +2025-05-10 20:23:15 - ERROR - stderr - 13%|█▎ | 478/3741 [2:57:22<18:58:08, 20.93s/it] +2025-05-10 20:23:15 - ERROR - stderr - +2025-05-10 20:23:15 - ERROR - stderr - +2025-05-10 20:23:15 - INFO - stdout - {'loss': 0.9503, 'grad_norm': 0.7047792077064514, 'learning_rate': 1.9504660995367275e-05, 'epoch': 0.38} +2025-05-10 20:23:15 - ERROR - stderr - 13%|█▎ | 478/3741 [2:57:22<18:58:08, 20.93s/it] +2025-05-10 20:23:39 - ERROR - stderr - 13%|█▎ | 479/3741 [2:57:46<19:50:00, 21.89s/it] +2025-05-10 20:23:39 - ERROR - stderr - +2025-05-10 20:23:39 - ERROR - stderr - +2025-05-10 20:23:39 - INFO - stdout - {'loss': 0.9848, 'grad_norm': 0.6744017601013184, 'learning_rate': 1.950196587846958e-05, 'epoch': 0.38} +2025-05-10 20:23:39 - ERROR - stderr - 13%|█▎ | 479/3741 [2:57:46<19:50:00, 21.89s/it] +2025-05-10 20:23:59 - ERROR - stderr - 13%|█▎ | 480/3741 [2:58:05<19:09:16, 21.15s/it] +2025-05-10 20:23:59 - ERROR - stderr - +2025-05-10 20:23:59 - ERROR - stderr - +2025-05-10 20:23:59 - INFO - stdout - {'loss': 0.9156, 'grad_norm': 0.7120094895362854, 'learning_rate': 1.9499263636674273e-05, 'epoch': 0.38} +2025-05-10 20:23:59 - ERROR - stderr - 13%|█▎ | 480/3741 [2:58:05<19:09:16, 21.15s/it] +2025-05-10 20:24:22 - ERROR - stderr - 13%|█▎ | 481/3741 [2:58:29<19:45:22, 21.82s/it] +2025-05-10 20:24:22 - ERROR - stderr - +2025-05-10 20:24:22 - ERROR - stderr - +2025-05-10 20:24:22 - INFO - stdout - {'loss': 0.9404, 'grad_norm': 0.6583890914916992, 'learning_rate': 1.949655427200758e-05, 'epoch': 0.39} +2025-05-10 20:24:22 - ERROR - stderr - 13%|█▎ | 481/3741 [2:58:29<19:45:22, 21.82s/it] +2025-05-10 20:24:42 - ERROR - stderr - 13%|█▎ | 482/3741 [2:58:48<19:12:44, 21.22s/it] +2025-05-10 20:24:42 - ERROR - stderr - +2025-05-10 20:24:42 - ERROR - stderr - +2025-05-10 20:24:42 - INFO - stdout - {'loss': 0.9957, 'grad_norm': 0.7101068496704102, 'learning_rate': 1.9493837786501077e-05, 'epoch': 0.39} +2025-05-10 20:24:42 - ERROR - stderr - 13%|█▎ | 482/3741 [2:58:48<19:12:44, 21.22s/it] +2025-05-10 20:25:06 - ERROR - stderr - 13%|█▎ | 483/3741 [2:59:12<19:50:17, 21.92s/it] +2025-05-10 20:25:06 - ERROR - stderr - +2025-05-10 20:25:06 - ERROR - stderr - +2025-05-10 20:25:06 - INFO - stdout - {'loss': 1.0279, 'grad_norm': 0.7440847754478455, 'learning_rate': 1.949111418219168e-05, 'epoch': 0.39} +2025-05-10 20:25:06 - ERROR - stderr - 13%|█▎ | 483/3741 [2:59:12<19:50:17, 21.92s/it] +2025-05-10 20:25:25 - ERROR - stderr - 13%|█▎ | 484/3741 [2:59:31<19:10:10, 21.19s/it] +2025-05-10 20:25:25 - ERROR - stderr - +2025-05-10 20:25:25 - ERROR - stderr - +2025-05-10 20:25:25 - INFO - stdout - {'loss': 0.9855, 'grad_norm': 0.7091655135154724, 'learning_rate': 1.9488383461121634e-05, 'epoch': 0.39} +2025-05-10 20:25:25 - ERROR - stderr - 13%|█▎ | 484/3741 [2:59:31<19:10:10, 21.19s/it] +2025-05-10 20:25:51 - ERROR - stderr - 13%|█▎ | 485/3741 [2:59:57<20:21:22, 22.51s/it] +2025-05-10 20:25:51 - ERROR - stderr - +2025-05-10 20:25:51 - ERROR - stderr - +2025-05-10 20:25:51 - INFO - stdout - {'loss': 0.9564, 'grad_norm': 0.6298947334289551, 'learning_rate': 1.948564562533853e-05, 'epoch': 0.39} +2025-05-10 20:25:51 - ERROR - stderr - 13%|█▎ | 485/3741 [2:59:57<20:21:22, 22.51s/it] +2025-05-10 20:26:10 - ERROR - stderr - 13%|█▎ | 486/3741 [3:00:16<19:30:58, 21.58s/it] +2025-05-10 20:26:10 - ERROR - stderr - +2025-05-10 20:26:10 - ERROR - stderr - +2025-05-10 20:26:10 - INFO - stdout - {'loss': 0.9372, 'grad_norm': 0.6431513428688049, 'learning_rate': 1.9482900676895297e-05, 'epoch': 0.39} +2025-05-10 20:26:10 - ERROR - stderr - 13%|█▎ | 486/3741 [3:00:16<19:30:58, 21.58s/it] +2025-05-10 20:26:35 - ERROR - stderr - 13%|█▎ | 487/3741 [3:00:41<20:17:48, 22.45s/it] +2025-05-10 20:26:35 - ERROR - stderr - +2025-05-10 20:26:35 - ERROR - stderr - +2025-05-10 20:26:35 - INFO - stdout - {'loss': 0.9654, 'grad_norm': 0.7604116201400757, 'learning_rate': 1.948014861785018e-05, 'epoch': 0.39} +2025-05-10 20:26:35 - ERROR - stderr - 13%|█▎ | 487/3741 [3:00:41<20:17:48, 22.45s/it] +2025-05-10 20:26:54 - ERROR - stderr - 13%|█▎ | 488/3741 [3:01:00<19:28:12, 21.55s/it] +2025-05-10 20:26:54 - ERROR - stderr - +2025-05-10 20:26:54 - ERROR - stderr - +2025-05-10 20:26:54 - INFO - stdout - {'loss': 0.9184, 'grad_norm': 0.652585506439209, 'learning_rate': 1.9477389450266768e-05, 'epoch': 0.39} +2025-05-10 20:26:54 - ERROR - stderr - 13%|█▎ | 488/3741 [3:01:00<19:28:12, 21.55s/it] +2025-05-10 20:27:17 - ERROR - stderr - 13%|█▎ | 489/3741 [3:01:24<19:55:10, 22.05s/it] +2025-05-10 20:27:17 - ERROR - stderr - +2025-05-10 20:27:17 - ERROR - stderr - +2025-05-10 20:27:17 - INFO - stdout - {'loss': 0.9951, 'grad_norm': 0.6592057943344116, 'learning_rate': 1.9474623176213988e-05, 'epoch': 0.39} +2025-05-10 20:27:17 - ERROR - stderr - 13%|█▎ | 489/3741 [3:01:24<19:55:10, 22.05s/it] +2025-05-10 20:27:37 - ERROR - stderr - 13%|█▎ | 490/3741 [3:01:44<19:21:13, 21.43s/it] +2025-05-10 20:27:37 - ERROR - stderr - +2025-05-10 20:27:37 - ERROR - stderr - +2025-05-10 20:27:37 - INFO - stdout - {'loss': 0.9337, 'grad_norm': 0.7231782674789429, 'learning_rate': 1.9471849797766075e-05, 'epoch': 0.39} +2025-05-10 20:27:37 - ERROR - stderr - 13%|█▎ | 490/3741 [3:01:44<19:21:13, 21.43s/it] +2025-05-10 20:27:59 - ERROR - stderr - 13%|█▎ | 491/3741 [3:02:05<19:24:47, 21.50s/it] +2025-05-10 20:27:59 - ERROR - stderr - +2025-05-10 20:27:59 - ERROR - stderr - +2025-05-10 20:27:59 - INFO - stdout - {'loss': 0.9529, 'grad_norm': 0.6437721848487854, 'learning_rate': 1.9469069317002614e-05, 'epoch': 0.39} +2025-05-10 20:27:59 - ERROR - stderr - 13%|█▎ | 491/3741 [3:02:05<19:24:47, 21.50s/it] +2025-05-10 20:28:18 - ERROR - stderr - 13%|█▎ | 492/3741 [3:02:25<18:51:57, 20.90s/it] +2025-05-10 20:28:18 - ERROR - stderr - +2025-05-10 20:28:18 - ERROR - stderr - +2025-05-10 20:28:18 - INFO - stdout - {'loss': 1.0073, 'grad_norm': 0.6871363520622253, 'learning_rate': 1.9466281736008495e-05, 'epoch': 0.39} +2025-05-10 20:28:18 - ERROR - stderr - 13%|█▎ | 492/3741 [3:02:25<18:51:57, 20.90s/it] +2025-05-10 20:28:42 - ERROR - stderr - 13%|█▎ | 493/3741 [3:02:48<19:28:09, 21.58s/it] +2025-05-10 20:28:42 - ERROR - stderr - +2025-05-10 20:28:42 - ERROR - stderr - +2025-05-10 20:28:42 - INFO - stdout - {'loss': 0.89, 'grad_norm': 0.6335092782974243, 'learning_rate': 1.9463487056873945e-05, 'epoch': 0.4} +2025-05-10 20:28:42 - ERROR - stderr - 13%|█▎ | 493/3741 [3:02:48<19:28:09, 21.58s/it] +2025-05-10 20:29:01 - ERROR - stderr - 13%|█▎ | 494/3741 [3:03:07<18:53:21, 20.94s/it] +2025-05-10 20:29:01 - ERROR - stderr - +2025-05-10 20:29:01 - ERROR - stderr - +2025-05-10 20:29:01 - INFO - stdout - {'loss': 0.9542, 'grad_norm': 0.6468705534934998, 'learning_rate': 1.946068528169451e-05, 'epoch': 0.4} +2025-05-10 20:29:01 - ERROR - stderr - 13%|█▎ | 494/3741 [3:03:07<18:53:21, 20.94s/it] +2025-05-10 20:29:25 - ERROR - stderr - 13%|█▎ | 495/3741 [3:03:31<19:38:48, 21.79s/it] +2025-05-10 20:29:25 - ERROR - stderr - +2025-05-10 20:29:25 - ERROR - stderr - +2025-05-10 20:29:25 - INFO - stdout - {'loss': 0.926, 'grad_norm': 0.6464216709136963, 'learning_rate': 1.9457876412571053e-05, 'epoch': 0.4} +2025-05-10 20:29:25 - ERROR - stderr - 13%|█▎ | 495/3741 [3:03:31<19:38:48, 21.79s/it] +2025-05-10 20:29:44 - ERROR - stderr - 13%|█▎ | 496/3741 [3:03:51<19:01:59, 21.12s/it] +2025-05-10 20:29:44 - ERROR - stderr - +2025-05-10 20:29:44 - ERROR - stderr - +2025-05-10 20:29:44 - INFO - stdout - {'loss': 0.9718, 'grad_norm': 0.6910549998283386, 'learning_rate': 1.9455060451609765e-05, 'epoch': 0.4} +2025-05-10 20:29:44 - ERROR - stderr - 13%|█▎ | 496/3741 [3:03:51<19:01:59, 21.12s/it] +2025-05-10 20:30:08 - ERROR - stderr - 13%|█▎ | 497/3741 [3:04:14<19:43:50, 21.90s/it] +2025-05-10 20:30:08 - ERROR - stderr - +2025-05-10 20:30:08 - ERROR - stderr - +2025-05-10 20:30:08 - INFO - stdout - {'loss': 0.9153, 'grad_norm': 0.6526033878326416, 'learning_rate': 1.9452237400922142e-05, 'epoch': 0.4} +2025-05-10 20:30:08 - ERROR - stderr - 13%|█▎ | 497/3741 [3:04:14<19:43:50, 21.90s/it] +2025-05-10 20:30:28 - ERROR - stderr - 13%|█▎ | 498/3741 [3:04:34<19:06:45, 21.22s/it] +2025-05-10 20:30:28 - ERROR - stderr - +2025-05-10 20:30:28 - ERROR - stderr - +2025-05-10 20:30:28 - INFO - stdout - {'loss': 0.9803, 'grad_norm': 0.6653629541397095, 'learning_rate': 1.9449407262625015e-05, 'epoch': 0.4} +2025-05-10 20:30:28 - ERROR - stderr - 13%|█▎ | 498/3741 [3:04:34<19:06:45, 21.22s/it] +2025-05-10 20:30:52 - ERROR - stderr - 13%|█▎ | 499/3741 [3:04:58<19:51:27, 22.05s/it] +2025-05-10 20:30:52 - ERROR - stderr - +2025-05-10 20:30:52 - ERROR - stderr - +2025-05-10 20:30:52 - INFO - stdout - {'loss': 0.9739, 'grad_norm': 0.6513515710830688, 'learning_rate': 1.9446570038840505e-05, 'epoch': 0.4} +2025-05-10 20:30:52 - ERROR - stderr - 13%|█▎ | 499/3741 [3:04:58<19:51:27, 22.05s/it] +2025-05-10 20:31:11 - ERROR - stderr - 13%|█▎ | 500/3741 [3:05:17<19:09:58, 21.29s/it] +2025-05-10 20:31:11 - ERROR - stderr - +2025-05-10 20:31:11 - ERROR - stderr - +2025-05-10 20:31:11 - INFO - stdout - {'loss': 1.0026, 'grad_norm': 0.7147772908210754, 'learning_rate': 1.944372573169607e-05, 'epoch': 0.4} +2025-05-10 20:31:11 - ERROR - stderr - 13%|█▎ | 500/3741 [3:05:18<19:09:58, 21.29s/it] +2025-05-10 20:31:36 - ERROR - stderr - 13%|█▎ | 501/3741 [3:05:42<20:07:47, 22.37s/it] +2025-05-10 20:31:36 - ERROR - stderr - +2025-05-10 20:31:36 - ERROR - stderr - +2025-05-10 20:31:36 - INFO - stdout - {'loss': 1.0261, 'grad_norm': 0.6582165360450745, 'learning_rate': 1.9440874343324464e-05, 'epoch': 0.4} +2025-05-10 20:31:36 - ERROR - stderr - 13%|█▎ | 501/3741 [3:05:42<20:07:47, 22.37s/it] +2025-05-10 20:31:56 - ERROR - stderr - 13%|█▎ | 502/3741 [3:06:02<19:25:54, 21.60s/it] +2025-05-10 20:31:56 - ERROR - stderr - +2025-05-10 20:31:56 - ERROR - stderr - +2025-05-10 20:31:56 - INFO - stdout - {'loss': 0.9979, 'grad_norm': 0.6714770197868347, 'learning_rate': 1.943801587586375e-05, 'epoch': 0.4} +2025-05-10 20:31:56 - ERROR - stderr - 13%|█▎ | 502/3741 [3:06:02<19:25:54, 21.60s/it] +2025-05-10 20:32:22 - ERROR - stderr - 13%|█▎ | 503/3741 [3:06:29<20:42:43, 23.03s/it] +2025-05-10 20:32:22 - ERROR - stderr - +2025-05-10 20:32:22 - ERROR - stderr - +2025-05-10 20:32:22 - INFO - stdout - {'loss': 1.0059, 'grad_norm': 0.6295056939125061, 'learning_rate': 1.9435150331457314e-05, 'epoch': 0.4} +2025-05-10 20:32:22 - ERROR - stderr - 13%|█▎ | 503/3741 [3:06:29<20:42:43, 23.03s/it] +2025-05-10 20:32:42 - ERROR - stderr - 13%|█▎ | 504/3741 [3:06:48<19:45:00, 21.97s/it] +2025-05-10 20:32:42 - ERROR - stderr - +2025-05-10 20:32:42 - ERROR - stderr - +2025-05-10 20:32:42 - INFO - stdout - {'loss': 0.9456, 'grad_norm': 0.6907420754432678, 'learning_rate': 1.943227771225383e-05, 'epoch': 0.4} +2025-05-10 20:32:42 - ERROR - stderr - 13%|█▎ | 504/3741 [3:06:48<19:45:00, 21.97s/it] +2025-05-10 20:33:06 - ERROR - stderr - 13%|█▎ | 505/3741 [3:07:12<20:19:23, 22.61s/it] +2025-05-10 20:33:06 - ERROR - stderr - +2025-05-10 20:33:06 - ERROR - stderr - +2025-05-10 20:33:06 - INFO - stdout - {'loss': 0.9187, 'grad_norm': 0.6090110540390015, 'learning_rate': 1.9429398020407292e-05, 'epoch': 0.4} +2025-05-10 20:33:06 - ERROR - stderr - 13%|█▎ | 505/3741 [3:07:12<20:19:23, 22.61s/it] +2025-05-10 20:33:25 - ERROR - stderr - 14%|█▎ | 506/3741 [3:07:32<19:31:02, 21.72s/it] +2025-05-10 20:33:25 - ERROR - stderr - +2025-05-10 20:33:25 - ERROR - stderr - +2025-05-10 20:33:25 - INFO - stdout - {'loss': 0.952, 'grad_norm': 0.6557995080947876, 'learning_rate': 1.9426511258076988e-05, 'epoch': 0.41} +2025-05-10 20:33:25 - ERROR - stderr - 14%|█▎ | 506/3741 [3:07:32<19:31:02, 21.72s/it] +2025-05-10 20:33:49 - ERROR - stderr - 14%|█▎ | 507/3741 [3:07:56<20:03:54, 22.34s/it] +2025-05-10 20:33:49 - ERROR - stderr - +2025-05-10 20:33:49 - ERROR - stderr - +2025-05-10 20:33:49 - INFO - stdout - {'loss': 0.9657, 'grad_norm': 0.6791728138923645, 'learning_rate': 1.942361742742751e-05, 'epoch': 0.41} +2025-05-10 20:33:49 - ERROR - stderr - 14%|█▎ | 507/3741 [3:07:56<20:03:54, 22.34s/it] +2025-05-10 20:34:09 - ERROR - stderr - 14%|█▎ | 508/3741 [3:08:15<19:23:15, 21.59s/it] +2025-05-10 20:34:09 - ERROR - stderr - +2025-05-10 20:34:09 - ERROR - stderr - +2025-05-10 20:34:09 - INFO - stdout - {'loss': 1.0223, 'grad_norm': 0.6913565993309021, 'learning_rate': 1.9420716530628752e-05, 'epoch': 0.41} +2025-05-10 20:34:09 - ERROR - stderr - 14%|█▎ | 508/3741 [3:08:15<19:23:15, 21.59s/it] +2025-05-10 20:34:33 - ERROR - stderr - 14%|█▎ | 509/3741 [3:08:39<19:58:59, 22.26s/it] +2025-05-10 20:34:33 - ERROR - stderr - +2025-05-10 20:34:33 - ERROR - stderr - +2025-05-10 20:34:33 - INFO - stdout - {'loss': 0.9489, 'grad_norm': 0.6940714716911316, 'learning_rate': 1.9417808569855907e-05, 'epoch': 0.41} +2025-05-10 20:34:33 - ERROR - stderr - 14%|█▎ | 509/3741 [3:08:39<19:58:59, 22.26s/it] +2025-05-10 20:34:52 - ERROR - stderr - 14%|█▎ | 510/3741 [3:08:59<19:14:35, 21.44s/it] +2025-05-10 20:34:52 - ERROR - stderr - +2025-05-10 20:34:52 - ERROR - stderr - +2025-05-10 20:34:52 - INFO - stdout - {'loss': 0.9388, 'grad_norm': 0.733680009841919, 'learning_rate': 1.9414893547289458e-05, 'epoch': 0.41} +2025-05-10 20:34:52 - ERROR - stderr - 14%|█▎ | 510/3741 [3:08:59<19:14:35, 21.44s/it] +2025-05-10 20:35:18 - ERROR - stderr - 14%|█▎ | 511/3741 [3:09:25<20:27:47, 22.81s/it] +2025-05-10 20:35:18 - ERROR - stderr - +2025-05-10 20:35:18 - ERROR - stderr - +2025-05-10 20:35:18 - INFO - stdout - {'loss': 0.9455, 'grad_norm': 0.6628260016441345, 'learning_rate': 1.9411971465115197e-05, 'epoch': 0.41} +2025-05-10 20:35:18 - ERROR - stderr - 14%|█▎ | 511/3741 [3:09:25<20:27:47, 22.81s/it] +2025-05-10 20:35:38 - ERROR - stderr - 14%|█▎ | 512/3741 [3:09:44<19:31:14, 21.76s/it] +2025-05-10 20:35:38 - ERROR - stderr - +2025-05-10 20:35:38 - ERROR - stderr - +2025-05-10 20:35:38 - INFO - stdout - {'loss': 0.9224, 'grad_norm': 0.6788282990455627, 'learning_rate': 1.940904232552419e-05, 'epoch': 0.41} +2025-05-10 20:35:38 - ERROR - stderr - 14%|█▎ | 512/3741 [3:09:44<19:31:14, 21.76s/it] +2025-05-10 20:36:03 - ERROR - stderr - 14%|█▎ | 513/3741 [3:10:09<20:19:55, 22.68s/it] +2025-05-10 20:36:03 - ERROR - stderr - +2025-05-10 20:36:03 - ERROR - stderr - +2025-05-10 20:36:03 - INFO - stdout - {'loss': 0.9927, 'grad_norm': 0.6449699997901917, 'learning_rate': 1.9406106130712813e-05, 'epoch': 0.41} +2025-05-10 20:36:03 - ERROR - stderr - 14%|█▎ | 513/3741 [3:10:09<20:19:55, 22.68s/it] +2025-05-10 20:36:22 - ERROR - stderr - 14%|█▎ | 514/3741 [3:10:29<19:32:53, 21.81s/it] +2025-05-10 20:36:22 - ERROR - stderr - +2025-05-10 20:36:22 - ERROR - stderr - +2025-05-10 20:36:22 - INFO - stdout - {'loss': 0.9647, 'grad_norm': 0.6500270962715149, 'learning_rate': 1.9403162882882722e-05, 'epoch': 0.41} +2025-05-10 20:36:22 - ERROR - stderr - 14%|█▎ | 514/3741 [3:10:29<19:32:53, 21.81s/it] +2025-05-10 20:36:47 - ERROR - stderr - 14%|█▍ | 515/3741 [3:10:53<20:17:33, 22.65s/it] +2025-05-10 20:36:47 - ERROR - stderr - +2025-05-10 20:36:47 - ERROR - stderr - +2025-05-10 20:36:47 - INFO - stdout - {'loss': 0.967, 'grad_norm': 0.6693797707557678, 'learning_rate': 1.9400212584240867e-05, 'epoch': 0.41} +2025-05-10 20:36:47 - ERROR - stderr - 14%|█▍ | 515/3741 [3:10:53<20:17:33, 22.65s/it] +2025-05-10 20:37:06 - ERROR - stderr - 14%|█▍ | 516/3741 [3:11:13<19:26:57, 21.71s/it] +2025-05-10 20:37:06 - ERROR - stderr - +2025-05-10 20:37:06 - ERROR - stderr - +2025-05-10 20:37:06 - INFO - stdout - {'loss': 0.9768, 'grad_norm': 0.714789628982544, 'learning_rate': 1.9397255236999478e-05, 'epoch': 0.41} +2025-05-10 20:37:06 - ERROR - stderr - 14%|█▍ | 516/3741 [3:11:13<19:26:57, 21.71s/it] +2025-05-10 20:37:29 - ERROR - stderr - 14%|█▍ | 517/3741 [3:11:36<19:44:02, 22.04s/it] +2025-05-10 20:37:29 - ERROR - stderr - +2025-05-10 20:37:29 - ERROR - stderr - +2025-05-10 20:37:29 - INFO - stdout - {'loss': 0.9542, 'grad_norm': 0.6399978399276733, 'learning_rate': 1.939429084337608e-05, 'epoch': 0.41} +2025-05-10 20:37:29 - ERROR - stderr - 14%|█▍ | 517/3741 [3:11:36<19:44:02, 22.04s/it] +2025-05-10 20:37:49 - ERROR - stderr - 14%|█▍ | 518/3741 [3:11:55<19:01:14, 21.25s/it] +2025-05-10 20:37:49 - ERROR - stderr - +2025-05-10 20:37:49 - ERROR - stderr - +2025-05-10 20:37:49 - INFO - stdout - {'loss': 0.9968, 'grad_norm': 0.644829273223877, 'learning_rate': 1.939131940559347e-05, 'epoch': 0.42} +2025-05-10 20:37:49 - ERROR - stderr - 14%|█▍ | 518/3741 [3:11:55<19:01:14, 21.25s/it] +2025-05-10 20:38:09 - ERROR - stderr - 14%|█▍ | 519/3741 [3:12:15<18:48:05, 21.01s/it] +2025-05-10 20:38:09 - ERROR - stderr - +2025-05-10 20:38:09 - ERROR - stderr - +2025-05-10 20:38:09 - INFO - stdout - {'loss': 0.9511, 'grad_norm': 0.7262901067733765, 'learning_rate': 1.938834092587974e-05, 'epoch': 0.42} +2025-05-10 20:38:09 - ERROR - stderr - 14%|█▍ | 519/3741 [3:12:15<18:48:05, 21.01s/it] +2025-05-10 20:38:29 - ERROR - stderr - 14%|█▍ | 520/3741 [3:12:35<18:23:05, 20.55s/it] +2025-05-10 20:38:29 - ERROR - stderr - +2025-05-10 20:38:29 - ERROR - stderr - +2025-05-10 20:38:29 - INFO - stdout - {'loss': 0.9986, 'grad_norm': 0.6648424863815308, 'learning_rate': 1.938535540646825e-05, 'epoch': 0.42} +2025-05-10 20:38:29 - ERROR - stderr - 14%|█▍ | 520/3741 [3:12:35<18:23:05, 20.55s/it] +2025-05-10 20:38:48 - ERROR - stderr - 14%|█▍ | 521/3741 [3:12:55<18:09:08, 20.29s/it] +2025-05-10 20:38:48 - ERROR - stderr - +2025-05-10 20:38:48 - ERROR - stderr - +2025-05-10 20:38:48 - INFO - stdout - {'loss': 0.9664, 'grad_norm': 0.7087076902389526, 'learning_rate': 1.938236284959765e-05, 'epoch': 0.42} +2025-05-10 20:38:48 - ERROR - stderr - 14%|█▍ | 521/3741 [3:12:55<18:09:08, 20.29s/it] +2025-05-10 20:39:08 - ERROR - stderr - 14%|█▍ | 522/3741 [3:13:14<17:56:45, 20.07s/it] +2025-05-10 20:39:08 - ERROR - stderr - +2025-05-10 20:39:08 - ERROR - stderr - +2025-05-10 20:39:08 - INFO - stdout - {'loss': 0.9482, 'grad_norm': 0.7221333384513855, 'learning_rate': 1.9379363257511855e-05, 'epoch': 0.42} +2025-05-10 20:39:08 - ERROR - stderr - 14%|█▍ | 522/3741 [3:13:14<17:56:45, 20.07s/it] +2025-05-10 20:39:28 - ERROR - stderr - 14%|█▍ | 523/3741 [3:13:34<17:50:25, 19.96s/it] +2025-05-10 20:39:28 - ERROR - stderr - +2025-05-10 20:39:28 - ERROR - stderr - +2025-05-10 20:39:28 - INFO - stdout - {'loss': 1.0003, 'grad_norm': 0.6906344294548035, 'learning_rate': 1.9376356632460063e-05, 'epoch': 0.42} +2025-05-10 20:39:28 - ERROR - stderr - 14%|█▍ | 523/3741 [3:13:34<17:50:25, 19.96s/it] +2025-05-10 20:39:47 - ERROR - stderr - 14%|█▍ | 524/3741 [3:13:53<17:43:37, 19.84s/it] +2025-05-10 20:39:47 - ERROR - stderr - +2025-05-10 20:39:47 - ERROR - stderr - +2025-05-10 20:39:47 - INFO - stdout - {'loss': 0.9728, 'grad_norm': 0.7014548778533936, 'learning_rate': 1.9373342976696742e-05, 'epoch': 0.42} +2025-05-10 20:39:47 - ERROR - stderr - 14%|█▍ | 524/3741 [3:13:53<17:43:37, 19.84s/it] +2025-05-10 20:40:07 - ERROR - stderr - 14%|█▍ | 525/3741 [3:14:13<17:38:57, 19.76s/it] +2025-05-10 20:40:07 - ERROR - stderr - +2025-05-10 20:40:07 - ERROR - stderr - +2025-05-10 20:40:07 - INFO - stdout - {'loss': 0.9788, 'grad_norm': 0.6935135722160339, 'learning_rate': 1.9370322292481642e-05, 'epoch': 0.42} +2025-05-10 20:40:07 - ERROR - stderr - 14%|█▍ | 525/3741 [3:14:13<17:38:57, 19.76s/it] +2025-05-10 20:40:29 - ERROR - stderr - 14%|█▍ | 526/3741 [3:14:35<18:14:19, 20.42s/it] +2025-05-10 20:40:29 - ERROR - stderr - +2025-05-10 20:40:29 - ERROR - stderr - +2025-05-10 20:40:29 - INFO - stdout - {'loss': 0.982, 'grad_norm': 0.6556846499443054, 'learning_rate': 1.9367294582079768e-05, 'epoch': 0.42} +2025-05-10 20:40:29 - ERROR - stderr - 14%|█▍ | 526/3741 [3:14:35<18:14:19, 20.42s/it] +2025-05-10 20:40:48 - ERROR - stderr - 14%|█▍ | 527/3741 [3:14:55<18:01:50, 20.20s/it] +2025-05-10 20:40:48 - ERROR - stderr - +2025-05-10 20:40:48 - ERROR - stderr - +2025-05-10 20:40:48 - INFO - stdout - {'loss': 0.9361, 'grad_norm': 0.6862344145774841, 'learning_rate': 1.93642598477614e-05, 'epoch': 0.42} +2025-05-10 20:40:48 - ERROR - stderr - 14%|█▍ | 527/3741 [3:14:55<18:01:50, 20.20s/it] +2025-05-10 20:41:13 - ERROR - stderr - 14%|█▍ | 528/3741 [3:15:19<19:08:28, 21.45s/it] +2025-05-10 20:41:13 - ERROR - stderr - +2025-05-10 20:41:13 - ERROR - stderr - +2025-05-10 20:41:13 - INFO - stdout - {'loss': 0.9717, 'grad_norm': 0.6807497143745422, 'learning_rate': 1.9361218091802088e-05, 'epoch': 0.42} +2025-05-10 20:41:13 - ERROR - stderr - 14%|█▍ | 528/3741 [3:15:19<19:08:28, 21.45s/it] +2025-05-10 20:41:32 - ERROR - stderr - 14%|█▍ | 529/3741 [3:15:39<18:39:59, 20.92s/it] +2025-05-10 20:41:32 - ERROR - stderr - +2025-05-10 20:41:32 - ERROR - stderr - +2025-05-10 20:41:32 - INFO - stdout - {'loss': 0.9416, 'grad_norm': 0.646615743637085, 'learning_rate': 1.935816931648264e-05, 'epoch': 0.42} +2025-05-10 20:41:32 - ERROR - stderr - 14%|█▍ | 529/3741 [3:15:39<18:39:59, 20.92s/it] +2025-05-10 20:41:56 - ERROR - stderr - 14%|█▍ | 530/3741 [3:16:03<19:29:06, 21.85s/it] +2025-05-10 20:41:56 - ERROR - stderr - +2025-05-10 20:41:56 - ERROR - stderr - +2025-05-10 20:41:56 - INFO - stdout - {'loss': 0.952, 'grad_norm': 0.646940290927887, 'learning_rate': 1.9355113524089137e-05, 'epoch': 0.43} +2025-05-10 20:41:56 - ERROR - stderr - 14%|█▍ | 530/3741 [3:16:03<19:29:06, 21.85s/it] +2025-05-10 20:42:16 - ERROR - stderr - 14%|█▍ | 531/3741 [3:16:22<18:51:02, 21.14s/it] +2025-05-10 20:42:16 - ERROR - stderr - +2025-05-10 20:42:16 - ERROR - stderr - +2025-05-10 20:42:16 - INFO - stdout - {'loss': 0.9744, 'grad_norm': 0.7170730233192444, 'learning_rate': 1.9352050716912915e-05, 'epoch': 0.43} +2025-05-10 20:42:16 - ERROR - stderr - 14%|█▍ | 531/3741 [3:16:22<18:51:02, 21.14s/it] +2025-05-10 20:42:41 - ERROR - stderr - 14%|█▍ | 532/3741 [3:16:48<20:02:03, 22.48s/it] +2025-05-10 20:42:41 - ERROR - stderr - +2025-05-10 20:42:41 - ERROR - stderr - +2025-05-10 20:42:41 - INFO - stdout - {'loss': 0.9409, 'grad_norm': 0.6803928017616272, 'learning_rate': 1.934898089725057e-05, 'epoch': 0.43} +2025-05-10 20:42:41 - ERROR - stderr - 14%|█▍ | 532/3741 [3:16:48<20:02:03, 22.48s/it] +2025-05-10 20:43:01 - ERROR - stderr - 14%|█▍ | 533/3741 [3:17:08<19:21:22, 21.72s/it] +2025-05-10 20:43:01 - ERROR - stderr - +2025-05-10 20:43:01 - ERROR - stderr - +2025-05-10 20:43:01 - INFO - stdout - {'loss': 0.9368, 'grad_norm': 0.6328278183937073, 'learning_rate': 1.9345904067403953e-05, 'epoch': 0.43} +2025-05-10 20:43:01 - ERROR - stderr - 14%|█▍ | 533/3741 [3:17:08<19:21:22, 21.72s/it] +2025-05-10 20:43:25 - ERROR - stderr - 14%|█▍ | 534/3741 [3:17:31<19:49:43, 22.26s/it] +2025-05-10 20:43:25 - ERROR - stderr - +2025-05-10 20:43:25 - ERROR - stderr - +2025-05-10 20:43:25 - INFO - stdout - {'loss': 0.9771, 'grad_norm': 0.6864063143730164, 'learning_rate': 1.9342820229680185e-05, 'epoch': 0.43} +2025-05-10 20:43:25 - ERROR - stderr - 14%|█▍ | 534/3741 [3:17:31<19:49:43, 22.26s/it] +2025-05-10 20:43:44 - ERROR - stderr - 14%|█▍ | 535/3741 [3:17:51<19:06:04, 21.45s/it] +2025-05-10 20:43:45 - ERROR - stderr - +2025-05-10 20:43:45 - ERROR - stderr - +2025-05-10 20:43:45 - INFO - stdout - {'loss': 0.9774, 'grad_norm': 0.6935616135597229, 'learning_rate': 1.9339729386391622e-05, 'epoch': 0.43} +2025-05-10 20:43:45 - ERROR - stderr - 14%|█▍ | 535/3741 [3:17:51<19:06:04, 21.45s/it] +2025-05-10 20:44:09 - ERROR - stderr - 14%|█▍ | 536/3741 [3:18:16<20:00:56, 22.48s/it] +2025-05-10 20:44:09 - ERROR - stderr - +2025-05-10 20:44:09 - ERROR - stderr - +2025-05-10 20:44:09 - INFO - stdout - {'loss': 0.9468, 'grad_norm': 0.6815831065177917, 'learning_rate': 1.9336631539855895e-05, 'epoch': 0.43} +2025-05-10 20:44:09 - ERROR - stderr - 14%|█▍ | 536/3741 [3:18:16<20:00:56, 22.48s/it] +2025-05-10 20:44:29 - ERROR - stderr - 14%|█▍ | 537/3741 [3:18:35<19:15:37, 21.64s/it] +2025-05-10 20:44:29 - ERROR - stderr - +2025-05-10 20:44:29 - ERROR - stderr - +2025-05-10 20:44:29 - INFO - stdout - {'loss': 0.9433, 'grad_norm': 0.6866287589073181, 'learning_rate': 1.9333526692395863e-05, 'epoch': 0.43} +2025-05-10 20:44:29 - ERROR - stderr - 14%|█▍ | 537/3741 [3:18:35<19:15:37, 21.64s/it] +2025-05-10 20:44:52 - ERROR - stderr - 14%|█▍ | 538/3741 [3:18:58<19:35:33, 22.02s/it] +2025-05-10 20:44:52 - ERROR - stderr - +2025-05-10 20:44:52 - ERROR - stderr - +2025-05-10 20:44:52 - INFO - stdout - {'loss': 0.9595, 'grad_norm': 0.7279961109161377, 'learning_rate': 1.9330414846339656e-05, 'epoch': 0.43} +2025-05-10 20:44:52 - ERROR - stderr - 14%|█▍ | 538/3741 [3:18:58<19:35:33, 22.02s/it] +2025-05-10 20:45:12 - ERROR - stderr - 14%|█▍ | 539/3741 [3:19:18<19:00:00, 21.36s/it] +2025-05-10 20:45:12 - ERROR - stderr - +2025-05-10 20:45:12 - ERROR - stderr - +2025-05-10 20:45:12 - INFO - stdout - {'loss': 0.9593, 'grad_norm': 0.659054160118103, 'learning_rate': 1.9327296004020638e-05, 'epoch': 0.43} +2025-05-10 20:45:12 - ERROR - stderr - 14%|█▍ | 539/3741 [3:19:18<19:00:00, 21.36s/it] +2025-05-10 20:45:36 - ERROR - stderr - 14%|█▍ | 540/3741 [3:19:42<19:43:46, 22.19s/it] +2025-05-10 20:45:36 - ERROR - stderr - +2025-05-10 20:45:36 - ERROR - stderr - +2025-05-10 20:45:36 - INFO - stdout - {'loss': 0.9569, 'grad_norm': 0.6249253749847412, 'learning_rate': 1.9324170167777425e-05, 'epoch': 0.43} +2025-05-10 20:45:36 - ERROR - stderr - 14%|█▍ | 540/3741 [3:19:42<19:43:46, 22.19s/it] +2025-05-10 20:45:55 - ERROR - stderr - 14%|█▍ | 541/3741 [3:20:02<19:00:16, 21.38s/it] +2025-05-10 20:45:55 - ERROR - stderr - +2025-05-10 20:45:55 - ERROR - stderr - +2025-05-10 20:45:55 - INFO - stdout - {'loss': 0.9529, 'grad_norm': 0.6949421167373657, 'learning_rate': 1.9321037339953873e-05, 'epoch': 0.43} +2025-05-10 20:45:55 - ERROR - stderr - 14%|█▍ | 541/3741 [3:20:02<19:00:16, 21.38s/it] +2025-05-10 20:46:20 - ERROR - stderr - 14%|█▍ | 542/3741 [3:20:26<19:50:24, 22.33s/it] +2025-05-10 20:46:20 - ERROR - stderr - +2025-05-10 20:46:20 - ERROR - stderr - +2025-05-10 20:46:20 - INFO - stdout - {'loss': 1.0171, 'grad_norm': 0.7360992431640625, 'learning_rate': 1.9317897522899082e-05, 'epoch': 0.43} +2025-05-10 20:46:20 - ERROR - stderr - 14%|█▍ | 542/3741 [3:20:26<19:50:24, 22.33s/it] +2025-05-10 20:46:40 - ERROR - stderr - 15%|█▍ | 543/3741 [3:20:46<19:07:49, 21.54s/it] +2025-05-10 20:46:40 - ERROR - stderr - +2025-05-10 20:46:40 - ERROR - stderr - +2025-05-10 20:46:40 - INFO - stdout - {'loss': 0.9597, 'grad_norm': 0.6973049640655518, 'learning_rate': 1.93147507189674e-05, 'epoch': 0.44} +2025-05-10 20:46:40 - ERROR - stderr - 15%|█▍ | 543/3741 [3:20:46<19:07:49, 21.54s/it] +2025-05-10 20:47:04 - ERROR - stderr - 15%|█▍ | 544/3741 [3:21:10<19:47:29, 22.29s/it] +2025-05-10 20:47:04 - ERROR - stderr - +2025-05-10 20:47:04 - ERROR - stderr - +2025-05-10 20:47:04 - INFO - stdout - {'loss': 0.9106, 'grad_norm': 0.6927620768547058, 'learning_rate': 1.93115969305184e-05, 'epoch': 0.44} +2025-05-10 20:47:04 - ERROR - stderr - 15%|█▍ | 544/3741 [3:21:10<19:47:29, 22.29s/it] +2025-05-10 20:47:24 - ERROR - stderr - 15%|█▍ | 545/3741 [3:21:30<19:08:02, 21.55s/it] +2025-05-10 20:47:24 - ERROR - stderr - +2025-05-10 20:47:24 - ERROR - stderr - +2025-05-10 20:47:24 - INFO - stdout - {'loss': 0.9958, 'grad_norm': 0.6799963712692261, 'learning_rate': 1.9308436159916905e-05, 'epoch': 0.44} +2025-05-10 20:47:24 - ERROR - stderr - 15%|█▍ | 545/3741 [3:21:30<19:08:02, 21.55s/it] +2025-05-10 20:47:43 - ERROR - stderr - 15%|█▍ | 546/3741 [3:21:50<18:39:52, 21.03s/it] +2025-05-10 20:47:43 - ERROR - stderr - +2025-05-10 20:47:43 - ERROR - stderr - +2025-05-10 20:47:43 - INFO - stdout - {'loss': 0.9605, 'grad_norm': 0.6450375914573669, 'learning_rate': 1.9305268409532968e-05, 'epoch': 0.44} +2025-05-10 20:47:43 - ERROR - stderr - 15%|█▍ | 546/3741 [3:21:50<18:39:52, 21.03s/it] +2025-05-10 20:48:03 - ERROR - stderr - 15%|█▍ | 547/3741 [3:22:09<18:16:46, 20.60s/it] +2025-05-10 20:48:03 - ERROR - stderr - +2025-05-10 20:48:03 - ERROR - stderr - +2025-05-10 20:48:03 - INFO - stdout - {'loss': 0.9424, 'grad_norm': 0.6617172360420227, 'learning_rate': 1.9302093681741874e-05, 'epoch': 0.44} +2025-05-10 20:48:03 - ERROR - stderr - 15%|█▍ | 547/3741 [3:22:09<18:16:46, 20.60s/it] +2025-05-10 20:48:22 - ERROR - stderr - 15%|█▍ | 548/3741 [3:22:29<17:59:31, 20.29s/it] +2025-05-10 20:48:23 - ERROR - stderr - +2025-05-10 20:48:23 - ERROR - stderr - +2025-05-10 20:48:23 - INFO - stdout - {'loss': 0.9857, 'grad_norm': 0.7010754346847534, 'learning_rate': 1.9298911978924142e-05, 'epoch': 0.44} +2025-05-10 20:48:23 - ERROR - stderr - 15%|█▍ | 548/3741 [3:22:29<17:59:31, 20.29s/it] +2025-05-10 20:48:44 - ERROR - stderr - 15%|█▍ | 549/3741 [3:22:50<18:14:59, 20.58s/it] +2025-05-10 20:48:44 - ERROR - stderr - +2025-05-10 20:48:44 - ERROR - stderr - +2025-05-10 20:48:44 - INFO - stdout - {'loss': 0.9495, 'grad_norm': 0.665642499923706, 'learning_rate': 1.9295723303465523e-05, 'epoch': 0.44} +2025-05-10 20:48:44 - ERROR - stderr - 15%|█▍ | 549/3741 [3:22:50<18:14:59, 20.58s/it] +2025-05-10 20:49:03 - ERROR - stderr - 15%|█▍ | 550/3741 [3:23:10<17:59:49, 20.30s/it] +2025-05-10 20:49:03 - ERROR - stderr - +2025-05-10 20:49:03 - ERROR - stderr - +2025-05-10 20:49:03 - INFO - stdout - {'loss': 0.9411, 'grad_norm': 0.6675366759300232, 'learning_rate': 1.9292527657756994e-05, 'epoch': 0.44} +2025-05-10 20:49:03 - ERROR - stderr - 15%|█▍ | 550/3741 [3:23:10<17:59:49, 20.30s/it] +2025-05-10 20:49:25 - ERROR - stderr - 15%|█▍ | 551/3741 [3:23:31<18:14:24, 20.58s/it] +2025-05-10 20:49:25 - ERROR - stderr - +2025-05-10 20:49:25 - ERROR - stderr - +2025-05-10 20:49:25 - INFO - stdout - {'loss': 0.9939, 'grad_norm': 0.6773011684417725, 'learning_rate': 1.928932504419476e-05, 'epoch': 0.44} +2025-05-10 20:49:25 - ERROR - stderr - 15%|█▍ | 551/3741 [3:23:31<18:14:24, 20.58s/it] +2025-05-10 20:49:44 - ERROR - stderr - 15%|█▍ | 552/3741 [3:23:51<18:00:06, 20.32s/it] +2025-05-10 20:49:44 - ERROR - stderr - +2025-05-10 20:49:44 - ERROR - stderr - +2025-05-10 20:49:44 - INFO - stdout - {'loss': 0.9641, 'grad_norm': 0.691259503364563, 'learning_rate': 1.9286115465180248e-05, 'epoch': 0.44} +2025-05-10 20:49:44 - ERROR - stderr - 15%|█▍ | 552/3741 [3:23:51<18:00:06, 20.32s/it] +2025-05-10 20:50:09 - ERROR - stderr - 15%|█▍ | 553/3741 [3:24:15<19:04:42, 21.54s/it] +2025-05-10 20:50:09 - ERROR - stderr - +2025-05-10 20:50:09 - ERROR - stderr - +2025-05-10 20:50:09 - INFO - stdout - {'loss': 0.9077, 'grad_norm': 0.6108399033546448, 'learning_rate': 1.928289892312011e-05, 'epoch': 0.44} +2025-05-10 20:50:09 - ERROR - stderr - 15%|█▍ | 553/3741 [3:24:15<19:04:42, 21.54s/it] +2025-05-10 20:50:28 - ERROR - stderr - 15%|█▍ | 554/3741 [3:24:35<18:31:07, 20.92s/it] +2025-05-10 20:50:28 - ERROR - stderr - +2025-05-10 20:50:28 - ERROR - stderr - +2025-05-10 20:50:28 - INFO - stdout - {'loss': 0.9379, 'grad_norm': 0.6582357287406921, 'learning_rate': 1.927967542042622e-05, 'epoch': 0.44} +2025-05-10 20:50:28 - ERROR - stderr - 15%|█▍ | 554/3741 [3:24:35<18:31:07, 20.92s/it] +2025-05-10 20:50:53 - ERROR - stderr - 15%|█▍ | 555/3741 [3:24:59<19:33:36, 22.10s/it] +2025-05-10 20:50:53 - ERROR - stderr - +2025-05-10 20:50:53 - ERROR - stderr - +2025-05-10 20:50:53 - INFO - stdout - {'loss': 0.9621, 'grad_norm': 0.7069655060768127, 'learning_rate': 1.9276444959515664e-05, 'epoch': 0.45} +2025-05-10 20:50:53 - ERROR - stderr - 15%|█▍ | 555/3741 [3:24:59<19:33:36, 22.10s/it] +2025-05-10 20:51:13 - ERROR - stderr - 15%|█▍ | 556/3741 [3:25:19<18:51:22, 21.31s/it] +2025-05-10 20:51:13 - ERROR - stderr - +2025-05-10 20:51:13 - ERROR - stderr - +2025-05-10 20:51:13 - INFO - stdout - {'loss': 0.9675, 'grad_norm': 0.6511080265045166, 'learning_rate': 1.9273207542810764e-05, 'epoch': 0.45} +2025-05-10 20:51:13 - ERROR - stderr - 15%|█▍ | 556/3741 [3:25:19<18:51:22, 21.31s/it] +2025-05-10 20:51:38 - ERROR - stderr - 15%|█▍ | 557/3741 [3:25:45<20:00:33, 22.62s/it] +2025-05-10 20:51:38 - ERROR - stderr - +2025-05-10 20:51:38 - ERROR - stderr - +2025-05-10 20:51:38 - INFO - stdout - {'loss': 0.9744, 'grad_norm': 0.6380482912063599, 'learning_rate': 1.9269963172739033e-05, 'epoch': 0.45} +2025-05-10 20:51:38 - ERROR - stderr - 15%|█▍ | 557/3741 [3:25:45<20:00:33, 22.62s/it] +2025-05-10 20:51:58 - ERROR - stderr - 15%|█▍ | 558/3741 [3:26:05<19:22:05, 21.91s/it] +2025-05-10 20:51:58 - ERROR - stderr - +2025-05-10 20:51:58 - ERROR - stderr - +2025-05-10 20:51:58 - INFO - stdout - {'loss': 0.9644, 'grad_norm': 0.6568742394447327, 'learning_rate': 1.9266711851733214e-05, 'epoch': 0.45} +2025-05-10 20:51:58 - ERROR - stderr - 15%|█▍ | 558/3741 [3:26:05<19:22:05, 21.91s/it] +2025-05-10 20:52:23 - ERROR - stderr - 15%|█▍ | 559/3741 [3:26:29<19:59:57, 22.63s/it] +2025-05-10 20:52:23 - ERROR - stderr - +2025-05-10 20:52:23 - ERROR - stderr - +2025-05-10 20:52:23 - INFO - stdout - {'loss': 0.9969, 'grad_norm': 0.6376577019691467, 'learning_rate': 1.9263453582231265e-05, 'epoch': 0.45} +2025-05-10 20:52:23 - ERROR - stderr - 15%|█▍ | 559/3741 [3:26:29<19:59:57, 22.63s/it] +2025-05-10 20:52:42 - ERROR - stderr - 15%|█▍ | 560/3741 [3:26:49<19:12:43, 21.74s/it] +2025-05-10 20:52:42 - ERROR - stderr - +2025-05-10 20:52:42 - ERROR - stderr - +2025-05-10 20:52:42 - INFO - stdout - {'loss': 0.9894, 'grad_norm': 0.6453221440315247, 'learning_rate': 1.9260188366676337e-05, 'epoch': 0.45} +2025-05-10 20:52:42 - ERROR - stderr - 15%|█▍ | 560/3741 [3:26:49<19:12:43, 21.74s/it] +2025-05-10 20:53:05 - ERROR - stderr - 15%|█▍ | 561/3741 [3:27:12<19:29:15, 22.06s/it] +2025-05-10 20:53:05 - ERROR - stderr - +2025-05-10 20:53:05 - ERROR - stderr - +2025-05-10 20:53:05 - INFO - stdout - {'loss': 0.9315, 'grad_norm': 0.6480368375778198, 'learning_rate': 1.9256916207516806e-05, 'epoch': 0.45} +2025-05-10 20:53:05 - ERROR - stderr - 15%|█▍ | 561/3741 [3:27:12<19:29:15, 22.06s/it] +2025-05-10 20:53:25 - ERROR - stderr - 15%|█▌ | 562/3741 [3:27:31<18:48:10, 21.29s/it] +2025-05-10 20:53:25 - ERROR - stderr - +2025-05-10 20:53:25 - ERROR - stderr - +2025-05-10 20:53:25 - INFO - stdout - {'loss': 0.9886, 'grad_norm': 0.6618868708610535, 'learning_rate': 1.9253637107206246e-05, 'epoch': 0.45} +2025-05-10 20:53:25 - ERROR - stderr - 15%|█▌ | 562/3741 [3:27:31<18:48:10, 21.29s/it] +2025-05-10 20:53:44 - ERROR - stderr - 15%|█▌ | 563/3741 [3:27:50<18:17:55, 20.73s/it] +2025-05-10 20:53:44 - ERROR - stderr - +2025-05-10 20:53:44 - ERROR - stderr - +2025-05-10 20:53:44 - INFO - stdout - {'loss': 0.9983, 'grad_norm': 0.646225094795227, 'learning_rate': 1.9250351068203442e-05, 'epoch': 0.45} +2025-05-10 20:53:44 - ERROR - stderr - 15%|█▌ | 563/3741 [3:27:50<18:17:55, 20.73s/it] +2025-05-10 20:54:04 - ERROR - stderr - 15%|█▌ | 564/3741 [3:28:10<17:59:46, 20.39s/it] +2025-05-10 20:54:04 - ERROR - stderr - +2025-05-10 20:54:04 - ERROR - stderr - +2025-05-10 20:54:04 - INFO - stdout - {'loss': 0.9496, 'grad_norm': 0.6107761859893799, 'learning_rate': 1.9247058092972372e-05, 'epoch': 0.45} +2025-05-10 20:54:04 - ERROR - stderr - 15%|█▌ | 564/3741 [3:28:10<17:59:46, 20.39s/it] +2025-05-10 20:54:23 - ERROR - stderr - 15%|█▌ | 565/3741 [3:28:29<17:42:52, 20.08s/it] +2025-05-10 20:54:23 - ERROR - stderr - +2025-05-10 20:54:23 - ERROR - stderr - +2025-05-10 20:54:23 - INFO - stdout - {'loss': 0.9751, 'grad_norm': 0.6536424160003662, 'learning_rate': 1.9243758183982226e-05, 'epoch': 0.45} +2025-05-10 20:54:23 - ERROR - stderr - 15%|█▌ | 565/3741 [3:28:29<17:42:52, 20.08s/it] +2025-05-10 20:54:45 - ERROR - stderr - 15%|█▌ | 566/3741 [3:28:51<18:06:19, 20.53s/it] +2025-05-10 20:54:45 - ERROR - stderr - +2025-05-10 20:54:45 - ERROR - stderr - +2025-05-10 20:54:45 - INFO - stdout - {'loss': 0.9534, 'grad_norm': 0.5984099507331848, 'learning_rate': 1.9240451343707382e-05, 'epoch': 0.45} +2025-05-10 20:54:45 - ERROR - stderr - 15%|█▌ | 566/3741 [3:28:51<18:06:19, 20.53s/it] +2025-05-10 20:55:04 - ERROR - stderr - 15%|█▌ | 567/3741 [3:29:11<17:52:38, 20.28s/it] +2025-05-10 20:55:04 - ERROR - stderr - +2025-05-10 20:55:04 - ERROR - stderr - +2025-05-10 20:55:04 - INFO - stdout - {'loss': 0.9064, 'grad_norm': 0.622818112373352, 'learning_rate': 1.9237137574627433e-05, 'epoch': 0.45} +2025-05-10 20:55:04 - ERROR - stderr - 15%|█▌ | 567/3741 [3:29:11<17:52:38, 20.28s/it] +2025-05-10 20:55:28 - ERROR - stderr - 15%|█▌ | 568/3741 [3:29:34<18:46:10, 21.30s/it] +2025-05-10 20:55:28 - ERROR - stderr - +2025-05-10 20:55:28 - ERROR - stderr - +2025-05-10 20:55:28 - INFO - stdout - {'loss': 0.9416, 'grad_norm': 0.6724113821983337, 'learning_rate': 1.923381687922714e-05, 'epoch': 0.46} +2025-05-10 20:55:28 - ERROR - stderr - 15%|█▌ | 568/3741 [3:29:34<18:46:10, 21.30s/it] +2025-05-10 20:55:48 - ERROR - stderr - 15%|█▌ | 569/3741 [3:29:54<18:19:24, 20.80s/it] +2025-05-10 20:55:48 - ERROR - stderr - +2025-05-10 20:55:48 - ERROR - stderr - +2025-05-10 20:55:48 - INFO - stdout - {'loss': 0.9413, 'grad_norm': 0.6443886160850525, 'learning_rate': 1.9230489259996487e-05, 'epoch': 0.46} +2025-05-10 20:55:48 - ERROR - stderr - 15%|█▌ | 569/3741 [3:29:54<18:19:24, 20.80s/it] +2025-05-10 20:56:12 - ERROR - stderr - 15%|█▌ | 570/3741 [3:30:19<19:19:53, 21.95s/it] +2025-05-10 20:56:12 - ERROR - stderr - +2025-05-10 20:56:12 - ERROR - stderr - +2025-05-10 20:56:12 - INFO - stdout - {'loss': 0.9813, 'grad_norm': 0.6603150963783264, 'learning_rate': 1.922715471943063e-05, 'epoch': 0.46} +2025-05-10 20:56:12 - ERROR - stderr - 15%|█▌ | 570/3741 [3:30:19<19:19:53, 21.95s/it] +2025-05-10 20:56:32 - ERROR - stderr - 15%|█▌ | 571/3741 [3:30:39<18:47:00, 21.33s/it] +2025-05-10 20:56:32 - ERROR - stderr - +2025-05-10 20:56:32 - ERROR - stderr - +2025-05-10 20:56:32 - INFO - stdout - {'loss': 0.9405, 'grad_norm': 0.642634928226471, 'learning_rate': 1.9223813260029922e-05, 'epoch': 0.46} +2025-05-10 20:56:32 - ERROR - stderr - 15%|█▌ | 571/3741 [3:30:39<18:47:00, 21.33s/it] +2025-05-10 20:56:56 - ERROR - stderr - 15%|█▌ | 572/3741 [3:31:02<19:24:41, 22.05s/it] +2025-05-10 20:56:56 - ERROR - stderr - +2025-05-10 20:56:56 - ERROR - stderr - +2025-05-10 20:56:56 - INFO - stdout - {'loss': 0.9891, 'grad_norm': 0.668830931186676, 'learning_rate': 1.92204648842999e-05, 'epoch': 0.46} +2025-05-10 20:56:56 - ERROR - stderr - 15%|█▌ | 572/3741 [3:31:02<19:24:41, 22.05s/it] +2025-05-10 20:57:16 - ERROR - stderr - 15%|█▌ | 573/3741 [3:31:22<18:48:38, 21.38s/it] +2025-05-10 20:57:16 - ERROR - stderr - +2025-05-10 20:57:16 - ERROR - stderr - +2025-05-10 20:57:16 - INFO - stdout - {'loss': 0.971, 'grad_norm': 0.617743968963623, 'learning_rate': 1.9217109594751303e-05, 'epoch': 0.46} +2025-05-10 20:57:16 - ERROR - stderr - 15%|█▌ | 573/3741 [3:31:22<18:48:38, 21.38s/it] +2025-05-10 20:57:40 - ERROR - stderr - 15%|█▌ | 574/3741 [3:31:46<19:34:56, 22.26s/it] +2025-05-10 20:57:40 - ERROR - stderr - +2025-05-10 20:57:40 - ERROR - stderr - +2025-05-10 20:57:40 - INFO - stdout - {'loss': 0.9542, 'grad_norm': 0.6333216428756714, 'learning_rate': 1.9213747393900025e-05, 'epoch': 0.46} +2025-05-10 20:57:40 - ERROR - stderr - 15%|█▌ | 574/3741 [3:31:46<19:34:56, 22.26s/it] +2025-05-10 20:58:00 - ERROR - stderr - 15%|█▌ | 575/3741 [3:32:06<18:52:25, 21.46s/it] +2025-05-10 20:58:00 - ERROR - stderr - +2025-05-10 20:58:00 - ERROR - stderr - +2025-05-10 20:58:00 - INFO - stdout - {'loss': 0.9329, 'grad_norm': 0.6373317241668701, 'learning_rate': 1.9210378284267166e-05, 'epoch': 0.46} +2025-05-10 20:58:00 - ERROR - stderr - 15%|█▌ | 575/3741 [3:32:06<18:52:25, 21.46s/it] +2025-05-10 20:58:25 - ERROR - stderr - 15%|█▌ | 576/3741 [3:32:31<19:49:42, 22.55s/it] +2025-05-10 20:58:25 - ERROR - stderr - +2025-05-10 20:58:25 - ERROR - stderr - +2025-05-10 20:58:25 - INFO - stdout - {'loss': 0.9708, 'grad_norm': 0.617574155330658, 'learning_rate': 1.9207002268378998e-05, 'epoch': 0.46} +2025-05-10 20:58:25 - ERROR - stderr - 15%|█▌ | 576/3741 [3:32:31<19:49:42, 22.55s/it] +2025-05-10 20:58:44 - ERROR - stderr - 15%|█▌ | 577/3741 [3:32:51<19:03:23, 21.68s/it] +2025-05-10 20:58:44 - ERROR - stderr - +2025-05-10 20:58:44 - ERROR - stderr - +2025-05-10 20:58:44 - INFO - stdout - {'loss': 0.9154, 'grad_norm': 0.6191926002502441, 'learning_rate': 1.9203619348766974e-05, 'epoch': 0.46} +2025-05-10 20:58:44 - ERROR - stderr - 15%|█▌ | 577/3741 [3:32:51<19:03:23, 21.68s/it] +2025-05-10 20:59:09 - ERROR - stderr - 15%|█▌ | 578/3741 [3:33:15<19:44:53, 22.48s/it] +2025-05-10 20:59:09 - ERROR - stderr - +2025-05-10 20:59:09 - ERROR - stderr - +2025-05-10 20:59:09 - INFO - stdout - {'loss': 0.9354, 'grad_norm': 0.6222400069236755, 'learning_rate': 1.9200229527967724e-05, 'epoch': 0.46} +2025-05-10 20:59:09 - ERROR - stderr - 15%|█▌ | 578/3741 [3:33:15<19:44:53, 22.48s/it] +2025-05-10 20:59:28 - ERROR - stderr - 15%|█▌ | 579/3741 [3:33:35<18:57:40, 21.59s/it] +2025-05-10 20:59:28 - ERROR - stderr - +2025-05-10 20:59:28 - ERROR - stderr - +2025-05-10 20:59:28 - INFO - stdout - {'loss': 0.9424, 'grad_norm': 0.6831260919570923, 'learning_rate': 1.9196832808523048e-05, 'epoch': 0.46} +2025-05-10 20:59:28 - ERROR - stderr - 15%|█▌ | 579/3741 [3:33:35<18:57:40, 21.59s/it] +2025-05-10 20:59:48 - ERROR - stderr - 16%|█▌ | 580/3741 [3:33:55<18:31:11, 21.09s/it] +2025-05-10 20:59:48 - ERROR - stderr - +2025-05-10 20:59:48 - ERROR - stderr - +2025-05-10 20:59:48 - INFO - stdout - {'loss': 0.9589, 'grad_norm': 0.6363519430160522, 'learning_rate': 1.919342919297992e-05, 'epoch': 0.47} +2025-05-10 20:59:48 - ERROR - stderr - 16%|█▌ | 580/3741 [3:33:55<18:31:11, 21.09s/it] +2025-05-10 21:00:08 - ERROR - stderr - 16%|█▌ | 581/3741 [3:34:14<18:05:37, 20.61s/it] +2025-05-10 21:00:08 - ERROR - stderr - +2025-05-10 21:00:08 - ERROR - stderr - +2025-05-10 21:00:08 - INFO - stdout - {'loss': 0.9204, 'grad_norm': 0.6219954490661621, 'learning_rate': 1.9190018683890492e-05, 'epoch': 0.47} +2025-05-10 21:00:08 - ERROR - stderr - 16%|█▌ | 581/3741 [3:34:14<18:05:37, 20.61s/it] +2025-05-10 21:00:27 - ERROR - stderr - 16%|█▌ | 582/3741 [3:34:34<17:48:52, 20.30s/it] +2025-05-10 21:00:27 - ERROR - stderr - +2025-05-10 21:00:27 - ERROR - stderr - +2025-05-10 21:00:27 - INFO - stdout - {'loss': 0.9249, 'grad_norm': 0.6711027026176453, 'learning_rate': 1.9186601283812077e-05, 'epoch': 0.47} +2025-05-10 21:00:27 - ERROR - stderr - 16%|█▌ | 582/3741 [3:34:34<17:48:52, 20.30s/it] +2025-05-10 21:00:49 - ERROR - stderr - 16%|█▌ | 583/3741 [3:34:55<18:08:54, 20.69s/it] +2025-05-10 21:00:49 - ERROR - stderr - +2025-05-10 21:00:49 - ERROR - stderr - +2025-05-10 21:00:49 - INFO - stdout - {'loss': 0.9821, 'grad_norm': 0.656484067440033, 'learning_rate': 1.9183176995307156e-05, 'epoch': 0.47} +2025-05-10 21:00:49 - ERROR - stderr - 16%|█▌ | 583/3741 [3:34:55<18:08:54, 20.69s/it] +2025-05-10 21:01:08 - ERROR - stderr - 16%|█▌ | 584/3741 [3:35:15<17:51:08, 20.36s/it] +2025-05-10 21:01:08 - ERROR - stderr - +2025-05-10 21:01:08 - ERROR - stderr - +2025-05-10 21:01:08 - INFO - stdout - {'loss': 0.9759, 'grad_norm': 0.6418080925941467, 'learning_rate': 1.9179745820943382e-05, 'epoch': 0.47} +2025-05-10 21:01:08 - ERROR - stderr - 16%|█▌ | 584/3741 [3:35:15<17:51:08, 20.36s/it] +2025-05-10 21:01:31 - ERROR - stderr - 16%|█▌ | 585/3741 [3:35:38<18:30:43, 21.12s/it] +2025-05-10 21:01:31 - ERROR - stderr - +2025-05-10 21:01:31 - ERROR - stderr - +2025-05-10 21:01:31 - INFO - stdout - {'loss': 0.9328, 'grad_norm': 0.7414655089378357, 'learning_rate': 1.9176307763293563e-05, 'epoch': 0.47} +2025-05-10 21:01:31 - ERROR - stderr - 16%|█▌ | 585/3741 [3:35:38<18:30:43, 21.12s/it] +2025-05-10 21:01:51 - ERROR - stderr - 16%|█▌ | 586/3741 [3:35:57<18:02:30, 20.59s/it] +2025-05-10 21:01:51 - ERROR - stderr - +2025-05-10 21:01:51 - ERROR - stderr - +2025-05-10 21:01:51 - INFO - stdout - {'loss': 0.918, 'grad_norm': 0.634429931640625, 'learning_rate': 1.9172862824935677e-05, 'epoch': 0.47} +2025-05-10 21:01:51 - ERROR - stderr - 16%|█▌ | 586/3741 [3:35:57<18:02:30, 20.59s/it] +2025-05-10 21:02:14 - ERROR - stderr - 16%|█▌ | 587/3741 [3:36:21<18:48:12, 21.46s/it] +2025-05-10 21:02:14 - ERROR - stderr - +2025-05-10 21:02:14 - ERROR - stderr - +2025-05-10 21:02:14 - INFO - stdout - {'loss': 0.9247, 'grad_norm': 0.6168124675750732, 'learning_rate': 1.9169411008452847e-05, 'epoch': 0.47} +2025-05-10 21:02:14 - ERROR - stderr - 16%|█▌ | 587/3741 [3:36:21<18:48:12, 21.46s/it] +2025-05-10 21:02:34 - ERROR - stderr - 16%|█▌ | 588/3741 [3:36:40<18:14:54, 20.84s/it] +2025-05-10 21:02:34 - ERROR - stderr - +2025-05-10 21:02:34 - ERROR - stderr - +2025-05-10 21:02:34 - INFO - stdout - {'loss': 0.9379, 'grad_norm': 0.6918452978134155, 'learning_rate': 1.9165952316433367e-05, 'epoch': 0.47} +2025-05-10 21:02:34 - ERROR - stderr - 16%|█▌ | 588/3741 [3:36:40<18:14:54, 20.84s/it] +2025-05-10 21:02:57 - ERROR - stderr - 16%|█▌ | 589/3741 [3:37:04<18:59:20, 21.69s/it] +2025-05-10 21:02:57 - ERROR - stderr - +2025-05-10 21:02:57 - ERROR - stderr - +2025-05-10 21:02:57 - INFO - stdout - {'loss': 0.9685, 'grad_norm': 0.6637231111526489, 'learning_rate': 1.9162486751470687e-05, 'epoch': 0.47} +2025-05-10 21:02:57 - ERROR - stderr - 16%|█▌ | 589/3741 [3:37:04<18:59:20, 21.69s/it] +2025-05-10 21:03:17 - ERROR - stderr - 16%|█▌ | 590/3741 [3:37:23<18:27:03, 21.08s/it] +2025-05-10 21:03:17 - ERROR - stderr - +2025-05-10 21:03:17 - ERROR - stderr - +2025-05-10 21:03:17 - INFO - stdout - {'loss': 0.9876, 'grad_norm': 0.6197507381439209, 'learning_rate': 1.9159014316163395e-05, 'epoch': 0.47} +2025-05-10 21:03:17 - ERROR - stderr - 16%|█▌ | 590/3741 [3:37:23<18:27:03, 21.08s/it] +2025-05-10 21:03:40 - ERROR - stderr - 16%|█▌ | 591/3741 [3:37:47<19:05:44, 21.82s/it] +2025-05-10 21:03:40 - ERROR - stderr - +2025-05-10 21:03:40 - ERROR - stderr - +2025-05-10 21:03:40 - INFO - stdout - {'loss': 0.9288, 'grad_norm': 0.6182752251625061, 'learning_rate': 1.915553501311525e-05, 'epoch': 0.47} +2025-05-10 21:03:40 - ERROR - stderr - 16%|█▌ | 591/3741 [3:37:47<19:05:44, 21.82s/it] +2025-05-10 21:04:00 - ERROR - stderr - 16%|█▌ | 592/3741 [3:38:06<18:28:31, 21.12s/it] +2025-05-10 21:04:00 - ERROR - stderr - +2025-05-10 21:04:00 - ERROR - stderr - +2025-05-10 21:04:00 - INFO - stdout - {'loss': 0.9284, 'grad_norm': 0.6498412489891052, 'learning_rate': 1.9152048844935152e-05, 'epoch': 0.47} +2025-05-10 21:04:00 - ERROR - stderr - 16%|█▌ | 592/3741 [3:38:06<18:28:31, 21.12s/it] +2025-05-10 21:04:24 - ERROR - stderr - 16%|█▌ | 593/3741 [3:38:30<19:11:47, 21.95s/it] +2025-05-10 21:04:24 - ERROR - stderr - +2025-05-10 21:04:24 - ERROR - stderr - +2025-05-10 21:04:24 - INFO - stdout - {'loss': 0.9159, 'grad_norm': 0.6597406268119812, 'learning_rate': 1.914855581423714e-05, 'epoch': 0.48} +2025-05-10 21:04:24 - ERROR - stderr - 16%|█▌ | 593/3741 [3:38:30<19:11:47, 21.95s/it] +2025-05-10 21:04:44 - ERROR - stderr - 16%|█▌ | 594/3741 [3:38:50<18:36:32, 21.29s/it] +2025-05-10 21:04:44 - ERROR - stderr - +2025-05-10 21:04:44 - ERROR - stderr - +2025-05-10 21:04:44 - INFO - stdout - {'loss': 0.9473, 'grad_norm': 0.6668150424957275, 'learning_rate': 1.9145055923640417e-05, 'epoch': 0.48} +2025-05-10 21:04:44 - ERROR - stderr - 16%|█▌ | 594/3741 [3:38:50<18:36:32, 21.29s/it] +2025-05-10 21:05:09 - ERROR - stderr - 16%|█▌ | 595/3741 [3:39:15<19:40:24, 22.51s/it] +2025-05-10 21:05:09 - ERROR - stderr - +2025-05-10 21:05:09 - ERROR - stderr - +2025-05-10 21:05:09 - INFO - stdout - {'loss': 0.9343, 'grad_norm': 0.7026738524436951, 'learning_rate': 1.9141549175769315e-05, 'epoch': 0.48} +2025-05-10 21:05:09 - ERROR - stderr - 16%|█▌ | 595/3741 [3:39:15<19:40:24, 22.51s/it] +2025-05-10 21:05:29 - ERROR - stderr - 16%|█▌ | 596/3741 [3:39:35<18:57:04, 21.69s/it] +2025-05-10 21:05:29 - ERROR - stderr - +2025-05-10 21:05:29 - ERROR - stderr - +2025-05-10 21:05:29 - INFO - stdout - {'loss': 0.9569, 'grad_norm': 0.7704558372497559, 'learning_rate': 1.9138035573253316e-05, 'epoch': 0.48} +2025-05-10 21:05:29 - ERROR - stderr - 16%|█▌ | 596/3741 [3:39:35<18:57:04, 21.69s/it] +2025-05-10 21:05:53 - ERROR - stderr - 16%|█▌ | 597/3741 [3:39:59<19:40:04, 22.52s/it] +2025-05-10 21:05:53 - ERROR - stderr - +2025-05-10 21:05:53 - ERROR - stderr - +2025-05-10 21:05:53 - INFO - stdout - {'loss': 0.9666, 'grad_norm': 0.6594985723495483, 'learning_rate': 1.9134515118727035e-05, 'epoch': 0.48} +2025-05-10 21:05:53 - ERROR - stderr - 16%|█▌ | 597/3741 [3:40:00<19:40:04, 22.52s/it] +2025-05-10 21:06:13 - ERROR - stderr - 16%|█▌ | 598/3741 [3:40:19<18:54:49, 21.66s/it] +2025-05-10 21:06:13 - ERROR - stderr - +2025-05-10 21:06:13 - ERROR - stderr - +2025-05-10 21:06:13 - INFO - stdout - {'loss': 0.9473, 'grad_norm': 0.6233870387077332, 'learning_rate': 1.913098781483023e-05, 'epoch': 0.48} +2025-05-10 21:06:13 - ERROR - stderr - 16%|█▌ | 598/3741 [3:40:19<18:54:49, 21.66s/it] +2025-05-10 21:06:35 - ERROR - stderr - 16%|█▌ | 599/3741 [3:40:41<19:01:24, 21.80s/it] +2025-05-10 21:06:35 - ERROR - stderr - +2025-05-10 21:06:35 - ERROR - stderr - +2025-05-10 21:06:35 - INFO - stdout - {'loss': 0.8946, 'grad_norm': 0.6997066736221313, 'learning_rate': 1.9127453664207798e-05, 'epoch': 0.48} +2025-05-10 21:06:35 - ERROR - stderr - 16%|█▌ | 599/3741 [3:40:41<19:01:24, 21.80s/it] +2025-05-10 21:06:54 - ERROR - stderr - 16%|█▌ | 600/3741 [3:41:01<18:25:25, 21.12s/it] +2025-05-10 21:06:54 - ERROR - stderr - +2025-05-10 21:06:54 - ERROR - stderr - +2025-05-10 21:06:54 - INFO - stdout - {'loss': 0.9911, 'grad_norm': 0.6761658191680908, 'learning_rate': 1.912391266950976e-05, 'epoch': 0.48} +2025-05-10 21:06:54 - ERROR - stderr - 16%|█▌ | 600/3741 [3:41:01<18:25:25, 21.12s/it] +2025-05-10 21:07:14 - ERROR - stderr - 16%|█▌ | 601/3741 [3:41:21<18:03:23, 20.70s/it] +2025-05-10 21:07:14 - ERROR - stderr - +2025-05-10 21:07:14 - ERROR - stderr - +2025-05-10 21:07:14 - INFO - stdout - {'loss': 0.9955, 'grad_norm': 0.6300480365753174, 'learning_rate': 1.9120364833391277e-05, 'epoch': 0.48} +2025-05-10 21:07:14 - ERROR - stderr - 16%|█▌ | 601/3741 [3:41:21<18:03:23, 20.70s/it] +2025-05-10 21:07:34 - ERROR - stderr - 16%|█▌ | 602/3741 [3:41:40<17:45:27, 20.37s/it] +2025-05-10 21:07:34 - ERROR - stderr - +2025-05-10 21:07:34 - ERROR - stderr - +2025-05-10 21:07:34 - INFO - stdout - {'loss': 0.9853, 'grad_norm': 0.6605967283248901, 'learning_rate': 1.9116810158512635e-05, 'epoch': 0.48} +2025-05-10 21:07:34 - ERROR - stderr - 16%|█▌ | 602/3741 [3:41:40<17:45:27, 20.37s/it] +2025-05-10 21:07:53 - ERROR - stderr - 16%|█▌ | 603/3741 [3:42:00<17:32:21, 20.12s/it] +2025-05-10 21:07:53 - ERROR - stderr - +2025-05-10 21:07:53 - ERROR - stderr - +2025-05-10 21:07:53 - INFO - stdout - {'loss': 0.9011, 'grad_norm': 0.6040114164352417, 'learning_rate': 1.9113248647539253e-05, 'epoch': 0.48} +2025-05-10 21:07:53 - ERROR - stderr - 16%|█▌ | 603/3741 [3:42:00<17:32:21, 20.12s/it] +2025-05-10 21:08:16 - ERROR - stderr - 16%|█▌ | 604/3741 [3:42:22<18:12:00, 20.89s/it] +2025-05-10 21:08:16 - ERROR - stderr - +2025-05-10 21:08:16 - ERROR - stderr - +2025-05-10 21:08:16 - INFO - stdout - {'loss': 0.9038, 'grad_norm': 0.6693778038024902, 'learning_rate': 1.9109680303141673e-05, 'epoch': 0.48} +2025-05-10 21:08:16 - ERROR - stderr - 16%|█▌ | 604/3741 [3:42:22<18:12:00, 20.89s/it] +2025-05-10 21:08:35 - ERROR - stderr - 16%|█▌ | 605/3741 [3:42:42<17:49:41, 20.47s/it] +2025-05-10 21:08:36 - ERROR - stderr - +2025-05-10 21:08:36 - ERROR - stderr - +2025-05-10 21:08:36 - INFO - stdout - {'loss': 0.9332, 'grad_norm': 0.6784869432449341, 'learning_rate': 1.910610512799556e-05, 'epoch': 0.49} +2025-05-10 21:08:36 - ERROR - stderr - 16%|█▌ | 605/3741 [3:42:42<17:49:41, 20.47s/it] +2025-05-10 21:08:57 - ERROR - stderr - 16%|█▌ | 606/3741 [3:43:03<18:00:15, 20.67s/it] +2025-05-10 21:08:57 - ERROR - stderr - +2025-05-10 21:08:57 - ERROR - stderr - +2025-05-10 21:08:57 - INFO - stdout - {'loss': 0.985, 'grad_norm': 0.6835043430328369, 'learning_rate': 1.91025231247817e-05, 'epoch': 0.49} +2025-05-10 21:08:57 - ERROR - stderr - 16%|█▌ | 606/3741 [3:43:03<18:00:15, 20.67s/it] +2025-05-10 21:09:16 - ERROR - stderr - 16%|█▌ | 607/3741 [3:43:23<17:42:33, 20.34s/it] +2025-05-10 21:09:16 - ERROR - stderr - +2025-05-10 21:09:16 - ERROR - stderr - +2025-05-10 21:09:16 - INFO - stdout - {'loss': 1.0014, 'grad_norm': 0.6370753645896912, 'learning_rate': 1.9098934296186006e-05, 'epoch': 0.49} +2025-05-10 21:09:16 - ERROR - stderr - 16%|█▌ | 607/3741 [3:43:23<17:42:33, 20.34s/it] +2025-05-10 21:09:40 - ERROR - stderr - 16%|█▋ | 608/3741 [3:43:46<18:29:16, 21.24s/it] +2025-05-10 21:09:40 - ERROR - stderr - +2025-05-10 21:09:40 - ERROR - stderr - +2025-05-10 21:09:40 - INFO - stdout - {'loss': 0.948, 'grad_norm': 0.7216833233833313, 'learning_rate': 1.9095338644899502e-05, 'epoch': 0.49} +2025-05-10 21:09:40 - ERROR - stderr - 16%|█▋ | 608/3741 [3:43:46<18:29:16, 21.24s/it] +2025-05-10 21:09:59 - ERROR - stderr - 16%|█▋ | 609/3741 [3:44:05<18:01:55, 20.73s/it] +2025-05-10 21:09:59 - ERROR - stderr - +2025-05-10 21:09:59 - ERROR - stderr - +2025-05-10 21:09:59 - INFO - stdout - {'loss': 0.9399, 'grad_norm': 0.6614647507667542, 'learning_rate': 1.9091736173618326e-05, 'epoch': 0.49} +2025-05-10 21:09:59 - ERROR - stderr - 16%|█▋ | 609/3741 [3:44:05<18:01:55, 20.73s/it] +2025-05-10 21:10:23 - ERROR - stderr - 16%|█▋ | 610/3741 [3:44:29<18:51:45, 21.69s/it] +2025-05-10 21:10:23 - ERROR - stderr - +2025-05-10 21:10:23 - ERROR - stderr - +2025-05-10 21:10:23 - INFO - stdout - {'loss': 0.9501, 'grad_norm': 0.6034402251243591, 'learning_rate': 1.908812688504374e-05, 'epoch': 0.49} +2025-05-10 21:10:23 - ERROR - stderr - 16%|█▋ | 610/3741 [3:44:29<18:51:45, 21.69s/it] +2025-05-10 21:10:43 - ERROR - stderr - 16%|█▋ | 611/3741 [3:44:49<18:18:22, 21.05s/it] +2025-05-10 21:10:43 - ERROR - stderr - +2025-05-10 21:10:43 - ERROR - stderr - +2025-05-10 21:10:43 - INFO - stdout - {'loss': 0.9393, 'grad_norm': 0.628848135471344, 'learning_rate': 1.9084510781882108e-05, 'epoch': 0.49} +2025-05-10 21:10:43 - ERROR - stderr - 16%|█▋ | 611/3741 [3:44:49<18:18:22, 21.05s/it] +2025-05-10 21:11:07 - ERROR - stderr - 16%|█▋ | 612/3741 [3:45:13<19:03:55, 21.94s/it] +2025-05-10 21:11:07 - ERROR - stderr - +2025-05-10 21:11:07 - ERROR - stderr - +2025-05-10 21:11:07 - INFO - stdout - {'loss': 0.9689, 'grad_norm': 0.5977146625518799, 'learning_rate': 1.9080887866844902e-05, 'epoch': 0.49} +2025-05-10 21:11:07 - ERROR - stderr - 16%|█▋ | 612/3741 [3:45:13<19:03:55, 21.94s/it] +2025-05-10 21:11:26 - ERROR - stderr - 16%|█▋ | 613/3741 [3:45:33<18:27:11, 21.24s/it] +2025-05-10 21:11:26 - ERROR - stderr - +2025-05-10 21:11:26 - ERROR - stderr - +2025-05-10 21:11:26 - INFO - stdout - {'loss': 0.9777, 'grad_norm': 0.6800901889801025, 'learning_rate': 1.907725814264872e-05, 'epoch': 0.49} +2025-05-10 21:11:26 - ERROR - stderr - 16%|█▋ | 613/3741 [3:45:33<18:27:11, 21.24s/it] +2025-05-10 21:11:50 - ERROR - stderr - 16%|█▋ | 614/3741 [3:45:56<19:04:14, 21.96s/it] +2025-05-10 21:11:50 - ERROR - stderr - +2025-05-10 21:11:50 - ERROR - stderr - +2025-05-10 21:11:50 - INFO - stdout - {'loss': 0.9549, 'grad_norm': 0.6149044036865234, 'learning_rate': 1.9073621612015244e-05, 'epoch': 0.49} +2025-05-10 21:11:50 - ERROR - stderr - 16%|█▋ | 614/3741 [3:45:56<19:04:14, 21.96s/it] +2025-05-10 21:12:09 - ERROR - stderr - 16%|█▋ | 615/3741 [3:46:16<18:25:44, 21.22s/it] +2025-05-10 21:12:09 - ERROR - stderr - +2025-05-10 21:12:09 - ERROR - stderr - +2025-05-10 21:12:09 - INFO - stdout - {'loss': 0.9653, 'grad_norm': 0.7120502591133118, 'learning_rate': 1.9069978277671266e-05, 'epoch': 0.49} +2025-05-10 21:12:09 - ERROR - stderr - 16%|█▋ | 615/3741 [3:46:16<18:25:44, 21.22s/it] +2025-05-10 21:12:33 - ERROR - stderr - 16%|█▋ | 616/3741 [3:46:39<19:03:37, 21.96s/it] +2025-05-10 21:12:33 - ERROR - stderr - +2025-05-10 21:12:33 - ERROR - stderr - +2025-05-10 21:12:33 - INFO - stdout - {'loss': 0.9387, 'grad_norm': 0.59898442029953, 'learning_rate': 1.906632814234869e-05, 'epoch': 0.49} +2025-05-10 21:12:33 - ERROR - stderr - 16%|█▋ | 616/3741 [3:46:39<19:03:37, 21.96s/it] +2025-05-10 21:12:53 - ERROR - stderr - 16%|█▋ | 617/3741 [3:46:59<18:25:32, 21.23s/it] +2025-05-10 21:12:53 - ERROR - stderr - +2025-05-10 21:12:53 - ERROR - stderr - +2025-05-10 21:12:53 - INFO - stdout - {'loss': 0.9482, 'grad_norm': 0.6274296045303345, 'learning_rate': 1.9062671208784508e-05, 'epoch': 0.49} +2025-05-10 21:12:53 - ERROR - stderr - 16%|█▋ | 617/3741 [3:46:59<18:25:32, 21.23s/it] +2025-05-10 21:13:15 - ERROR - stderr - 17%|█▋ | 618/3741 [3:47:21<18:44:53, 21.61s/it] +2025-05-10 21:13:15 - ERROR - stderr - +2025-05-10 21:13:15 - ERROR - stderr - +2025-05-10 21:13:15 - INFO - stdout - {'loss': 0.9233, 'grad_norm': 0.6537023186683655, 'learning_rate': 1.9059007479720807e-05, 'epoch': 0.5} +2025-05-10 21:13:15 - ERROR - stderr - 17%|█▋ | 618/3741 [3:47:21<18:44:53, 21.61s/it] +2025-05-10 21:13:35 - ERROR - stderr - 17%|█▋ | 619/3741 [3:47:41<18:13:26, 21.01s/it] +2025-05-10 21:13:35 - ERROR - stderr - +2025-05-10 21:13:35 - ERROR - stderr - +2025-05-10 21:13:35 - INFO - stdout - {'loss': 0.9676, 'grad_norm': 0.6578821539878845, 'learning_rate': 1.905533695790479e-05, 'epoch': 0.5} +2025-05-10 21:13:35 - ERROR - stderr - 17%|█▋ | 619/3741 [3:47:41<18:13:26, 21.01s/it] +2025-05-10 21:13:55 - ERROR - stderr - 17%|█▋ | 620/3741 [3:48:01<17:56:19, 20.69s/it] +2025-05-10 21:13:55 - ERROR - stderr - +2025-05-10 21:13:55 - ERROR - stderr - +2025-05-10 21:13:55 - INFO - stdout - {'loss': 0.9104, 'grad_norm': 0.6332679986953735, 'learning_rate': 1.9051659646088726e-05, 'epoch': 0.5} +2025-05-10 21:13:55 - ERROR - stderr - 17%|█▋ | 620/3741 [3:48:01<17:56:19, 20.69s/it] +2025-05-10 21:14:15 - ERROR - stderr - 17%|█▋ | 621/3741 [3:48:21<17:53:11, 20.64s/it] +2025-05-10 21:14:15 - ERROR - stderr - +2025-05-10 21:14:15 - ERROR - stderr - +2025-05-10 21:14:15 - INFO - stdout - {'loss': 0.9788, 'grad_norm': 0.66425621509552, 'learning_rate': 1.9047975547029998e-05, 'epoch': 0.5} +2025-05-10 21:14:15 - ERROR - stderr - 17%|█▋ | 621/3741 [3:48:21<17:53:11, 20.64s/it] +2025-05-10 21:14:35 - ERROR - stderr - 17%|█▋ | 622/3741 [3:48:41<17:38:00, 20.35s/it] +2025-05-10 21:14:35 - ERROR - stderr - +2025-05-10 21:14:35 - ERROR - stderr - +2025-05-10 21:14:35 - INFO - stdout - {'loss': 0.9555, 'grad_norm': 0.6680029630661011, 'learning_rate': 1.9044284663491065e-05, 'epoch': 0.5} +2025-05-10 21:14:35 - ERROR - stderr - 17%|█▋ | 622/3741 [3:48:41<17:38:00, 20.35s/it] +2025-05-10 21:14:55 - ERROR - stderr - 17%|█▋ | 623/3741 [3:49:01<17:37:27, 20.35s/it] +2025-05-10 21:14:55 - ERROR - stderr - +2025-05-10 21:14:55 - ERROR - stderr - +2025-05-10 21:14:55 - INFO - stdout - {'loss': 0.988, 'grad_norm': 0.6043557524681091, 'learning_rate': 1.9040586998239472e-05, 'epoch': 0.5} +2025-05-10 21:14:55 - ERROR - stderr - 17%|█▋ | 623/3741 [3:49:01<17:37:27, 20.35s/it] +2025-05-10 21:15:15 - ERROR - stderr - 17%|█▋ | 624/3741 [3:49:22<17:36:54, 20.34s/it] +2025-05-10 21:15:16 - ERROR - stderr - +2025-05-10 21:15:16 - ERROR - stderr - +2025-05-10 21:15:16 - INFO - stdout - {'loss': 0.953, 'grad_norm': 0.6627247929573059, 'learning_rate': 1.903688255404786e-05, 'epoch': 0.5} +2025-05-10 21:15:16 - ERROR - stderr - 17%|█▋ | 624/3741 [3:49:22<17:36:54, 20.34s/it] +2025-05-10 21:15:37 - ERROR - stderr - 17%|█▋ | 625/3741 [3:49:43<17:56:58, 20.74s/it] +2025-05-10 21:15:37 - ERROR - stderr - +2025-05-10 21:15:37 - ERROR - stderr - +2025-05-10 21:15:37 - INFO - stdout - {'loss': 0.9308, 'grad_norm': 0.6448099613189697, 'learning_rate': 1.9033171333693952e-05, 'epoch': 0.5} +2025-05-10 21:15:37 - ERROR - stderr - 17%|█▋ | 625/3741 [3:49:43<17:56:58, 20.74s/it] +2025-05-10 21:15:57 - ERROR - stderr - 17%|█▋ | 626/3741 [3:50:03<17:40:16, 20.42s/it] +2025-05-10 21:15:57 - ERROR - stderr - +2025-05-10 21:15:57 - ERROR - stderr - +2025-05-10 21:15:57 - INFO - stdout - {'loss': 0.9421, 'grad_norm': 0.5838706493377686, 'learning_rate': 1.902945333996054e-05, 'epoch': 0.5} +2025-05-10 21:15:57 - ERROR - stderr - 17%|█▋ | 626/3741 [3:50:03<17:40:16, 20.42s/it] +2025-05-10 21:16:20 - ERROR - stderr - 17%|█▋ | 627/3741 [3:50:27<18:26:07, 21.31s/it] +2025-05-10 21:16:20 - ERROR - stderr - +2025-05-10 21:16:20 - ERROR - stderr - +2025-05-10 21:16:20 - INFO - stdout - {'loss': 0.9472, 'grad_norm': 0.6396023631095886, 'learning_rate': 1.9025728575635503e-05, 'epoch': 0.5} +2025-05-10 21:16:20 - ERROR - stderr - 17%|█▋ | 627/3741 [3:50:27<18:26:07, 21.31s/it] +2025-05-10 21:16:40 - ERROR - stderr - 17%|█▋ | 628/3741 [3:50:46<18:03:05, 20.88s/it] +2025-05-10 21:16:40 - ERROR - stderr - +2025-05-10 21:16:40 - ERROR - stderr - +2025-05-10 21:16:40 - INFO - stdout - {'loss': 0.9113, 'grad_norm': 0.5953710675239563, 'learning_rate': 1.9021997043511798e-05, 'epoch': 0.5} +2025-05-10 21:16:40 - ERROR - stderr - 17%|█▋ | 628/3741 [3:50:46<18:03:05, 20.88s/it] +2025-05-10 21:17:04 - ERROR - stderr - 17%|█▋ | 629/3741 [3:51:10<18:52:00, 21.83s/it] +2025-05-10 21:17:04 - ERROR - stderr - +2025-05-10 21:17:04 - ERROR - stderr - +2025-05-10 21:17:04 - INFO - stdout - {'loss': 0.9839, 'grad_norm': 0.7014410495758057, 'learning_rate': 1.9018258746387458e-05, 'epoch': 0.5} +2025-05-10 21:17:04 - ERROR - stderr - 17%|█▋ | 629/3741 [3:51:10<18:52:00, 21.83s/it] +2025-05-10 21:17:24 - ERROR - stderr - 17%|█▋ | 630/3741 [3:51:31<18:25:48, 21.33s/it] +2025-05-10 21:17:24 - ERROR - stderr - +2025-05-10 21:17:24 - ERROR - stderr - +2025-05-10 21:17:24 - INFO - stdout - {'loss': 0.9552, 'grad_norm': 0.6346995830535889, 'learning_rate': 1.901451368706558e-05, 'epoch': 0.51} +2025-05-10 21:17:24 - ERROR - stderr - 17%|█▋ | 630/3741 [3:51:31<18:25:48, 21.33s/it] +2025-05-10 21:17:48 - ERROR - stderr - 17%|█▋ | 631/3741 [3:51:54<19:01:13, 22.02s/it] +2025-05-10 21:17:48 - ERROR - stderr - +2025-05-10 21:17:48 - ERROR - stderr - +2025-05-10 21:17:48 - INFO - stdout - {'loss': 0.9407, 'grad_norm': 0.6501613855361938, 'learning_rate': 1.9010761868354336e-05, 'epoch': 0.51} +2025-05-10 21:17:48 - ERROR - stderr - 17%|█▋ | 631/3741 [3:51:54<19:01:13, 22.02s/it] +2025-05-10 21:18:08 - ERROR - stderr - 17%|█▋ | 632/3741 [3:52:14<18:29:23, 21.41s/it] +2025-05-10 21:18:08 - ERROR - stderr - +2025-05-10 21:18:08 - ERROR - stderr - +2025-05-10 21:18:08 - INFO - stdout - {'loss': 0.9881, 'grad_norm': 0.7061483860015869, 'learning_rate': 1.9007003293066973e-05, 'epoch': 0.51} +2025-05-10 21:18:08 - ERROR - stderr - 17%|█▋ | 632/3741 [3:52:14<18:29:23, 21.41s/it] +2025-05-10 21:18:31 - ERROR - stderr - 17%|█▋ | 633/3741 [3:52:38<19:01:02, 22.03s/it] +2025-05-10 21:18:31 - ERROR - stderr - +2025-05-10 21:18:31 - ERROR - stderr - +2025-05-10 21:18:31 - INFO - stdout - {'loss': 0.9514, 'grad_norm': 0.6285912394523621, 'learning_rate': 1.9003237964021796e-05, 'epoch': 0.51} +2025-05-10 21:18:31 - ERROR - stderr - 17%|█▋ | 633/3741 [3:52:38<19:01:02, 22.03s/it] +2025-05-10 21:18:51 - ERROR - stderr - 17%|█▋ | 634/3741 [3:52:57<18:23:24, 21.31s/it] +2025-05-10 21:18:51 - ERROR - stderr - +2025-05-10 21:18:51 - ERROR - stderr - +2025-05-10 21:18:51 - INFO - stdout - {'loss': 0.9336, 'grad_norm': 0.7684087753295898, 'learning_rate': 1.899946588404218e-05, 'epoch': 0.51} +2025-05-10 21:18:51 - ERROR - stderr - 17%|█▋ | 634/3741 [3:52:57<18:23:24, 21.31s/it] +2025-05-10 21:19:13 - ERROR - stderr - 17%|█▋ | 635/3741 [3:53:20<18:37:30, 21.59s/it] +2025-05-10 21:19:13 - ERROR - stderr - +2025-05-10 21:19:13 - ERROR - stderr - +2025-05-10 21:19:13 - INFO - stdout - {'loss': 0.8914, 'grad_norm': 0.7490344047546387, 'learning_rate': 1.8995687055956555e-05, 'epoch': 0.51} +2025-05-10 21:19:13 - ERROR - stderr - 17%|█▋ | 635/3741 [3:53:20<18:37:30, 21.59s/it] +2025-05-10 21:19:33 - ERROR - stderr - 17%|█▋ | 636/3741 [3:53:39<18:07:41, 21.02s/it] +2025-05-10 21:19:33 - ERROR - stderr - +2025-05-10 21:19:33 - ERROR - stderr - +2025-05-10 21:19:33 - INFO - stdout - {'loss': 0.9701, 'grad_norm': 0.8029311299324036, 'learning_rate': 1.8991901482598414e-05, 'epoch': 0.51} +2025-05-10 21:19:33 - ERROR - stderr - 17%|█▋ | 636/3741 [3:53:39<18:07:41, 21.02s/it] +2025-05-10 21:19:56 - ERROR - stderr - 17%|█▋ | 637/3741 [3:54:02<18:36:47, 21.59s/it] +2025-05-10 21:19:56 - ERROR - stderr - +2025-05-10 21:19:56 - ERROR - stderr - +2025-05-10 21:19:56 - INFO - stdout - {'loss': 0.9437, 'grad_norm': 0.6485514044761658, 'learning_rate': 1.8988109166806313e-05, 'epoch': 0.51} +2025-05-10 21:19:56 - ERROR - stderr - 17%|█▋ | 637/3741 [3:54:02<18:36:47, 21.59s/it] +2025-05-10 21:20:15 - ERROR - stderr - 17%|█▋ | 638/3741 [3:54:22<18:06:13, 21.00s/it] +2025-05-10 21:20:16 - ERROR - stderr - +2025-05-10 21:20:16 - ERROR - stderr - +2025-05-10 21:20:16 - INFO - stdout - {'loss': 0.9561, 'grad_norm': 0.6395050883293152, 'learning_rate': 1.8984310111423855e-05, 'epoch': 0.51} +2025-05-10 21:20:16 - ERROR - stderr - 17%|█▋ | 638/3741 [3:54:22<18:06:13, 21.00s/it] +2025-05-10 21:20:38 - ERROR - stderr - 17%|█▋ | 639/3741 [3:54:45<18:32:47, 21.52s/it] +2025-05-10 21:20:38 - ERROR - stderr - +2025-05-10 21:20:38 - ERROR - stderr - +2025-05-10 21:20:38 - INFO - stdout - {'loss': 0.9247, 'grad_norm': 0.6431874632835388, 'learning_rate': 1.8980504319299705e-05, 'epoch': 0.51} +2025-05-10 21:20:38 - ERROR - stderr - 17%|█▋ | 639/3741 [3:54:45<18:32:47, 21.52s/it] +2025-05-10 21:20:58 - ERROR - stderr - 17%|█▋ | 640/3741 [3:55:04<18:02:27, 20.94s/it] +2025-05-10 21:20:58 - ERROR - stderr - +2025-05-10 21:20:58 - ERROR - stderr - +2025-05-10 21:20:58 - INFO - stdout - {'loss': 0.9203, 'grad_norm': 0.675888180732727, 'learning_rate': 1.8976691793287575e-05, 'epoch': 0.51} +2025-05-10 21:20:58 - ERROR - stderr - 17%|█▋ | 640/3741 [3:55:04<18:02:27, 20.94s/it] +2025-05-10 21:21:20 - ERROR - stderr - 17%|█▋ | 641/3741 [3:55:26<18:19:33, 21.28s/it] +2025-05-10 21:21:20 - ERROR - stderr - +2025-05-10 21:21:20 - ERROR - stderr - +2025-05-10 21:21:20 - INFO - stdout - {'loss': 0.9709, 'grad_norm': 0.6630160212516785, 'learning_rate': 1.8972872536246224e-05, 'epoch': 0.51} +2025-05-10 21:21:20 - ERROR - stderr - 17%|█▋ | 641/3741 [3:55:26<18:19:33, 21.28s/it] +2025-05-10 21:21:39 - ERROR - stderr - 17%|█▋ | 642/3741 [3:55:46<17:50:57, 20.73s/it] +2025-05-10 21:21:39 - ERROR - stderr - +2025-05-10 21:21:39 - ERROR - stderr - +2025-05-10 21:21:39 - INFO - stdout - {'loss': 0.987, 'grad_norm': 0.6319396495819092, 'learning_rate': 1.8969046551039466e-05, 'epoch': 0.51} +2025-05-10 21:21:39 - ERROR - stderr - 17%|█▋ | 642/3741 [3:55:46<17:50:57, 20.73s/it] +2025-05-10 21:22:02 - ERROR - stderr - 17%|█▋ | 643/3741 [3:56:08<18:21:26, 21.33s/it] +2025-05-10 21:22:02 - ERROR - stderr - +2025-05-10 21:22:02 - ERROR - stderr - +2025-05-10 21:22:02 - INFO - stdout - {'loss': 0.9802, 'grad_norm': 0.6689966320991516, 'learning_rate': 1.8965213840536152e-05, 'epoch': 0.52} +2025-05-10 21:22:02 - ERROR - stderr - 17%|█▋ | 643/3741 [3:56:08<18:21:26, 21.33s/it] +2025-05-10 21:22:22 - ERROR - stderr - 17%|█▋ | 644/3741 [3:56:28<17:52:58, 20.79s/it] +2025-05-10 21:22:22 - ERROR - stderr - +2025-05-10 21:22:22 - ERROR - stderr - +2025-05-10 21:22:22 - INFO - stdout - {'loss': 0.9682, 'grad_norm': 0.6527170538902283, 'learning_rate': 1.8961374407610177e-05, 'epoch': 0.52} +2025-05-10 21:22:22 - ERROR - stderr - 17%|█▋ | 644/3741 [3:56:28<17:52:58, 20.79s/it] +2025-05-10 21:22:46 - ERROR - stderr - 17%|█▋ | 645/3741 [3:56:52<18:50:07, 21.90s/it] +2025-05-10 21:22:46 - ERROR - stderr - +2025-05-10 21:22:46 - ERROR - stderr - +2025-05-10 21:22:46 - INFO - stdout - {'loss': 0.9256, 'grad_norm': 0.5882049202919006, 'learning_rate': 1.8957528255140482e-05, 'epoch': 0.52} +2025-05-10 21:22:46 - ERROR - stderr - 17%|█▋ | 645/3741 [3:56:52<18:50:07, 21.90s/it] +2025-05-10 21:23:06 - ERROR - stderr - 17%|█▋ | 646/3741 [3:57:12<18:19:09, 21.31s/it] +2025-05-10 21:23:06 - ERROR - stderr - +2025-05-10 21:23:06 - ERROR - stderr - +2025-05-10 21:23:06 - INFO - stdout - {'loss': 0.9512, 'grad_norm': 0.6243289709091187, 'learning_rate': 1.895367538601104e-05, 'epoch': 0.52} +2025-05-10 21:23:06 - ERROR - stderr - 17%|█▋ | 646/3741 [3:57:12<18:19:09, 21.31s/it] +2025-05-10 21:23:30 - ERROR - stderr - 17%|█▋ | 647/3741 [3:57:36<18:56:40, 22.04s/it] +2025-05-10 21:23:30 - ERROR - stderr - +2025-05-10 21:23:30 - ERROR - stderr - +2025-05-10 21:23:30 - INFO - stdout - {'loss': 0.9402, 'grad_norm': 0.6396244764328003, 'learning_rate': 1.894981580311087e-05, 'epoch': 0.52} +2025-05-10 21:23:30 - ERROR - stderr - 17%|█▋ | 647/3741 [3:57:36<18:56:40, 22.04s/it] +2025-05-10 21:23:50 - ERROR - stderr - 17%|█▋ | 648/3741 [3:57:56<18:20:54, 21.36s/it] +2025-05-10 21:23:50 - ERROR - stderr - +2025-05-10 21:23:50 - ERROR - stderr - +2025-05-10 21:23:50 - INFO - stdout - {'loss': 0.9745, 'grad_norm': 0.6135784387588501, 'learning_rate': 1.8945949509334008e-05, 'epoch': 0.52} +2025-05-10 21:23:50 - ERROR - stderr - 17%|█▋ | 648/3741 [3:57:56<18:20:54, 21.36s/it] +2025-05-10 21:24:14 - ERROR - stderr - 17%|█▋ | 649/3741 [3:58:20<19:04:22, 22.21s/it] +2025-05-10 21:24:14 - ERROR - stderr - +2025-05-10 21:24:14 - ERROR - stderr - +2025-05-10 21:24:14 - INFO - stdout - {'loss': 0.929, 'grad_norm': 0.6294798851013184, 'learning_rate': 1.894207650757954e-05, 'epoch': 0.52} +2025-05-10 21:24:14 - ERROR - stderr - 17%|█▋ | 649/3741 [3:58:20<19:04:22, 22.21s/it] +2025-05-10 21:24:33 - ERROR - stderr - 17%|█▋ | 650/3741 [3:58:40<18:21:58, 21.39s/it] +2025-05-10 21:24:33 - ERROR - stderr - +2025-05-10 21:24:33 - ERROR - stderr - +2025-05-10 21:24:33 - INFO - stdout - {'loss': 0.9595, 'grad_norm': 0.6499471664428711, 'learning_rate': 1.8938196800751575e-05, 'epoch': 0.52} +2025-05-10 21:24:33 - ERROR - stderr - 17%|█▋ | 650/3741 [3:58:40<18:21:58, 21.39s/it] +2025-05-10 21:24:56 - ERROR - stderr - 17%|█▋ | 651/3741 [3:59:02<18:44:10, 21.83s/it] +2025-05-10 21:24:56 - ERROR - stderr - +2025-05-10 21:24:56 - ERROR - stderr - +2025-05-10 21:24:56 - INFO - stdout - {'loss': 0.9328, 'grad_norm': 0.6707350611686707, 'learning_rate': 1.8934310391759247e-05, 'epoch': 0.52} +2025-05-10 21:24:56 - ERROR - stderr - 17%|█▋ | 651/3741 [3:59:02<18:44:10, 21.83s/it] +2025-05-10 21:25:15 - ERROR - stderr - 17%|█▋ | 652/3741 [3:59:22<18:05:32, 21.09s/it] +2025-05-10 21:25:15 - ERROR - stderr - +2025-05-10 21:25:15 - ERROR - stderr - +2025-05-10 21:25:15 - INFO - stdout - {'loss': 0.8878, 'grad_norm': 0.6414554715156555, 'learning_rate': 1.8930417283516717e-05, 'epoch': 0.52} +2025-05-10 21:25:15 - ERROR - stderr - 17%|█▋ | 652/3741 [3:59:22<18:05:32, 21.09s/it] +2025-05-10 21:25:40 - ERROR - stderr - 17%|█▋ | 653/3741 [3:59:47<19:06:02, 22.27s/it] +2025-05-10 21:25:40 - ERROR - stderr - +2025-05-10 21:25:40 - ERROR - stderr - +2025-05-10 21:25:40 - INFO - stdout - {'loss': 0.9511, 'grad_norm': 0.6393246650695801, 'learning_rate': 1.892651747894317e-05, 'epoch': 0.52} +2025-05-10 21:25:40 - ERROR - stderr - 17%|█▋ | 653/3741 [3:59:47<19:06:02, 22.27s/it] +2025-05-10 21:26:00 - ERROR - stderr - 17%|█▋ | 654/3741 [4:00:06<18:21:44, 21.41s/it] +2025-05-10 21:26:00 - ERROR - stderr - +2025-05-10 21:26:00 - ERROR - stderr - +2025-05-10 21:26:00 - INFO - stdout - {'loss': 0.9845, 'grad_norm': 0.658134937286377, 'learning_rate': 1.892261098096282e-05, 'epoch': 0.52} +2025-05-10 21:26:00 - ERROR - stderr - 17%|█▋ | 654/3741 [4:00:06<18:21:44, 21.41s/it] +2025-05-10 21:26:23 - ERROR - stderr - 18%|█▊ | 655/3741 [4:00:29<18:48:12, 21.94s/it] +2025-05-10 21:26:23 - ERROR - stderr - +2025-05-10 21:26:23 - ERROR - stderr - +2025-05-10 21:26:23 - INFO - stdout - {'loss': 0.9655, 'grad_norm': 0.6066871881484985, 'learning_rate': 1.891869779250488e-05, 'epoch': 0.53} +2025-05-10 21:26:23 - ERROR - stderr - 18%|█▊ | 655/3741 [4:00:29<18:48:12, 21.94s/it] +2025-05-10 21:26:42 - ERROR - stderr - 18%|█▊ | 656/3741 [4:00:49<18:09:40, 21.19s/it] +2025-05-10 21:26:42 - ERROR - stderr - +2025-05-10 21:26:42 - ERROR - stderr - +2025-05-10 21:26:42 - INFO - stdout - {'loss': 0.9605, 'grad_norm': 0.6272629499435425, 'learning_rate': 1.8914777916503602e-05, 'epoch': 0.53} +2025-05-10 21:26:42 - ERROR - stderr - 18%|█▊ | 656/3741 [4:00:49<18:09:40, 21.19s/it] +2025-05-10 21:27:05 - ERROR - stderr - 18%|█▊ | 657/3741 [4:01:12<18:34:56, 21.69s/it] +2025-05-10 21:27:05 - ERROR - stderr - +2025-05-10 21:27:05 - ERROR - stderr - +2025-05-10 21:27:05 - INFO - stdout - {'loss': 0.8884, 'grad_norm': 0.6052728295326233, 'learning_rate': 1.8910851355898238e-05, 'epoch': 0.53} +2025-05-10 21:27:05 - ERROR - stderr - 18%|█▊ | 657/3741 [4:01:12<18:34:56, 21.69s/it] +2025-05-10 21:27:25 - ERROR - stderr - 18%|█▊ | 658/3741 [4:01:31<17:58:54, 21.00s/it] +2025-05-10 21:27:25 - ERROR - stderr - +2025-05-10 21:27:25 - ERROR - stderr - +2025-05-10 21:27:25 - INFO - stdout - {'loss': 0.9684, 'grad_norm': 0.6381072998046875, 'learning_rate': 1.8906918113633054e-05, 'epoch': 0.53} +2025-05-10 21:27:25 - ERROR - stderr - 18%|█▊ | 658/3741 [4:01:31<17:58:54, 21.00s/it] +2025-05-10 21:27:47 - ERROR - stderr - 18%|█▊ | 659/3741 [4:01:54<18:22:02, 21.45s/it] +2025-05-10 21:27:47 - ERROR - stderr - +2025-05-10 21:27:47 - ERROR - stderr - +2025-05-10 21:27:47 - INFO - stdout - {'loss': 0.8999, 'grad_norm': 0.6366999745368958, 'learning_rate': 1.8902978192657334e-05, 'epoch': 0.53} +2025-05-10 21:27:47 - ERROR - stderr - 18%|█▊ | 659/3741 [4:01:54<18:22:02, 21.45s/it] +2025-05-10 21:28:07 - ERROR - stderr - 18%|█▊ | 660/3741 [4:02:13<17:52:45, 20.89s/it] +2025-05-10 21:28:07 - ERROR - stderr - +2025-05-10 21:28:07 - ERROR - stderr - +2025-05-10 21:28:07 - INFO - stdout - {'loss': 0.9436, 'grad_norm': 0.6535215377807617, 'learning_rate': 1.8899031595925362e-05, 'epoch': 0.53} +2025-05-10 21:28:07 - ERROR - stderr - 18%|█▊ | 660/3741 [4:02:13<17:52:45, 20.89s/it] +2025-05-10 21:28:26 - ERROR - stderr - 18%|█▊ | 661/3741 [4:02:33<17:31:28, 20.48s/it] +2025-05-10 21:28:26 - ERROR - stderr - +2025-05-10 21:28:26 - ERROR - stderr - +2025-05-10 21:28:26 - INFO - stdout - {'loss': 0.9122, 'grad_norm': 0.6399298310279846, 'learning_rate': 1.8895078326396436e-05, 'epoch': 0.53} +2025-05-10 21:28:26 - ERROR - stderr - 18%|█▊ | 661/3741 [4:02:33<17:31:28, 20.48s/it] +2025-05-10 21:28:46 - ERROR - stderr - 18%|█▊ | 662/3741 [4:02:52<17:14:53, 20.17s/it] +2025-05-10 21:28:46 - ERROR - stderr - +2025-05-10 21:28:46 - ERROR - stderr - +2025-05-10 21:28:46 - INFO - stdout - {'loss': 0.9312, 'grad_norm': 0.6174817681312561, 'learning_rate': 1.8891118387034845e-05, 'epoch': 0.53} +2025-05-10 21:28:46 - ERROR - stderr - 18%|█▊ | 662/3741 [4:02:52<17:14:53, 20.17s/it] +2025-05-10 21:29:05 - ERROR - stderr - 18%|█▊ | 663/3741 [4:03:12<17:03:50, 19.96s/it] +2025-05-10 21:29:05 - ERROR - stderr - +2025-05-10 21:29:05 - ERROR - stderr - +2025-05-10 21:29:05 - INFO - stdout - {'loss': 0.9274, 'grad_norm': 0.6312207579612732, 'learning_rate': 1.888715178080989e-05, 'epoch': 0.53} +2025-05-10 21:29:05 - ERROR - stderr - 18%|█▊ | 663/3741 [4:03:12<17:03:50, 19.96s/it] +2025-05-10 21:29:25 - ERROR - stderr - 18%|█▊ | 664/3741 [4:03:32<17:06:08, 20.01s/it] +2025-05-10 21:29:25 - ERROR - stderr - +2025-05-10 21:29:25 - ERROR - stderr - +2025-05-10 21:29:25 - INFO - stdout - {'loss': 0.9038, 'grad_norm': 0.6061504483222961, 'learning_rate': 1.8883178510695868e-05, 'epoch': 0.53} +2025-05-10 21:29:25 - ERROR - stderr - 18%|█▊ | 664/3741 [4:03:32<17:06:08, 20.01s/it] +2025-05-10 21:29:45 - ERROR - stderr - 18%|█▊ | 665/3741 [4:03:51<16:56:21, 19.82s/it] +2025-05-10 21:29:45 - ERROR - stderr - +2025-05-10 21:29:45 - ERROR - stderr - +2025-05-10 21:29:45 - INFO - stdout - {'loss': 0.9193, 'grad_norm': 0.62549889087677, 'learning_rate': 1.8879198579672068e-05, 'epoch': 0.53} +2025-05-10 21:29:45 - ERROR - stderr - 18%|█▊ | 665/3741 [4:03:51<16:56:21, 19.82s/it] +2025-05-10 21:30:08 - ERROR - stderr - 18%|█▊ | 666/3741 [4:04:14<17:41:29, 20.71s/it] +2025-05-10 21:30:08 - ERROR - stderr - +2025-05-10 21:30:08 - ERROR - stderr - +2025-05-10 21:30:08 - INFO - stdout - {'loss': 0.931, 'grad_norm': 0.6522451043128967, 'learning_rate': 1.8875211990722785e-05, 'epoch': 0.53} +2025-05-10 21:30:08 - ERROR - stderr - 18%|█▊ | 666/3741 [4:04:14<17:41:29, 20.71s/it] +2025-05-10 21:30:27 - ERROR - stderr - 18%|█▊ | 667/3741 [4:04:33<17:23:13, 20.36s/it] +2025-05-10 21:30:27 - ERROR - stderr - +2025-05-10 21:30:27 - ERROR - stderr - +2025-05-10 21:30:27 - INFO - stdout - {'loss': 0.9345, 'grad_norm': 0.6099725365638733, 'learning_rate': 1.8871218746837294e-05, 'epoch': 0.53} +2025-05-10 21:30:27 - ERROR - stderr - 18%|█▊ | 667/3741 [4:04:33<17:23:13, 20.36s/it] +2025-05-10 21:30:50 - ERROR - stderr - 18%|█▊ | 668/3741 [4:04:56<18:01:06, 21.11s/it] +2025-05-10 21:30:50 - ERROR - stderr - +2025-05-10 21:30:50 - ERROR - stderr - +2025-05-10 21:30:50 - INFO - stdout - {'loss': 0.9469, 'grad_norm': 0.6159772872924805, 'learning_rate': 1.8867218851009862e-05, 'epoch': 0.54} +2025-05-10 21:30:50 - ERROR - stderr - 18%|█▊ | 668/3741 [4:04:56<18:01:06, 21.11s/it] +2025-05-10 21:31:10 - ERROR - stderr - 18%|█▊ | 669/3741 [4:05:16<17:39:09, 20.69s/it] +2025-05-10 21:31:10 - ERROR - stderr - +2025-05-10 21:31:10 - ERROR - stderr - +2025-05-10 21:31:10 - INFO - stdout - {'loss': 0.8725, 'grad_norm': 0.6051928400993347, 'learning_rate': 1.8863212306239753e-05, 'epoch': 0.54} +2025-05-10 21:31:10 - ERROR - stderr - 18%|█▊ | 669/3741 [4:05:16<17:39:09, 20.69s/it] +2025-05-10 21:31:33 - ERROR - stderr - 18%|█▊ | 670/3741 [4:05:40<18:27:14, 21.63s/it] +2025-05-10 21:31:33 - ERROR - stderr - +2025-05-10 21:31:33 - ERROR - stderr - +2025-05-10 21:31:33 - INFO - stdout - {'loss': 0.9943, 'grad_norm': 0.5804814100265503, 'learning_rate': 1.8859199115531213e-05, 'epoch': 0.54} +2025-05-10 21:31:33 - ERROR - stderr - 18%|█▊ | 670/3741 [4:05:40<18:27:14, 21.63s/it] +2025-05-10 21:31:53 - ERROR - stderr - 18%|█▊ | 671/3741 [4:05:59<17:57:10, 21.05s/it] +2025-05-10 21:31:53 - ERROR - stderr - +2025-05-10 21:31:53 - ERROR - stderr - +2025-05-10 21:31:53 - INFO - stdout - {'loss': 1.008, 'grad_norm': 0.6454379558563232, 'learning_rate': 1.8855179281893464e-05, 'epoch': 0.54} +2025-05-10 21:31:53 - ERROR - stderr - 18%|█▊ | 671/3741 [4:06:00<17:57:10, 21.05s/it] +2025-05-10 21:32:16 - ERROR - stderr - 18%|█▊ | 672/3741 [4:06:22<18:26:18, 21.63s/it] +2025-05-10 21:32:16 - ERROR - stderr - +2025-05-10 21:32:16 - ERROR - stderr - +2025-05-10 21:32:16 - INFO - stdout - {'loss': 0.9135, 'grad_norm': 0.5961251258850098, 'learning_rate': 1.8851152808340715e-05, 'epoch': 0.54} +2025-05-10 21:32:16 - ERROR - stderr - 18%|█▊ | 672/3741 [4:06:22<18:26:18, 21.63s/it] +2025-05-10 21:32:36 - ERROR - stderr - 18%|█▊ | 673/3741 [4:06:42<17:52:40, 20.98s/it] +2025-05-10 21:32:36 - ERROR - stderr - +2025-05-10 21:32:36 - ERROR - stderr - +2025-05-10 21:32:36 - INFO - stdout - {'loss': 0.9762, 'grad_norm': 0.644010066986084, 'learning_rate': 1.884711969789215e-05, 'epoch': 0.54} +2025-05-10 21:32:36 - ERROR - stderr - 18%|█▊ | 673/3741 [4:06:42<17:52:40, 20.98s/it] +2025-05-10 21:32:57 - ERROR - stderr - 18%|█▊ | 674/3741 [4:07:03<17:55:54, 21.05s/it] +2025-05-10 21:32:57 - ERROR - stderr - +2025-05-10 21:32:57 - ERROR - stderr - +2025-05-10 21:32:57 - INFO - stdout - {'loss': 0.9054, 'grad_norm': 0.6359036564826965, 'learning_rate': 1.884307995357194e-05, 'epoch': 0.54} +2025-05-10 21:32:57 - ERROR - stderr - 18%|█▊ | 674/3741 [4:07:03<17:55:54, 21.05s/it] +2025-05-10 21:33:17 - ERROR - stderr - 18%|█▊ | 675/3741 [4:07:23<17:37:03, 20.69s/it] +2025-05-10 21:33:17 - ERROR - stderr - +2025-05-10 21:33:17 - ERROR - stderr - +2025-05-10 21:33:17 - INFO - stdout - {'loss': 0.9705, 'grad_norm': 0.5981766581535339, 'learning_rate': 1.883903357840922e-05, 'epoch': 0.54} +2025-05-10 21:33:17 - ERROR - stderr - 18%|█▊ | 675/3741 [4:07:23<17:37:03, 20.69s/it] +2025-05-10 21:33:37 - ERROR - stderr - 18%|█▊ | 676/3741 [4:07:43<17:34:04, 20.63s/it] +2025-05-10 21:33:37 - ERROR - stderr - +2025-05-10 21:33:37 - ERROR - stderr - +2025-05-10 21:33:37 - INFO - stdout - {'loss': 0.9594, 'grad_norm': 0.6233227849006653, 'learning_rate': 1.8834980575438094e-05, 'epoch': 0.54} +2025-05-10 21:33:37 - ERROR - stderr - 18%|█▊ | 676/3741 [4:07:44<17:34:04, 20.63s/it] +2025-05-10 21:33:57 - ERROR - stderr - 18%|█▊ | 677/3741 [4:08:03<17:14:20, 20.25s/it] +2025-05-10 21:33:57 - ERROR - stderr - +2025-05-10 21:33:57 - ERROR - stderr - +2025-05-10 21:33:57 - INFO - stdout - {'loss': 0.9626, 'grad_norm': 0.6139412522315979, 'learning_rate': 1.883092094769765e-05, 'epoch': 0.54} +2025-05-10 21:33:57 - ERROR - stderr - 18%|█▊ | 677/3741 [4:08:03<17:14:20, 20.25s/it] +2025-05-10 21:34:16 - ERROR - stderr - 18%|█▊ | 678/3741 [4:08:22<17:02:55, 20.04s/it] +2025-05-10 21:34:16 - ERROR - stderr - +2025-05-10 21:34:16 - ERROR - stderr - +2025-05-10 21:34:16 - INFO - stdout - {'loss': 0.9812, 'grad_norm': 0.6309959292411804, 'learning_rate': 1.882685469823193e-05, 'epoch': 0.54} +2025-05-10 21:34:16 - ERROR - stderr - 18%|█▊ | 678/3741 [4:08:22<17:02:55, 20.04s/it] +2025-05-10 21:34:39 - ERROR - stderr - 18%|█▊ | 679/3741 [4:08:45<17:40:02, 20.77s/it] +2025-05-10 21:34:39 - ERROR - stderr - +2025-05-10 21:34:39 - ERROR - stderr - +2025-05-10 21:34:39 - INFO - stdout - {'loss': 0.9537, 'grad_norm': 0.6182360649108887, 'learning_rate': 1.882278183008995e-05, 'epoch': 0.54} +2025-05-10 21:34:39 - ERROR - stderr - 18%|█▊ | 679/3741 [4:08:45<17:40:02, 20.77s/it] +2025-05-10 21:34:58 - ERROR - stderr - 18%|█▊ | 680/3741 [4:09:04<17:17:49, 20.34s/it] +2025-05-10 21:34:58 - ERROR - stderr - +2025-05-10 21:34:58 - ERROR - stderr - +2025-05-10 21:34:58 - INFO - stdout - {'loss': 0.9611, 'grad_norm': 0.6408948302268982, 'learning_rate': 1.881870234632568e-05, 'epoch': 0.55} +2025-05-10 21:34:58 - ERROR - stderr - 18%|█▊ | 680/3741 [4:09:04<17:17:49, 20.34s/it] +2025-05-10 21:35:22 - ERROR - stderr - 18%|█▊ | 681/3741 [4:09:28<18:15:38, 21.48s/it] +2025-05-10 21:35:22 - ERROR - stderr - +2025-05-10 21:35:22 - ERROR - stderr - +2025-05-10 21:35:22 - INFO - stdout - {'loss': 0.9661, 'grad_norm': 0.6162562966346741, 'learning_rate': 1.8814616249998063e-05, 'epoch': 0.55} +2025-05-10 21:35:22 - ERROR - stderr - 18%|█▊ | 681/3741 [4:09:28<18:15:38, 21.48s/it] +2025-05-10 21:35:42 - ERROR - stderr - 18%|█▊ | 682/3741 [4:09:48<17:46:25, 20.92s/it] +2025-05-10 21:35:42 - ERROR - stderr - +2025-05-10 21:35:42 - ERROR - stderr - +2025-05-10 21:35:42 - INFO - stdout - {'loss': 0.9394, 'grad_norm': 0.6035715341567993, 'learning_rate': 1.8810523544170986e-05, 'epoch': 0.55} +2025-05-10 21:35:42 - ERROR - stderr - 18%|█▊ | 682/3741 [4:09:48<17:46:25, 20.92s/it] +2025-05-10 21:36:06 - ERROR - stderr - 18%|█▊ | 683/3741 [4:10:13<18:41:36, 22.01s/it] +2025-05-10 21:36:06 - ERROR - stderr - +2025-05-10 21:36:06 - ERROR - stderr - +2025-05-10 21:36:06 - INFO - stdout - {'loss': 0.9357, 'grad_norm': 0.644282877445221, 'learning_rate': 1.88064242319133e-05, 'epoch': 0.55} +2025-05-10 21:36:06 - ERROR - stderr - 18%|█▊ | 683/3741 [4:10:13<18:41:36, 22.01s/it] +2025-05-10 21:36:26 - ERROR - stderr - 18%|█▊ | 684/3741 [4:10:32<18:03:57, 21.27s/it] +2025-05-10 21:36:26 - ERROR - stderr - +2025-05-10 21:36:26 - ERROR - stderr - +2025-05-10 21:36:26 - INFO - stdout - {'loss': 0.9142, 'grad_norm': 0.6356081962585449, 'learning_rate': 1.8802318316298817e-05, 'epoch': 0.55} +2025-05-10 21:36:26 - ERROR - stderr - 18%|█▊ | 684/3741 [4:10:32<18:03:57, 21.27s/it] +2025-05-10 21:36:45 - ERROR - stderr - 18%|█▊ | 685/3741 [4:10:52<17:39:09, 20.80s/it] +2025-05-10 21:36:45 - ERROR - stderr - +2025-05-10 21:36:45 - ERROR - stderr - +2025-05-10 21:36:45 - INFO - stdout - {'loss': 0.928, 'grad_norm': 0.6344892978668213, 'learning_rate': 1.8798205800406283e-05, 'epoch': 0.55} +2025-05-10 21:36:45 - ERROR - stderr - 18%|█▊ | 685/3741 [4:10:52<17:39:09, 20.80s/it] +2025-05-10 21:36:46 - INFO - stdout - WARNING: tokenization mismatch: 1 vs. 3126. (ignored) +2025-05-10 21:37:06 - ERROR - stderr - 18%|█▊ | 686/3741 [4:11:12<17:30:38, 20.63s/it] +2025-05-10 21:37:06 - ERROR - stderr - +2025-05-10 21:37:06 - ERROR - stderr - +2025-05-10 21:37:06 - INFO - stdout - {'loss': 0.9173, 'grad_norm': 0.6985346674919128, 'learning_rate': 1.8794086687319405e-05, 'epoch': 0.55} +2025-05-10 21:37:06 - ERROR - stderr - 18%|█▊ | 686/3741 [4:11:12<17:30:38, 20.63s/it] +2025-05-10 21:37:25 - ERROR - stderr - 18%|█▊ | 687/3741 [4:11:32<17:13:06, 20.30s/it] +2025-05-10 21:37:25 - ERROR - stderr - +2025-05-10 21:37:25 - ERROR - stderr - +2025-05-10 21:37:25 - INFO - stdout - {'loss': 0.9559, 'grad_norm': 0.6084068417549133, 'learning_rate': 1.8789960980126836e-05, 'epoch': 0.55} +2025-05-10 21:37:25 - ERROR - stderr - 18%|█▊ | 687/3741 [4:11:32<17:13:06, 20.30s/it] +2025-05-10 21:37:51 - ERROR - stderr - 18%|█▊ | 688/3741 [4:11:57<18:35:07, 21.92s/it] +2025-05-10 21:37:51 - ERROR - stderr - +2025-05-10 21:37:51 - ERROR - stderr - +2025-05-10 21:37:51 - INFO - stdout - {'loss': 0.9761, 'grad_norm': 0.6394224166870117, 'learning_rate': 1.8785828681922176e-05, 'epoch': 0.55} +2025-05-10 21:37:51 - ERROR - stderr - 18%|█▊ | 688/3741 [4:11:57<18:35:07, 21.92s/it] +2025-05-10 21:38:10 - ERROR - stderr - 18%|█▊ | 689/3741 [4:12:17<17:55:03, 21.13s/it] +2025-05-10 21:38:10 - ERROR - stderr - +2025-05-10 21:38:10 - ERROR - stderr - +2025-05-10 21:38:10 - INFO - stdout - {'loss': 0.9265, 'grad_norm': 0.637833833694458, 'learning_rate': 1.8781689795803954e-05, 'epoch': 0.55} +2025-05-10 21:38:10 - ERROR - stderr - 18%|█▊ | 689/3741 [4:12:17<17:55:03, 21.13s/it] +2025-05-10 21:38:35 - ERROR - stderr - 18%|█▊ | 690/3741 [4:12:42<18:56:17, 22.35s/it] +2025-05-10 21:38:35 - ERROR - stderr - +2025-05-10 21:38:35 - ERROR - stderr - +2025-05-10 21:38:35 - INFO - stdout - {'loss': 0.9381, 'grad_norm': 0.631105899810791, 'learning_rate': 1.8777544324875653e-05, 'epoch': 0.55} +2025-05-10 21:38:35 - ERROR - stderr - 18%|█▊ | 690/3741 [4:12:42<18:56:17, 22.35s/it] +2025-05-10 21:38:55 - ERROR - stderr - 18%|█▊ | 691/3741 [4:13:02<18:18:30, 21.61s/it] +2025-05-10 21:38:55 - ERROR - stderr - +2025-05-10 21:38:55 - ERROR - stderr - +2025-05-10 21:38:55 - INFO - stdout - {'loss': 0.938, 'grad_norm': 0.6532770991325378, 'learning_rate': 1.8773392272245687e-05, 'epoch': 0.55} +2025-05-10 21:38:55 - ERROR - stderr - 18%|█▊ | 691/3741 [4:13:02<18:18:30, 21.61s/it] +2025-05-10 21:39:20 - ERROR - stderr - 18%|█▊ | 692/3741 [4:13:26<19:06:39, 22.56s/it] +2025-05-10 21:39:20 - ERROR - stderr - +2025-05-10 21:39:20 - ERROR - stderr - +2025-05-10 21:39:20 - INFO - stdout - {'loss': 0.9557, 'grad_norm': 0.653390109539032, 'learning_rate': 1.8769233641027406e-05, 'epoch': 0.55} +2025-05-10 21:39:20 - ERROR - stderr - 18%|█▊ | 692/3741 [4:13:26<19:06:39, 22.56s/it] +2025-05-10 21:39:40 - ERROR - stderr - 19%|█▊ | 693/3741 [4:13:46<18:25:31, 21.76s/it] +2025-05-10 21:39:40 - ERROR - stderr - +2025-05-10 21:39:40 - ERROR - stderr - +2025-05-10 21:39:40 - INFO - stdout - {'loss': 0.8861, 'grad_norm': 0.6561689376831055, 'learning_rate': 1.8765068434339095e-05, 'epoch': 0.56} +2025-05-10 21:39:40 - ERROR - stderr - 19%|█▊ | 693/3741 [4:13:46<18:25:31, 21.76s/it] +2025-05-10 21:40:00 - ERROR - stderr - 19%|█▊ | 694/3741 [4:14:07<18:02:37, 21.32s/it] +2025-05-10 21:40:00 - ERROR - stderr - +2025-05-10 21:40:00 - ERROR - stderr - +2025-05-10 21:40:00 - INFO - stdout - {'loss': 0.9646, 'grad_norm': 0.6327289342880249, 'learning_rate': 1.8760896655303968e-05, 'epoch': 0.56} +2025-05-10 21:40:00 - ERROR - stderr - 19%|█▊ | 694/3741 [4:14:07<18:02:37, 21.32s/it] +2025-05-10 21:40:20 - ERROR - stderr - 19%|█▊ | 695/3741 [4:14:26<17:34:54, 20.78s/it] +2025-05-10 21:40:20 - ERROR - stderr - +2025-05-10 21:40:20 - ERROR - stderr - +2025-05-10 21:40:20 - INFO - stdout - {'loss': 0.9892, 'grad_norm': 0.6778370141983032, 'learning_rate': 1.875671830705016e-05, 'epoch': 0.56} +2025-05-10 21:40:20 - ERROR - stderr - 19%|█▊ | 695/3741 [4:14:26<17:34:54, 20.78s/it] +2025-05-10 21:40:40 - ERROR - stderr - 19%|█▊ | 696/3741 [4:14:46<17:24:39, 20.58s/it] +2025-05-10 21:40:40 - ERROR - stderr - +2025-05-10 21:40:40 - ERROR - stderr - +2025-05-10 21:40:40 - INFO - stdout - {'loss': 0.9706, 'grad_norm': 0.6418762803077698, 'learning_rate': 1.875253339271075e-05, 'epoch': 0.56} +2025-05-10 21:40:40 - ERROR - stderr - 19%|█▊ | 696/3741 [4:14:46<17:24:39, 20.58s/it] +2025-05-10 21:41:04 - ERROR - stderr - 19%|█▊ | 697/3741 [4:15:10<18:19:00, 21.66s/it] +2025-05-10 21:41:04 - ERROR - stderr - +2025-05-10 21:41:04 - ERROR - stderr - +2025-05-10 21:41:04 - INFO - stdout - {'loss': 0.9193, 'grad_norm': 0.6594840884208679, 'learning_rate': 1.8748341915423723e-05, 'epoch': 0.56} +2025-05-10 21:41:04 - ERROR - stderr - 19%|█▊ | 697/3741 [4:15:10<18:19:00, 21.66s/it] +2025-05-10 21:41:24 - ERROR - stderr - 19%|█▊ | 698/3741 [4:15:30<17:48:30, 21.07s/it] +2025-05-10 21:41:24 - ERROR - stderr - +2025-05-10 21:41:24 - ERROR - stderr - +2025-05-10 21:41:24 - INFO - stdout - {'loss': 0.9528, 'grad_norm': 0.6546462178230286, 'learning_rate': 1.874414387833199e-05, 'epoch': 0.56} +2025-05-10 21:41:24 - ERROR - stderr - 19%|█▊ | 698/3741 [4:15:30<17:48:30, 21.07s/it] +2025-05-10 21:41:48 - ERROR - stderr - 19%|█▊ | 699/3741 [4:15:54<18:39:27, 22.08s/it] +2025-05-10 21:41:48 - ERROR - stderr - +2025-05-10 21:41:48 - ERROR - stderr - +2025-05-10 21:41:48 - INFO - stdout - {'loss': 0.9301, 'grad_norm': 0.666907548904419, 'learning_rate': 1.8739939284583385e-05, 'epoch': 0.56} +2025-05-10 21:41:48 - ERROR - stderr - 19%|█▊ | 699/3741 [4:15:55<18:39:27, 22.08s/it] +2025-05-10 21:42:08 - ERROR - stderr - 19%|█▊ | 700/3741 [4:16:14<17:57:50, 21.27s/it] +2025-05-10 21:42:08 - ERROR - stderr - +2025-05-10 21:42:08 - ERROR - stderr - +2025-05-10 21:42:08 - INFO - stdout - {'loss': 0.9624, 'grad_norm': 0.6767510771751404, 'learning_rate': 1.873572813733066e-05, 'epoch': 0.56} +2025-05-10 21:42:08 - ERROR - stderr - 19%|█▊ | 700/3741 [4:16:14<17:57:50, 21.27s/it] +2025-05-10 21:42:32 - ERROR - stderr - 19%|█▊ | 701/3741 [4:16:38<18:48:24, 22.27s/it] +2025-05-10 21:42:32 - ERROR - stderr - +2025-05-10 21:42:32 - ERROR - stderr - +2025-05-10 21:42:32 - INFO - stdout - {'loss': 0.9672, 'grad_norm': 0.6574323773384094, 'learning_rate': 1.8731510439731465e-05, 'epoch': 0.56} +2025-05-10 21:42:32 - ERROR - stderr - 19%|█▊ | 701/3741 [4:16:39<18:48:24, 22.27s/it] +2025-05-10 21:42:51 - ERROR - stderr - 19%|█▉ | 702/3741 [4:16:58<18:02:32, 21.37s/it] +2025-05-10 21:42:51 - ERROR - stderr - +2025-05-10 21:42:51 - ERROR - stderr - +2025-05-10 21:42:51 - INFO - stdout - {'loss': 0.9405, 'grad_norm': 0.6924127340316772, 'learning_rate': 1.872728619494838e-05, 'epoch': 0.56} +2025-05-10 21:42:51 - ERROR - stderr - 19%|█▉ | 702/3741 [4:16:58<18:02:32, 21.37s/it] +2025-05-10 21:43:13 - ERROR - stderr - 19%|█▉ | 703/3741 [4:17:19<18:02:55, 21.39s/it] +2025-05-10 21:43:13 - ERROR - stderr - +2025-05-10 21:43:13 - ERROR - stderr - +2025-05-10 21:43:13 - INFO - stdout - {'loss': 0.9477, 'grad_norm': 0.6515429615974426, 'learning_rate': 1.8723055406148894e-05, 'epoch': 0.56} +2025-05-10 21:43:13 - ERROR - stderr - 19%|█▉ | 703/3741 [4:17:19<18:02:55, 21.39s/it] +2025-05-10 21:43:32 - ERROR - stderr - 19%|█▉ | 704/3741 [4:17:39<17:33:46, 20.82s/it] +2025-05-10 21:43:32 - ERROR - stderr - +2025-05-10 21:43:32 - ERROR - stderr - +2025-05-10 21:43:32 - INFO - stdout - {'loss': 0.9403, 'grad_norm': 0.7073892951011658, 'learning_rate': 1.8718818076505385e-05, 'epoch': 0.56} +2025-05-10 21:43:32 - ERROR - stderr - 19%|█▉ | 704/3741 [4:17:39<17:33:46, 20.82s/it] +2025-05-10 21:43:52 - ERROR - stderr - 19%|█▉ | 705/3741 [4:17:58<17:15:55, 20.47s/it] +2025-05-10 21:43:52 - ERROR - stderr - +2025-05-10 21:43:52 - ERROR - stderr - +2025-05-10 21:43:52 - INFO - stdout - {'loss': 0.9704, 'grad_norm': 0.706851065158844, 'learning_rate': 1.8714574209195153e-05, 'epoch': 0.57} +2025-05-10 21:43:52 - ERROR - stderr - 19%|█▉ | 705/3741 [4:17:58<17:15:55, 20.47s/it] +2025-05-10 21:44:15 - ERROR - stderr - 19%|█▉ | 706/3741 [4:18:21<17:54:19, 21.24s/it] +2025-05-10 21:44:15 - ERROR - stderr - +2025-05-10 21:44:15 - ERROR - stderr - +2025-05-10 21:44:15 - INFO - stdout - {'loss': 0.9558, 'grad_norm': 0.624336838722229, 'learning_rate': 1.8710323807400393e-05, 'epoch': 0.57} +2025-05-10 21:44:15 - ERROR - stderr - 19%|█▉ | 706/3741 [4:18:21<17:54:19, 21.24s/it] +2025-05-10 21:44:34 - ERROR - stderr - 19%|█▉ | 707/3741 [4:18:41<17:26:32, 20.70s/it] +2025-05-10 21:44:34 - ERROR - stderr - +2025-05-10 21:44:34 - ERROR - stderr - +2025-05-10 21:44:34 - INFO - stdout - {'loss': 0.9467, 'grad_norm': 0.6605740785598755, 'learning_rate': 1.8706066874308205e-05, 'epoch': 0.57} +2025-05-10 21:44:34 - ERROR - stderr - 19%|█▉ | 707/3741 [4:18:41<17:26:32, 20.70s/it] +2025-05-10 21:44:58 - ERROR - stderr - 19%|█▉ | 708/3741 [4:19:05<18:13:13, 21.63s/it] +2025-05-10 21:44:58 - ERROR - stderr - +2025-05-10 21:44:58 - ERROR - stderr - +2025-05-10 21:44:58 - INFO - stdout - {'loss': 0.9277, 'grad_norm': 0.7018135190010071, 'learning_rate': 1.870180341311057e-05, 'epoch': 0.57} +2025-05-10 21:44:58 - ERROR - stderr - 19%|█▉ | 708/3741 [4:19:05<18:13:13, 21.63s/it] +2025-05-10 21:45:18 - ERROR - stderr - 19%|█▉ | 709/3741 [4:19:24<17:36:46, 20.91s/it] +2025-05-10 21:45:18 - ERROR - stderr - +2025-05-10 21:45:18 - ERROR - stderr - +2025-05-10 21:45:18 - INFO - stdout - {'loss': 0.9706, 'grad_norm': 0.6792058348655701, 'learning_rate': 1.8697533427004395e-05, 'epoch': 0.57} +2025-05-10 21:45:18 - ERROR - stderr - 19%|█▉ | 709/3741 [4:19:24<17:36:46, 20.91s/it] +2025-05-10 21:45:42 - ERROR - stderr - 19%|█▉ | 710/3741 [4:19:48<18:29:34, 21.96s/it] +2025-05-10 21:45:42 - ERROR - stderr - +2025-05-10 21:45:42 - ERROR - stderr - +2025-05-10 21:45:42 - INFO - stdout - {'loss': 0.9426, 'grad_norm': 0.6452786326408386, 'learning_rate': 1.8693256919191446e-05, 'epoch': 0.57} +2025-05-10 21:45:42 - ERROR - stderr - 19%|█▉ | 710/3741 [4:19:48<18:29:34, 21.96s/it] +2025-05-10 21:46:01 - ERROR - stderr - 19%|█▉ | 711/3741 [4:20:08<17:51:05, 21.21s/it] +2025-05-10 21:46:01 - ERROR - stderr - +2025-05-10 21:46:01 - ERROR - stderr - +2025-05-10 21:46:01 - INFO - stdout - {'loss': 0.9299, 'grad_norm': 0.7065607309341431, 'learning_rate': 1.8688973892878405e-05, 'epoch': 0.57} +2025-05-10 21:46:01 - ERROR - stderr - 19%|█▉ | 711/3741 [4:20:08<17:51:05, 21.21s/it] +2025-05-10 21:46:21 - ERROR - stderr - 19%|█▉ | 712/3741 [4:20:28<17:29:37, 20.79s/it] +2025-05-10 21:46:21 - ERROR - stderr - +2025-05-10 21:46:21 - ERROR - stderr - +2025-05-10 21:46:21 - INFO - stdout - {'loss': 0.9521, 'grad_norm': 0.6309828758239746, 'learning_rate': 1.8684684351276822e-05, 'epoch': 0.57} +2025-05-10 21:46:21 - ERROR - stderr - 19%|█▉ | 712/3741 [4:20:28<17:29:37, 20.79s/it] +2025-05-10 21:46:42 - ERROR - stderr - 19%|█▉ | 713/3741 [4:20:48<17:29:45, 20.80s/it] +2025-05-10 21:46:42 - ERROR - stderr - +2025-05-10 21:46:42 - ERROR - stderr - +2025-05-10 21:46:42 - INFO - stdout - {'loss': 0.9899, 'grad_norm': 0.6651354432106018, 'learning_rate': 1.868038829760314e-05, 'epoch': 0.57} +2025-05-10 21:46:42 - ERROR - stderr - 19%|█▉ | 713/3741 [4:20:48<17:29:45, 20.80s/it] +2025-05-10 21:47:02 - ERROR - stderr - 19%|█▉ | 714/3741 [4:21:08<17:13:22, 20.48s/it] +2025-05-10 21:47:02 - ERROR - stderr - +2025-05-10 21:47:02 - ERROR - stderr - +2025-05-10 21:47:02 - INFO - stdout - {'loss': 0.9125, 'grad_norm': 0.6422202587127686, 'learning_rate': 1.8676085735078696e-05, 'epoch': 0.57} +2025-05-10 21:47:02 - ERROR - stderr - 19%|█▉ | 714/3741 [4:21:08<17:13:22, 20.48s/it] +2025-05-10 21:47:25 - ERROR - stderr - 19%|█▉ | 715/3741 [4:21:32<17:59:33, 21.41s/it] +2025-05-10 21:47:25 - ERROR - stderr - +2025-05-10 21:47:25 - ERROR - stderr - +2025-05-10 21:47:25 - INFO - stdout - {'loss': 0.9854, 'grad_norm': 0.6563206315040588, 'learning_rate': 1.8671776666929694e-05, 'epoch': 0.57} +2025-05-10 21:47:25 - ERROR - stderr - 19%|█▉ | 715/3741 [4:21:32<17:59:33, 21.41s/it] +2025-05-10 21:47:45 - ERROR - stderr - 19%|█▉ | 716/3741 [4:21:52<17:36:01, 20.95s/it] +2025-05-10 21:47:45 - ERROR - stderr - +2025-05-10 21:47:45 - ERROR - stderr - +2025-05-10 21:47:45 - INFO - stdout - {'loss': 0.9754, 'grad_norm': 0.5996401906013489, 'learning_rate': 1.8667461096387217e-05, 'epoch': 0.57} +2025-05-10 21:47:45 - ERROR - stderr - 19%|█▉ | 716/3741 [4:21:52<17:36:01, 20.95s/it] +2025-05-10 21:48:08 - ERROR - stderr - 19%|█▉ | 717/3741 [4:22:14<18:06:07, 21.55s/it] +2025-05-10 21:48:08 - ERROR - stderr - +2025-05-10 21:48:08 - ERROR - stderr - +2025-05-10 21:48:08 - INFO - stdout - {'loss': 0.9524, 'grad_norm': 0.5983526706695557, 'learning_rate': 1.866313902668723e-05, 'epoch': 0.57} +2025-05-10 21:48:08 - ERROR - stderr - 19%|█▉ | 717/3741 [4:22:14<18:06:07, 21.55s/it] +2025-05-10 21:48:28 - ERROR - stderr - 19%|█▉ | 718/3741 [4:22:34<17:35:29, 20.95s/it] +2025-05-10 21:48:28 - ERROR - stderr - +2025-05-10 21:48:28 - ERROR - stderr - +2025-05-10 21:48:28 - INFO - stdout - {'loss': 0.8934, 'grad_norm': 0.5966793298721313, 'learning_rate': 1.8658810461070566e-05, 'epoch': 0.58} +2025-05-10 21:48:28 - ERROR - stderr - 19%|█▉ | 718/3741 [4:22:34<17:35:29, 20.95s/it] +2025-05-10 21:48:48 - ERROR - stderr - 19%|█▉ | 719/3741 [4:22:54<17:20:19, 20.65s/it] +2025-05-10 21:48:48 - ERROR - stderr - +2025-05-10 21:48:48 - ERROR - stderr - +2025-05-10 21:48:48 - INFO - stdout - {'loss': 0.9368, 'grad_norm': 0.636679470539093, 'learning_rate': 1.865447540278293e-05, 'epoch': 0.58} +2025-05-10 21:48:48 - ERROR - stderr - 19%|█▉ | 719/3741 [4:22:54<17:20:19, 20.65s/it] +2025-05-10 21:49:07 - ERROR - stderr - 19%|█▉ | 720/3741 [4:23:13<17:01:43, 20.29s/it] +2025-05-10 21:49:07 - ERROR - stderr - +2025-05-10 21:49:07 - ERROR - stderr - +2025-05-10 21:49:07 - INFO - stdout - {'loss': 0.9498, 'grad_norm': 0.6102825403213501, 'learning_rate': 1.8650133855074905e-05, 'epoch': 0.58} +2025-05-10 21:49:07 - ERROR - stderr - 19%|█▉ | 720/3741 [4:23:13<17:01:43, 20.29s/it] +2025-05-10 21:49:27 - ERROR - stderr - 19%|█▉ | 721/3741 [4:23:33<16:49:17, 20.05s/it] +2025-05-10 21:49:27 - ERROR - stderr - +2025-05-10 21:49:27 - ERROR - stderr - +2025-05-10 21:49:27 - INFO - stdout - {'loss': 0.9235, 'grad_norm': 0.6652585864067078, 'learning_rate': 1.8645785821201918e-05, 'epoch': 0.58} +2025-05-10 21:49:27 - ERROR - stderr - 19%|█▉ | 721/3741 [4:23:33<16:49:17, 20.05s/it] +2025-05-10 21:49:51 - ERROR - stderr - 19%|█▉ | 722/3741 [4:23:57<17:56:05, 21.39s/it] +2025-05-10 21:49:51 - ERROR - stderr - +2025-05-10 21:49:51 - ERROR - stderr - +2025-05-10 21:49:51 - INFO - stdout - {'loss': 0.965, 'grad_norm': 0.6623063087463379, 'learning_rate': 1.864143130442428e-05, 'epoch': 0.58} +2025-05-10 21:49:51 - ERROR - stderr - 19%|█▉ | 722/3741 [4:23:57<17:56:05, 21.39s/it] +2025-05-10 21:50:11 - ERROR - stderr - 19%|█▉ | 723/3741 [4:24:17<17:26:17, 20.80s/it] +2025-05-10 21:50:11 - ERROR - stderr - +2025-05-10 21:50:11 - ERROR - stderr - +2025-05-10 21:50:11 - INFO - stdout - {'loss': 0.9354, 'grad_norm': 0.6263592839241028, 'learning_rate': 1.8637070308007156e-05, 'epoch': 0.58} +2025-05-10 21:50:11 - ERROR - stderr - 19%|█▉ | 723/3741 [4:24:17<17:26:17, 20.80s/it] +2025-05-10 21:50:34 - ERROR - stderr - 19%|█▉ | 724/3741 [4:24:40<18:01:55, 21.52s/it] +2025-05-10 21:50:34 - ERROR - stderr - +2025-05-10 21:50:34 - ERROR - stderr - +2025-05-10 21:50:34 - INFO - stdout - {'loss': 0.9467, 'grad_norm': 0.6858724355697632, 'learning_rate': 1.8632702835220572e-05, 'epoch': 0.58} +2025-05-10 21:50:34 - ERROR - stderr - 19%|█▉ | 724/3741 [4:24:40<18:01:55, 21.52s/it] +2025-05-10 21:50:53 - ERROR - stderr - 19%|█▉ | 725/3741 [4:25:00<17:30:23, 20.90s/it] +2025-05-10 21:50:53 - ERROR - stderr - +2025-05-10 21:50:53 - ERROR - stderr - +2025-05-10 21:50:53 - INFO - stdout - {'loss': 0.8885, 'grad_norm': 0.63621586561203, 'learning_rate': 1.8628328889339403e-05, 'epoch': 0.58} +2025-05-10 21:50:53 - ERROR - stderr - 19%|█▉ | 725/3741 [4:25:00<17:30:23, 20.90s/it] +2025-05-10 21:51:12 - ERROR - stderr - 19%|█▉ | 726/3741 [4:25:19<17:06:05, 20.42s/it] +2025-05-10 21:51:13 - ERROR - stderr - +2025-05-10 21:51:13 - ERROR - stderr - +2025-05-10 21:51:13 - INFO - stdout - {'loss': 0.9344, 'grad_norm': 0.629024088382721, 'learning_rate': 1.8623948473643383e-05, 'epoch': 0.58} +2025-05-10 21:51:13 - ERROR - stderr - 19%|█▉ | 726/3741 [4:25:19<17:06:05, 20.42s/it] +2025-05-10 21:51:35 - ERROR - stderr - 19%|█▉ | 727/3741 [4:25:42<17:41:32, 21.13s/it] +2025-05-10 21:51:35 - ERROR - stderr - +2025-05-10 21:51:35 - ERROR - stderr - +2025-05-10 21:51:35 - INFO - stdout - {'loss': 0.9756, 'grad_norm': 0.6625981330871582, 'learning_rate': 1.86195615914171e-05, 'epoch': 0.58} +2025-05-10 21:51:35 - ERROR - stderr - 19%|█▉ | 727/3741 [4:25:42<17:41:32, 21.13s/it] +2025-05-10 21:51:55 - ERROR - stderr - 19%|█▉ | 728/3741 [4:26:01<17:16:35, 20.64s/it] +2025-05-10 21:51:55 - ERROR - stderr - +2025-05-10 21:51:55 - ERROR - stderr - +2025-05-10 21:51:55 - INFO - stdout - {'loss': 0.9895, 'grad_norm': 0.6435332894325256, 'learning_rate': 1.8615168245949982e-05, 'epoch': 0.58} +2025-05-10 21:51:55 - ERROR - stderr - 19%|█▉ | 728/3741 [4:26:01<17:16:35, 20.64s/it] +2025-05-10 21:52:19 - ERROR - stderr - 19%|█▉ | 729/3741 [4:26:25<18:03:24, 21.58s/it] +2025-05-10 21:52:19 - ERROR - stderr - +2025-05-10 21:52:19 - ERROR - stderr - +2025-05-10 21:52:19 - INFO - stdout - {'loss': 0.9327, 'grad_norm': 0.6450731158256531, 'learning_rate': 1.8610768440536317e-05, 'epoch': 0.58} +2025-05-10 21:52:19 - ERROR - stderr - 19%|█▉ | 729/3741 [4:26:25<18:03:24, 21.58s/it] +2025-05-10 21:52:38 - ERROR - stderr - 20%|█▉ | 730/3741 [4:26:45<17:34:27, 21.01s/it] +2025-05-10 21:52:38 - ERROR - stderr - +2025-05-10 21:52:38 - ERROR - stderr - +2025-05-10 21:52:38 - INFO - stdout - {'loss': 0.961, 'grad_norm': 0.6825403571128845, 'learning_rate': 1.8606362178475227e-05, 'epoch': 0.59} +2025-05-10 21:52:38 - ERROR - stderr - 20%|█▉ | 730/3741 [4:26:45<17:34:27, 21.01s/it] +2025-05-10 21:53:02 - ERROR - stderr - 20%|█▉ | 731/3741 [4:27:08<18:15:18, 21.83s/it] +2025-05-10 21:53:02 - ERROR - stderr - +2025-05-10 21:53:02 - ERROR - stderr - +2025-05-10 21:53:02 - INFO - stdout - {'loss': 0.9043, 'grad_norm': 0.6117799878120422, 'learning_rate': 1.860194946307067e-05, 'epoch': 0.59} +2025-05-10 21:53:02 - ERROR - stderr - 20%|█▉ | 731/3741 [4:27:08<18:15:18, 21.83s/it] +2025-05-10 21:53:21 - ERROR - stderr - 20%|█▉ | 732/3741 [4:27:28<17:38:42, 21.11s/it] +2025-05-10 21:53:21 - ERROR - stderr - +2025-05-10 21:53:21 - ERROR - stderr - +2025-05-10 21:53:21 - INFO - stdout - {'loss': 0.993, 'grad_norm': 0.6143025159835815, 'learning_rate': 1.859753029763146e-05, 'epoch': 0.59} +2025-05-10 21:53:21 - ERROR - stderr - 20%|█▉ | 732/3741 [4:27:28<17:38:42, 21.11s/it] +2025-05-10 21:53:41 - ERROR - stderr - 20%|█▉ | 733/3741 [4:27:47<17:17:38, 20.70s/it] +2025-05-10 21:53:41 - ERROR - stderr - +2025-05-10 21:53:41 - ERROR - stderr - +2025-05-10 21:53:41 - INFO - stdout - {'loss': 0.9069, 'grad_norm': 0.5972070693969727, 'learning_rate': 1.859310468547123e-05, 'epoch': 0.59} +2025-05-10 21:53:41 - ERROR - stderr - 20%|█▉ | 733/3741 [4:27:47<17:17:38, 20.70s/it] +2025-05-10 21:54:04 - ERROR - stderr - 20%|█▉ | 734/3741 [4:28:11<17:53:56, 21.43s/it] +2025-05-10 21:54:04 - ERROR - stderr - +2025-05-10 21:54:04 - ERROR - stderr - +2025-05-10 21:54:04 - INFO - stdout - {'loss': 0.9822, 'grad_norm': 0.6459053158760071, 'learning_rate': 1.8588672629908462e-05, 'epoch': 0.59} +2025-05-10 21:54:04 - ERROR - stderr - 20%|█▉ | 734/3741 [4:28:11<17:53:56, 21.43s/it] +2025-05-10 21:54:24 - ERROR - stderr - 20%|█▉ | 735/3741 [4:28:30<17:21:03, 20.78s/it] +2025-05-10 21:54:24 - ERROR - stderr - +2025-05-10 21:54:24 - ERROR - stderr - +2025-05-10 21:54:24 - INFO - stdout - {'loss': 0.9833, 'grad_norm': 0.674164891242981, 'learning_rate': 1.8584234134266456e-05, 'epoch': 0.59} +2025-05-10 21:54:24 - ERROR - stderr - 20%|█▉ | 735/3741 [4:28:30<17:21:03, 20.78s/it] +2025-05-10 21:54:47 - ERROR - stderr - 20%|█▉ | 736/3741 [4:28:54<18:08:02, 21.72s/it] +2025-05-10 21:54:48 - ERROR - stderr - +2025-05-10 21:54:48 - ERROR - stderr - +2025-05-10 21:54:48 - INFO - stdout - {'loss': 0.9851, 'grad_norm': 0.6549596190452576, 'learning_rate': 1.857978920187335e-05, 'epoch': 0.59} +2025-05-10 21:54:48 - ERROR - stderr - 20%|█▉ | 736/3741 [4:28:54<18:08:02, 21.72s/it] +2025-05-10 21:55:07 - ERROR - stderr - 20%|█▉ | 737/3741 [4:29:13<17:32:54, 21.03s/it] +2025-05-10 21:55:07 - ERROR - stderr - +2025-05-10 21:55:07 - ERROR - stderr - +2025-05-10 21:55:07 - INFO - stdout - {'loss': 0.9541, 'grad_norm': 0.621340811252594, 'learning_rate': 1.85753378360621e-05, 'epoch': 0.59} +2025-05-10 21:55:07 - ERROR - stderr - 20%|█▉ | 737/3741 [4:29:13<17:32:54, 21.03s/it] +2025-05-10 21:55:31 - ERROR - stderr - 20%|█▉ | 738/3741 [4:29:37<18:15:00, 21.88s/it] +2025-05-10 21:55:31 - ERROR - stderr - +2025-05-10 21:55:31 - ERROR - stderr - +2025-05-10 21:55:31 - INFO - stdout - {'loss': 0.9206, 'grad_norm': 0.652487576007843, 'learning_rate': 1.8570880040170504e-05, 'epoch': 0.59} +2025-05-10 21:55:31 - ERROR - stderr - 20%|█▉ | 738/3741 [4:29:37<18:15:00, 21.88s/it] +2025-05-10 21:55:51 - ERROR - stderr - 20%|█▉ | 739/3741 [4:29:57<17:43:29, 21.26s/it] +2025-05-10 21:55:51 - ERROR - stderr - +2025-05-10 21:55:51 - ERROR - stderr - +2025-05-10 21:55:51 - INFO - stdout - {'loss': 0.9676, 'grad_norm': 0.6780267953872681, 'learning_rate': 1.8566415817541157e-05, 'epoch': 0.59} +2025-05-10 21:55:51 - ERROR - stderr - 20%|█▉ | 739/3741 [4:29:57<17:43:29, 21.26s/it] +2025-05-10 21:56:10 - ERROR - stderr - 20%|█▉ | 740/3741 [4:30:16<17:16:00, 20.71s/it] +2025-05-10 21:56:10 - ERROR - stderr - +2025-05-10 21:56:10 - ERROR - stderr - +2025-05-10 21:56:10 - INFO - stdout - {'loss': 0.9223, 'grad_norm': 0.6120235323905945, 'learning_rate': 1.8561945171521498e-05, 'epoch': 0.59} +2025-05-10 21:56:10 - ERROR - stderr - 20%|█▉ | 740/3741 [4:30:16<17:16:00, 20.71s/it] +2025-05-10 21:56:31 - ERROR - stderr - 20%|█▉ | 741/3741 [4:30:37<17:20:40, 20.81s/it] +2025-05-10 21:56:31 - ERROR - stderr - +2025-05-10 21:56:31 - ERROR - stderr - +2025-05-10 21:56:31 - INFO - stdout - {'loss': 0.9164, 'grad_norm': 0.6822912096977234, 'learning_rate': 1.8557468105463753e-05, 'epoch': 0.59} +2025-05-10 21:56:31 - ERROR - stderr - 20%|█▉ | 741/3741 [4:30:37<17:20:40, 20.81s/it] +2025-05-10 21:56:51 - ERROR - stderr - 20%|█▉ | 742/3741 [4:30:57<17:00:38, 20.42s/it] +2025-05-10 21:56:51 - ERROR - stderr - +2025-05-10 21:56:51 - ERROR - stderr - +2025-05-10 21:56:51 - INFO - stdout - {'loss': 0.9028, 'grad_norm': 0.6549542546272278, 'learning_rate': 1.855298462272499e-05, 'epoch': 0.6} +2025-05-10 21:56:51 - ERROR - stderr - 20%|█▉ | 742/3741 [4:30:57<17:00:38, 20.42s/it] +2025-05-10 21:57:14 - ERROR - stderr - 20%|█▉ | 743/3741 [4:31:20<17:43:04, 21.28s/it] +2025-05-10 21:57:14 - ERROR - stderr - +2025-05-10 21:57:14 - ERROR - stderr - +2025-05-10 21:57:14 - INFO - stdout - {'loss': 0.9741, 'grad_norm': 0.6103249788284302, 'learning_rate': 1.8548494726667076e-05, 'epoch': 0.6} +2025-05-10 21:57:14 - ERROR - stderr - 20%|█▉ | 743/3741 [4:31:20<17:43:04, 21.28s/it] +2025-05-10 21:57:33 - ERROR - stderr - 20%|█▉ | 744/3741 [4:31:39<17:14:01, 20.70s/it] +2025-05-10 21:57:33 - ERROR - stderr - +2025-05-10 21:57:33 - ERROR - stderr - +2025-05-10 21:57:33 - INFO - stdout - {'loss': 0.9629, 'grad_norm': 0.6277962923049927, 'learning_rate': 1.8543998420656686e-05, 'epoch': 0.6} +2025-05-10 21:57:33 - ERROR - stderr - 20%|█▉ | 744/3741 [4:31:40<17:14:01, 20.70s/it] +2025-05-10 21:57:57 - ERROR - stderr - 20%|█▉ | 745/3741 [4:32:04<18:05:08, 21.73s/it] +2025-05-10 21:57:57 - ERROR - stderr - +2025-05-10 21:57:57 - ERROR - stderr - +2025-05-10 21:57:57 - INFO - stdout - {'loss': 1.0021, 'grad_norm': 0.6683188676834106, 'learning_rate': 1.8539495708065304e-05, 'epoch': 0.6} +2025-05-10 21:57:57 - ERROR - stderr - 20%|█▉ | 745/3741 [4:32:04<18:05:08, 21.73s/it] +2025-05-10 21:58:17 - ERROR - stderr - 20%|█▉ | 746/3741 [4:32:23<17:36:37, 21.17s/it] +2025-05-10 21:58:17 - ERROR - stderr - +2025-05-10 21:58:17 - ERROR - stderr - +2025-05-10 21:58:17 - INFO - stdout - {'loss': 0.9854, 'grad_norm': 0.621095597743988, 'learning_rate': 1.8534986592269218e-05, 'epoch': 0.6} +2025-05-10 21:58:17 - ERROR - stderr - 20%|█▉ | 746/3741 [4:32:24<17:36:37, 21.17s/it] +2025-05-10 21:58:38 - ERROR - stderr - 20%|█▉ | 747/3741 [4:32:44<17:24:43, 20.94s/it] +2025-05-10 21:58:38 - ERROR - stderr - +2025-05-10 21:58:38 - ERROR - stderr - +2025-05-10 21:58:38 - INFO - stdout - {'loss': 0.966, 'grad_norm': 0.6299651861190796, 'learning_rate': 1.853047107664951e-05, 'epoch': 0.6} +2025-05-10 21:58:38 - ERROR - stderr - 20%|█▉ | 747/3741 [4:32:44<17:24:43, 20.94s/it] +2025-05-10 21:58:57 - ERROR - stderr - 20%|█▉ | 748/3741 [4:33:03<17:04:01, 20.53s/it] +2025-05-10 21:58:57 - ERROR - stderr - +2025-05-10 21:58:57 - ERROR - stderr - +2025-05-10 21:58:57 - INFO - stdout - {'loss': 0.9201, 'grad_norm': 0.7200894355773926, 'learning_rate': 1.852594916459208e-05, 'epoch': 0.6} +2025-05-10 21:58:57 - ERROR - stderr - 20%|█▉ | 748/3741 [4:33:03<17:04:01, 20.53s/it] +2025-05-10 21:59:17 - ERROR - stderr - 20%|██ | 749/3741 [4:33:23<16:49:21, 20.24s/it] +2025-05-10 21:59:17 - ERROR - stderr - +2025-05-10 21:59:17 - ERROR - stderr - +2025-05-10 21:59:17 - INFO - stdout - {'loss': 1.0063, 'grad_norm': 0.6269078850746155, 'learning_rate': 1.85214208594876e-05, 'epoch': 0.6} +2025-05-10 21:59:17 - ERROR - stderr - 20%|██ | 749/3741 [4:33:23<16:49:21, 20.24s/it] +2025-05-10 21:59:39 - ERROR - stderr - 20%|██ | 750/3741 [4:33:45<17:20:08, 20.87s/it] +2025-05-10 21:59:39 - ERROR - stderr - +2025-05-10 21:59:39 - ERROR - stderr - +2025-05-10 21:59:39 - INFO - stdout - {'loss': 0.9167, 'grad_norm': 0.5880782008171082, 'learning_rate': 1.8516886164731554e-05, 'epoch': 0.6} +2025-05-10 21:59:39 - ERROR - stderr - 20%|██ | 750/3741 [4:33:45<17:20:08, 20.87s/it] +2025-05-10 21:59:59 - ERROR - stderr - 20%|██ | 751/3741 [4:34:05<17:03:27, 20.54s/it] +2025-05-10 21:59:59 - ERROR - stderr - +2025-05-10 21:59:59 - ERROR - stderr - +2025-05-10 21:59:59 - INFO - stdout - {'loss': 0.9314, 'grad_norm': 0.6221625208854675, 'learning_rate': 1.851234508372421e-05, 'epoch': 0.6} +2025-05-10 21:59:59 - ERROR - stderr - 20%|██ | 751/3741 [4:34:05<17:03:27, 20.54s/it] +2025-05-10 22:00:23 - ERROR - stderr - 20%|██ | 752/3741 [4:34:30<18:01:17, 21.71s/it] +2025-05-10 22:00:23 - ERROR - stderr - +2025-05-10 22:00:23 - ERROR - stderr - +2025-05-10 22:00:23 - INFO - stdout - {'loss': 0.9383, 'grad_norm': 0.6242570281028748, 'learning_rate': 1.850779761987062e-05, 'epoch': 0.6} +2025-05-10 22:00:23 - ERROR - stderr - 20%|██ | 752/3741 [4:34:30<18:01:17, 21.71s/it] +2025-05-10 22:00:43 - ERROR - stderr - 20%|██ | 753/3741 [4:34:49<17:31:26, 21.11s/it] +2025-05-10 22:00:43 - ERROR - stderr - +2025-05-10 22:00:43 - ERROR - stderr - +2025-05-10 22:00:43 - INFO - stdout - {'loss': 0.9046, 'grad_norm': 0.6036713719367981, 'learning_rate': 1.8503243776580637e-05, 'epoch': 0.6} +2025-05-10 22:00:43 - ERROR - stderr - 20%|██ | 753/3741 [4:34:49<17:31:26, 21.11s/it] +2025-05-10 22:01:06 - ERROR - stderr - 20%|██ | 754/3741 [4:35:12<17:59:23, 21.68s/it] +2025-05-10 22:01:06 - ERROR - stderr - +2025-05-10 22:01:06 - ERROR - stderr - +2025-05-10 22:01:06 - INFO - stdout - {'loss': 0.9427, 'grad_norm': 0.6600368022918701, 'learning_rate': 1.8498683557268878e-05, 'epoch': 0.6} +2025-05-10 22:01:06 - ERROR - stderr - 20%|██ | 754/3741 [4:35:12<17:59:23, 21.68s/it] +2025-05-10 22:01:26 - ERROR - stderr - 20%|██ | 755/3741 [4:35:32<17:29:13, 21.08s/it] +2025-05-10 22:01:26 - ERROR - stderr - +2025-05-10 22:01:26 - ERROR - stderr - +2025-05-10 22:01:26 - INFO - stdout - {'loss': 0.9301, 'grad_norm': 0.6118487119674683, 'learning_rate': 1.8494116965354756e-05, 'epoch': 0.61} +2025-05-10 22:01:26 - ERROR - stderr - 20%|██ | 755/3741 [4:35:32<17:29:13, 21.08s/it] +2025-05-10 22:01:49 - ERROR - stderr - 20%|██ | 756/3741 [4:35:55<17:57:26, 21.66s/it] +2025-05-10 22:01:49 - ERROR - stderr - +2025-05-10 22:01:49 - ERROR - stderr - +2025-05-10 22:01:49 - INFO - stdout - {'loss': 0.9867, 'grad_norm': 0.6600939035415649, 'learning_rate': 1.8489544004262456e-05, 'epoch': 0.61} +2025-05-10 22:01:49 - ERROR - stderr - 20%|██ | 756/3741 [4:35:55<17:57:26, 21.66s/it] +2025-05-10 22:02:08 - ERROR - stderr - 20%|██ | 757/3741 [4:36:14<17:23:41, 20.99s/it] +2025-05-10 22:02:08 - ERROR - stderr - +2025-05-10 22:02:08 - ERROR - stderr - +2025-05-10 22:02:08 - INFO - stdout - {'loss': 0.904, 'grad_norm': 0.6410656571388245, 'learning_rate': 1.8484964677420937e-05, 'epoch': 0.61} +2025-05-10 22:02:08 - ERROR - stderr - 20%|██ | 757/3741 [4:36:14<17:23:41, 20.99s/it] +2025-05-10 22:02:28 - ERROR - stderr - 20%|██ | 758/3741 [4:36:34<17:07:09, 20.66s/it] +2025-05-10 22:02:28 - ERROR - stderr - +2025-05-10 22:02:28 - ERROR - stderr - +2025-05-10 22:02:28 - INFO - stdout - {'loss': 0.9244, 'grad_norm': 0.6048609614372253, 'learning_rate': 1.848037898826394e-05, 'epoch': 0.61} +2025-05-10 22:02:28 - ERROR - stderr - 20%|██ | 758/3741 [4:36:34<17:07:09, 20.66s/it] +2025-05-10 22:02:49 - ERROR - stderr - 20%|██ | 759/3741 [4:36:56<17:19:09, 20.91s/it] +2025-05-10 22:02:49 - ERROR - stderr - +2025-05-10 22:02:49 - ERROR - stderr - +2025-05-10 22:02:49 - INFO - stdout - {'loss': 0.9042, 'grad_norm': 0.600308895111084, 'learning_rate': 1.8475786940229965e-05, 'epoch': 0.61} +2025-05-10 22:02:49 - ERROR - stderr - 20%|██ | 759/3741 [4:36:56<17:19:09, 20.91s/it] +2025-05-10 22:03:09 - ERROR - stderr - 20%|██ | 760/3741 [4:37:16<17:02:42, 20.58s/it] +2025-05-10 22:03:09 - ERROR - stderr - +2025-05-10 22:03:09 - ERROR - stderr - +2025-05-10 22:03:09 - INFO - stdout - {'loss': 1.0067, 'grad_norm': 0.6293653249740601, 'learning_rate': 1.847118853676229e-05, 'epoch': 0.61} +2025-05-10 22:03:09 - ERROR - stderr - 20%|██ | 760/3741 [4:37:16<17:02:42, 20.58s/it] +2025-05-10 22:03:34 - ERROR - stderr - 20%|██ | 761/3741 [4:37:40<18:02:45, 21.80s/it] +2025-05-10 22:03:34 - ERROR - stderr - +2025-05-10 22:03:34 - ERROR - stderr - +2025-05-10 22:03:34 - INFO - stdout - {'loss': 0.9437, 'grad_norm': 0.6423448324203491, 'learning_rate': 1.8466583781308954e-05, 'epoch': 0.61} +2025-05-10 22:03:34 - ERROR - stderr - 20%|██ | 761/3741 [4:37:40<18:02:45, 21.80s/it] +2025-05-10 22:03:53 - ERROR - stderr - 20%|██ | 762/3741 [4:38:00<17:28:06, 21.11s/it] +2025-05-10 22:03:53 - ERROR - stderr - +2025-05-10 22:03:53 - ERROR - stderr - +2025-05-10 22:03:53 - INFO - stdout - {'loss': 0.8932, 'grad_norm': 0.591410756111145, 'learning_rate': 1.846197267732276e-05, 'epoch': 0.61} +2025-05-10 22:03:53 - ERROR - stderr - 20%|██ | 762/3741 [4:38:00<17:28:06, 21.11s/it] +2025-05-10 22:04:18 - ERROR - stderr - 20%|██ | 763/3741 [4:38:24<18:15:56, 22.08s/it] +2025-05-10 22:04:18 - ERROR - stderr - +2025-05-10 22:04:18 - ERROR - stderr - +2025-05-10 22:04:18 - INFO - stdout - {'loss': 0.8843, 'grad_norm': 0.602726936340332, 'learning_rate': 1.845735522826127e-05, 'epoch': 0.61} +2025-05-10 22:04:18 - ERROR - stderr - 20%|██ | 763/3741 [4:38:24<18:15:56, 22.08s/it] +2025-05-10 22:04:37 - ERROR - stderr - 20%|██ | 764/3741 [4:38:44<17:38:14, 21.33s/it] +2025-05-10 22:04:37 - ERROR - stderr - +2025-05-10 22:04:37 - ERROR - stderr - +2025-05-10 22:04:37 - INFO - stdout - {'loss': 0.9544, 'grad_norm': 0.6235020756721497, 'learning_rate': 1.84527314375868e-05, 'epoch': 0.61} +2025-05-10 22:04:37 - ERROR - stderr - 20%|██ | 764/3741 [4:38:44<17:38:14, 21.33s/it] +2025-05-10 22:05:01 - ERROR - stderr - 20%|██ | 765/3741 [4:39:07<18:08:08, 21.94s/it] +2025-05-10 22:05:01 - ERROR - stderr - +2025-05-10 22:05:01 - ERROR - stderr - +2025-05-10 22:05:01 - INFO - stdout - {'loss': 0.8938, 'grad_norm': 0.6325739622116089, 'learning_rate': 1.8448101308766433e-05, 'epoch': 0.61} +2025-05-10 22:05:01 - ERROR - stderr - 20%|██ | 765/3741 [4:39:07<18:08:08, 21.94s/it] +2025-05-10 22:05:20 - ERROR - stderr - 20%|██ | 766/3741 [4:39:26<17:27:42, 21.13s/it] +2025-05-10 22:05:20 - ERROR - stderr - +2025-05-10 22:05:20 - ERROR - stderr - +2025-05-10 22:05:20 - INFO - stdout - {'loss': 0.9345, 'grad_norm': 0.6697767972946167, 'learning_rate': 1.8443464845271995e-05, 'epoch': 0.61} +2025-05-10 22:05:20 - ERROR - stderr - 20%|██ | 766/3741 [4:39:26<17:27:42, 21.13s/it] +2025-05-10 22:05:40 - ERROR - stderr - 21%|██ | 767/3741 [4:39:46<17:10:49, 20.80s/it] +2025-05-10 22:05:40 - ERROR - stderr - +2025-05-10 22:05:40 - ERROR - stderr - +2025-05-10 22:05:40 - INFO - stdout - {'loss': 0.9425, 'grad_norm': 0.6331246495246887, 'learning_rate': 1.843882205058006e-05, 'epoch': 0.62} +2025-05-10 22:05:40 - ERROR - stderr - 21%|██ | 767/3741 [4:39:46<17:10:49, 20.80s/it] +2025-05-10 22:06:01 - ERROR - stderr - 21%|██ | 768/3741 [4:40:07<17:07:08, 20.73s/it] +2025-05-10 22:06:01 - ERROR - stderr - +2025-05-10 22:06:01 - ERROR - stderr - +2025-05-10 22:06:01 - INFO - stdout - {'loss': 0.9709, 'grad_norm': 0.7046418190002441, 'learning_rate': 1.8434172928171962e-05, 'epoch': 0.62} +2025-05-10 22:06:01 - ERROR - stderr - 21%|██ | 768/3741 [4:40:07<17:07:08, 20.73s/it] +2025-05-10 22:06:20 - ERROR - stderr - 21%|██ | 769/3741 [4:40:27<16:53:38, 20.46s/it] +2025-05-10 22:06:20 - ERROR - stderr - +2025-05-10 22:06:20 - ERROR - stderr - +2025-05-10 22:06:20 - INFO - stdout - {'loss': 0.9588, 'grad_norm': 0.7394378185272217, 'learning_rate': 1.8429517481533762e-05, 'epoch': 0.62} +2025-05-10 22:06:20 - ERROR - stderr - 21%|██ | 769/3741 [4:40:27<16:53:38, 20.46s/it] +2025-05-10 22:06:44 - ERROR - stderr - 21%|██ | 770/3741 [4:40:51<17:43:51, 21.48s/it] +2025-05-10 22:06:44 - ERROR - stderr - +2025-05-10 22:06:44 - ERROR - stderr - +2025-05-10 22:06:44 - INFO - stdout - {'loss': 0.9141, 'grad_norm': 0.6277191638946533, 'learning_rate': 1.8424855714156277e-05, 'epoch': 0.62} +2025-05-10 22:06:44 - ERROR - stderr - 21%|██ | 770/3741 [4:40:51<17:43:51, 21.48s/it] +2025-05-10 22:07:04 - ERROR - stderr - 21%|██ | 771/3741 [4:41:11<17:21:00, 21.03s/it] +2025-05-10 22:07:04 - ERROR - stderr - +2025-05-10 22:07:04 - ERROR - stderr - +2025-05-10 22:07:04 - INFO - stdout - {'loss': 0.9488, 'grad_norm': 0.6583722233772278, 'learning_rate': 1.842018762953506e-05, 'epoch': 0.62} +2025-05-10 22:07:04 - ERROR - stderr - 21%|██ | 771/3741 [4:41:11<17:21:00, 21.03s/it] +2025-05-10 22:07:28 - ERROR - stderr - 21%|██ | 772/3741 [4:41:34<17:59:54, 21.82s/it] +2025-05-10 22:07:28 - ERROR - stderr - +2025-05-10 22:07:28 - ERROR - stderr - +2025-05-10 22:07:28 - INFO - stdout - {'loss': 0.9369, 'grad_norm': 0.6868898272514343, 'learning_rate': 1.8415513231170398e-05, 'epoch': 0.62} +2025-05-10 22:07:28 - ERROR - stderr - 21%|██ | 772/3741 [4:41:34<17:59:54, 21.82s/it] +2025-05-10 22:07:48 - ERROR - stderr - 21%|██ | 773/3741 [4:41:54<17:26:31, 21.16s/it] +2025-05-10 22:07:48 - ERROR - stderr - +2025-05-10 22:07:48 - ERROR - stderr - +2025-05-10 22:07:48 - INFO - stdout - {'loss': 0.9142, 'grad_norm': 0.6717788577079773, 'learning_rate': 1.8410832522567318e-05, 'epoch': 0.62} +2025-05-10 22:07:48 - ERROR - stderr - 21%|██ | 773/3741 [4:41:54<17:26:31, 21.16s/it] +2025-05-10 22:08:10 - ERROR - stderr - 21%|██ | 774/3741 [4:42:16<17:40:01, 21.44s/it] +2025-05-10 22:08:10 - ERROR - stderr - +2025-05-10 22:08:10 - ERROR - stderr - +2025-05-10 22:08:10 - INFO - stdout - {'loss': 0.8938, 'grad_norm': 0.5902653932571411, 'learning_rate': 1.8406145507235566e-05, 'epoch': 0.62} +2025-05-10 22:08:10 - ERROR - stderr - 21%|██ | 774/3741 [4:42:16<17:40:01, 21.44s/it] +2025-05-10 22:08:29 - ERROR - stderr - 21%|██ | 775/3741 [4:42:35<17:10:58, 20.86s/it] +2025-05-10 22:08:29 - ERROR - stderr - +2025-05-10 22:08:29 - ERROR - stderr - +2025-05-10 22:08:29 - INFO - stdout - {'loss': 0.9601, 'grad_norm': 0.644224226474762, 'learning_rate': 1.8401452188689635e-05, 'epoch': 0.62} +2025-05-10 22:08:29 - ERROR - stderr - 21%|██ | 775/3741 [4:42:35<17:10:58, 20.86s/it] +2025-05-10 22:08:49 - ERROR - stderr - 21%|██ | 776/3741 [4:42:55<16:53:58, 20.52s/it] +2025-05-10 22:08:49 - ERROR - stderr - +2025-05-10 22:08:49 - ERROR - stderr - +2025-05-10 22:08:49 - INFO - stdout - {'loss': 0.9192, 'grad_norm': 0.7199499607086182, 'learning_rate': 1.839675257044873e-05, 'epoch': 0.62} +2025-05-10 22:08:49 - ERROR - stderr - 21%|██ | 776/3741 [4:42:55<16:53:58, 20.52s/it] +2025-05-10 22:09:10 - ERROR - stderr - 21%|██ | 777/3741 [4:43:16<17:04:15, 20.73s/it] +2025-05-10 22:09:10 - ERROR - stderr - +2025-05-10 22:09:10 - ERROR - stderr - +2025-05-10 22:09:10 - INFO - stdout - {'loss': 0.9351, 'grad_norm': 0.7300478219985962, 'learning_rate': 1.8392046656036788e-05, 'epoch': 0.62} +2025-05-10 22:09:10 - ERROR - stderr - 21%|██ | 777/3741 [4:43:16<17:04:15, 20.73s/it] +2025-05-10 22:09:30 - ERROR - stderr - 21%|██ | 778/3741 [4:43:37<16:57:08, 20.60s/it] +2025-05-10 22:09:30 - ERROR - stderr - +2025-05-10 22:09:30 - ERROR - stderr - +2025-05-10 22:09:30 - INFO - stdout - {'loss': 0.9561, 'grad_norm': 0.7216119170188904, 'learning_rate': 1.8387334448982454e-05, 'epoch': 0.62} +2025-05-10 22:09:30 - ERROR - stderr - 21%|██ | 778/3741 [4:43:37<16:57:08, 20.60s/it] +2025-05-10 22:09:54 - ERROR - stderr - 21%|██ | 779/3741 [4:44:00<17:38:51, 21.45s/it] +2025-05-10 22:09:54 - ERROR - stderr - +2025-05-10 22:09:54 - ERROR - stderr - +2025-05-10 22:09:54 - INFO - stdout - {'loss': 0.9391, 'grad_norm': 0.6239175200462341, 'learning_rate': 1.8382615952819116e-05, 'epoch': 0.62} +2025-05-10 22:09:54 - ERROR - stderr - 21%|██ | 779/3741 [4:44:00<17:38:51, 21.45s/it] +2025-05-10 22:10:13 - ERROR - stderr - 21%|██ | 780/3741 [4:44:20<17:09:46, 20.87s/it] +2025-05-10 22:10:13 - ERROR - stderr - +2025-05-10 22:10:13 - ERROR - stderr - +2025-05-10 22:10:13 - INFO - stdout - {'loss': 0.998, 'grad_norm': 0.6322103142738342, 'learning_rate': 1.8377891171084858e-05, 'epoch': 0.63} +2025-05-10 22:10:13 - ERROR - stderr - 21%|██ | 780/3741 [4:44:20<17:09:46, 20.87s/it] +2025-05-10 22:10:38 - ERROR - stderr - 21%|██ | 781/3741 [4:44:44<18:00:30, 21.90s/it] +2025-05-10 22:10:38 - ERROR - stderr - +2025-05-10 22:10:38 - ERROR - stderr - +2025-05-10 22:10:38 - INFO - stdout - {'loss': 0.9308, 'grad_norm': 0.681839644908905, 'learning_rate': 1.8373160107322476e-05, 'epoch': 0.63} +2025-05-10 22:10:38 - ERROR - stderr - 21%|██ | 781/3741 [4:44:44<18:00:30, 21.90s/it] +2025-05-10 22:10:57 - ERROR - stderr - 21%|██ | 782/3741 [4:45:03<17:25:00, 21.19s/it] +2025-05-10 22:10:57 - ERROR - stderr - +2025-05-10 22:10:57 - ERROR - stderr - +2025-05-10 22:10:57 - INFO - stdout - {'loss': 0.9486, 'grad_norm': 0.6046080589294434, 'learning_rate': 1.8368422765079486e-05, 'epoch': 0.63} +2025-05-10 22:10:57 - ERROR - stderr - 21%|██ | 782/3741 [4:45:03<17:25:00, 21.19s/it] +2025-05-10 22:11:20 - ERROR - stderr - 21%|██ | 783/3741 [4:45:26<17:50:09, 21.71s/it] +2025-05-10 22:11:20 - ERROR - stderr - +2025-05-10 22:11:20 - ERROR - stderr - +2025-05-10 22:11:20 - INFO - stdout - {'loss': 0.907, 'grad_norm': 0.675331711769104, 'learning_rate': 1.8363679147908115e-05, 'epoch': 0.63} +2025-05-10 22:11:20 - ERROR - stderr - 21%|██ | 783/3741 [4:45:26<17:50:09, 21.71s/it] +2025-05-10 22:11:39 - ERROR - stderr - 21%|██ | 784/3741 [4:45:46<17:14:35, 20.99s/it] +2025-05-10 22:11:39 - ERROR - stderr - +2025-05-10 22:11:39 - ERROR - stderr - +2025-05-10 22:11:39 - INFO - stdout - {'loss': 0.9345, 'grad_norm': 0.6665434241294861, 'learning_rate': 1.835892925936528e-05, 'epoch': 0.63} +2025-05-10 22:11:39 - ERROR - stderr - 21%|██ | 784/3741 [4:45:46<17:14:35, 20.99s/it] +2025-05-10 22:11:59 - ERROR - stderr - 21%|██ | 785/3741 [4:46:05<16:52:38, 20.55s/it] +2025-05-10 22:11:59 - ERROR - stderr - +2025-05-10 22:11:59 - ERROR - stderr - +2025-05-10 22:11:59 - INFO - stdout - {'loss': 0.9132, 'grad_norm': 0.6315925717353821, 'learning_rate': 1.8354173103012614e-05, 'epoch': 0.63} +2025-05-10 22:11:59 - ERROR - stderr - 21%|██ | 785/3741 [4:46:05<16:52:38, 20.55s/it] +2025-05-10 22:12:19 - ERROR - stderr - 21%|██ | 786/3741 [4:46:25<16:47:43, 20.46s/it] +2025-05-10 22:12:19 - ERROR - stderr - +2025-05-10 22:12:19 - ERROR - stderr - +2025-05-10 22:12:19 - INFO - stdout - {'loss': 0.8736, 'grad_norm': 0.6950697302818298, 'learning_rate': 1.8349410682416442e-05, 'epoch': 0.63} +2025-05-10 22:12:19 - ERROR - stderr - 21%|██ | 786/3741 [4:46:25<16:47:43, 20.46s/it] +2025-05-10 22:12:39 - ERROR - stderr - 21%|██ | 787/3741 [4:46:45<16:38:59, 20.29s/it] +2025-05-10 22:12:39 - ERROR - stderr - +2025-05-10 22:12:39 - ERROR - stderr - +2025-05-10 22:12:39 - INFO - stdout - {'loss': 0.9271, 'grad_norm': 0.6428248286247253, 'learning_rate': 1.8344642001147793e-05, 'epoch': 0.63} +2025-05-10 22:12:39 - ERROR - stderr - 21%|██ | 787/3741 [4:46:45<16:38:59, 20.29s/it] +2025-05-10 22:13:03 - ERROR - stderr - 21%|██ | 788/3741 [4:47:09<17:27:16, 21.28s/it] +2025-05-10 22:13:03 - ERROR - stderr - +2025-05-10 22:13:03 - ERROR - stderr - +2025-05-10 22:13:03 - INFO - stdout - {'loss': 0.9271, 'grad_norm': 0.6300097107887268, 'learning_rate': 1.8339867062782384e-05, 'epoch': 0.63} +2025-05-10 22:13:03 - ERROR - stderr - 21%|██ | 788/3741 [4:47:09<17:27:16, 21.28s/it] +2025-05-10 22:13:22 - ERROR - stderr - 21%|██ | 789/3741 [4:47:28<16:58:47, 20.71s/it] +2025-05-10 22:13:22 - ERROR - stderr - +2025-05-10 22:13:22 - ERROR - stderr - +2025-05-10 22:13:22 - INFO - stdout - {'loss': 0.9489, 'grad_norm': 0.6257496476173401, 'learning_rate': 1.8335085870900627e-05, 'epoch': 0.63} +2025-05-10 22:13:22 - ERROR - stderr - 21%|██ | 789/3741 [4:47:28<16:58:47, 20.71s/it] +2025-05-10 22:13:46 - ERROR - stderr - 21%|██ | 790/3741 [4:47:52<17:45:30, 21.66s/it] +2025-05-10 22:13:46 - ERROR - stderr - +2025-05-10 22:13:46 - ERROR - stderr - +2025-05-10 22:13:46 - INFO - stdout - {'loss': 0.926, 'grad_norm': 0.5959362983703613, 'learning_rate': 1.8330298429087624e-05, 'epoch': 0.63} +2025-05-10 22:13:46 - ERROR - stderr - 21%|██ | 790/3741 [4:47:52<17:45:30, 21.66s/it] +2025-05-10 22:14:05 - ERROR - stderr - 21%|██ | 791/3741 [4:48:12<17:12:00, 20.99s/it] +2025-05-10 22:14:05 - ERROR - stderr - +2025-05-10 22:14:05 - ERROR - stderr - +2025-05-10 22:14:05 - INFO - stdout - {'loss': 0.948, 'grad_norm': 0.6299023032188416, 'learning_rate': 1.8325504740933157e-05, 'epoch': 0.63} +2025-05-10 22:14:05 - ERROR - stderr - 21%|██ | 791/3741 [4:48:12<17:12:00, 20.99s/it] +2025-05-10 22:14:28 - ERROR - stderr - 21%|██ | 792/3741 [4:48:34<17:30:03, 21.36s/it] +2025-05-10 22:14:28 - ERROR - stderr - +2025-05-10 22:14:28 - ERROR - stderr - +2025-05-10 22:14:28 - INFO - stdout - {'loss': 0.9001, 'grad_norm': 0.632050633430481, 'learning_rate': 1.8320704810031702e-05, 'epoch': 0.64} +2025-05-10 22:14:28 - ERROR - stderr - 21%|██ | 792/3741 [4:48:34<17:30:03, 21.36s/it] +2025-05-10 22:14:47 - ERROR - stderr - 21%|██ | 793/3741 [4:48:53<17:01:14, 20.79s/it] +2025-05-10 22:14:47 - ERROR - stderr - +2025-05-10 22:14:47 - ERROR - stderr - +2025-05-10 22:14:47 - INFO - stdout - {'loss': 0.8965, 'grad_norm': 0.635412335395813, 'learning_rate': 1.8315898639982404e-05, 'epoch': 0.64} +2025-05-10 22:14:47 - ERROR - stderr - 21%|██ | 793/3741 [4:48:53<17:01:14, 20.79s/it] +2025-05-10 22:15:06 - ERROR - stderr - 21%|██ | 794/3741 [4:49:13<16:39:11, 20.34s/it] +2025-05-10 22:15:06 - ERROR - stderr - +2025-05-10 22:15:06 - ERROR - stderr - +2025-05-10 22:15:06 - INFO - stdout - {'loss': 0.9294, 'grad_norm': 0.5949950218200684, 'learning_rate': 1.8311086234389104e-05, 'epoch': 0.64} +2025-05-10 22:15:06 - ERROR - stderr - 21%|██ | 794/3741 [4:49:13<16:39:11, 20.34s/it] +2025-05-10 22:15:27 - ERROR - stderr - 21%|██▏ | 795/3741 [4:49:34<16:48:57, 20.55s/it] +2025-05-10 22:15:27 - ERROR - stderr - +2025-05-10 22:15:27 - ERROR - stderr - +2025-05-10 22:15:27 - INFO - stdout - {'loss': 0.9333, 'grad_norm': 0.6535398364067078, 'learning_rate': 1.83062675968603e-05, 'epoch': 0.64} +2025-05-10 22:15:27 - ERROR - stderr - 21%|██▏ | 795/3741 [4:49:34<16:48:57, 20.55s/it] +2025-05-10 22:15:47 - ERROR - stderr - 21%|██▏ | 796/3741 [4:49:53<16:32:38, 20.22s/it] +2025-05-10 22:15:47 - ERROR - stderr - +2025-05-10 22:15:47 - ERROR - stderr - +2025-05-10 22:15:47 - INFO - stdout - {'loss': 0.91, 'grad_norm': 0.6044979095458984, 'learning_rate': 1.8301442731009168e-05, 'epoch': 0.64} +2025-05-10 22:15:47 - ERROR - stderr - 21%|██▏ | 796/3741 [4:49:53<16:32:38, 20.22s/it] +2025-05-10 22:16:10 - ERROR - stderr - 21%|██▏ | 797/3741 [4:50:16<17:17:28, 21.14s/it] +2025-05-10 22:16:10 - ERROR - stderr - +2025-05-10 22:16:10 - ERROR - stderr - +2025-05-10 22:16:10 - INFO - stdout - {'loss': 0.9109, 'grad_norm': 0.607458770275116, 'learning_rate': 1.8296611640453562e-05, 'epoch': 0.64} +2025-05-10 22:16:10 - ERROR - stderr - 21%|██▏ | 797/3741 [4:50:16<17:17:28, 21.14s/it] +2025-05-10 22:16:29 - ERROR - stderr - 21%|██▏ | 798/3741 [4:50:36<16:49:33, 20.58s/it] +2025-05-10 22:16:29 - ERROR - stderr - +2025-05-10 22:16:29 - ERROR - stderr - +2025-05-10 22:16:29 - INFO - stdout - {'loss': 0.9502, 'grad_norm': 0.6543724536895752, 'learning_rate': 1.8291774328816e-05, 'epoch': 0.64} +2025-05-10 22:16:29 - ERROR - stderr - 21%|██▏ | 798/3741 [4:50:36<16:49:33, 20.58s/it] +2025-05-10 22:16:53 - ERROR - stderr - 21%|██▏ | 799/3741 [4:51:00<17:40:52, 21.64s/it] +2025-05-10 22:16:53 - ERROR - stderr - +2025-05-10 22:16:53 - ERROR - stderr - +2025-05-10 22:16:53 - INFO - stdout - {'loss': 0.8956, 'grad_norm': 0.5994077920913696, 'learning_rate': 1.8286930799723658e-05, 'epoch': 0.64} +2025-05-10 22:16:53 - ERROR - stderr - 21%|██▏ | 799/3741 [4:51:00<17:40:52, 21.64s/it] +2025-05-10 22:17:13 - ERROR - stderr - 21%|██▏ | 800/3741 [4:51:19<17:09:47, 21.01s/it] +2025-05-10 22:17:13 - ERROR - stderr - +2025-05-10 22:17:13 - ERROR - stderr - +2025-05-10 22:17:13 - INFO - stdout - {'loss': 0.9113, 'grad_norm': 0.5721734166145325, 'learning_rate': 1.828208105680838e-05, 'epoch': 0.64} +2025-05-10 22:17:13 - ERROR - stderr - 21%|██▏ | 800/3741 [4:51:19<17:09:47, 21.01s/it] +2025-05-10 22:17:35 - ERROR - stderr - 21%|██▏ | 801/3741 [4:51:41<17:18:23, 21.19s/it] +2025-05-10 22:17:35 - ERROR - stderr - +2025-05-10 22:17:35 - ERROR - stderr - +2025-05-10 22:17:35 - INFO - stdout - {'loss': 0.9111, 'grad_norm': 0.611034631729126, 'learning_rate': 1.827722510370667e-05, 'epoch': 0.64} +2025-05-10 22:17:35 - ERROR - stderr - 21%|██▏ | 801/3741 [4:51:41<17:18:23, 21.19s/it] +2025-05-10 22:17:54 - ERROR - stderr - 21%|██▏ | 802/3741 [4:52:00<16:53:23, 20.69s/it] +2025-05-10 22:17:54 - ERROR - stderr - +2025-05-10 22:17:54 - ERROR - stderr - +2025-05-10 22:17:54 - INFO - stdout - {'loss': 0.9313, 'grad_norm': 0.6357942819595337, 'learning_rate': 1.8272362944059684e-05, 'epoch': 0.64} +2025-05-10 22:17:54 - ERROR - stderr - 21%|██▏ | 802/3741 [4:52:00<16:53:23, 20.69s/it] +2025-05-10 22:18:14 - ERROR - stderr - 21%|██▏ | 803/3741 [4:52:20<16:40:11, 20.43s/it] +2025-05-10 22:18:14 - ERROR - stderr - +2025-05-10 22:18:14 - ERROR - stderr - +2025-05-10 22:18:14 - INFO - stdout - {'loss': 0.9279, 'grad_norm': 0.6018952131271362, 'learning_rate': 1.8267494581513236e-05, 'epoch': 0.64} +2025-05-10 22:18:14 - ERROR - stderr - 21%|██▏ | 803/3741 [4:52:20<16:40:11, 20.43s/it] +2025-05-10 22:18:36 - ERROR - stderr - 21%|██▏ | 804/3741 [4:52:43<17:10:10, 21.05s/it] +2025-05-10 22:18:36 - ERROR - stderr - +2025-05-10 22:18:36 - ERROR - stderr - +2025-05-10 22:18:36 - INFO - stdout - {'loss': 0.9433, 'grad_norm': 0.5941787958145142, 'learning_rate': 1.8262620019717794e-05, 'epoch': 0.64} +2025-05-10 22:18:36 - ERROR - stderr - 21%|██▏ | 804/3741 [4:52:43<17:10:10, 21.05s/it] +2025-05-10 22:18:56 - ERROR - stderr - 22%|██▏ | 805/3741 [4:53:02<16:44:14, 20.52s/it] +2025-05-10 22:18:56 - ERROR - stderr - +2025-05-10 22:18:56 - ERROR - stderr - +2025-05-10 22:18:56 - INFO - stdout - {'loss': 0.9269, 'grad_norm': 0.601996123790741, 'learning_rate': 1.825773926232847e-05, 'epoch': 0.65} +2025-05-10 22:18:56 - ERROR - stderr - 22%|██▏ | 805/3741 [4:53:02<16:44:14, 20.52s/it] +2025-05-10 22:19:20 - ERROR - stderr - 22%|██▏ | 806/3741 [4:53:26<17:32:52, 21.52s/it] +2025-05-10 22:19:20 - ERROR - stderr - +2025-05-10 22:19:20 - ERROR - stderr - +2025-05-10 22:19:20 - INFO - stdout - {'loss': 0.9359, 'grad_norm': 0.6564491987228394, 'learning_rate': 1.8252852313005015e-05, 'epoch': 0.65} +2025-05-10 22:19:20 - ERROR - stderr - 22%|██▏ | 806/3741 [4:53:26<17:32:52, 21.52s/it] +2025-05-10 22:19:39 - ERROR - stderr - 22%|██▏ | 807/3741 [4:53:45<17:03:49, 20.94s/it] +2025-05-10 22:19:39 - ERROR - stderr - +2025-05-10 22:19:39 - ERROR - stderr - +2025-05-10 22:19:39 - INFO - stdout - {'loss': 0.9534, 'grad_norm': 0.6465602517127991, 'learning_rate': 1.8247959175411836e-05, 'epoch': 0.65} +2025-05-10 22:19:39 - ERROR - stderr - 22%|██▏ | 807/3741 [4:53:45<17:03:49, 20.94s/it] +2025-05-10 22:20:02 - ERROR - stderr - 22%|██▏ | 808/3741 [4:54:09<17:38:08, 21.65s/it] +2025-05-10 22:20:02 - ERROR - stderr - +2025-05-10 22:20:02 - ERROR - stderr - +2025-05-10 22:20:02 - INFO - stdout - {'loss': 0.939, 'grad_norm': 0.6189476251602173, 'learning_rate': 1.824305985321797e-05, 'epoch': 0.65} +2025-05-10 22:20:02 - ERROR - stderr - 22%|██▏ | 808/3741 [4:54:09<17:38:08, 21.65s/it] +2025-05-10 22:20:22 - ERROR - stderr - 22%|██▏ | 809/3741 [4:54:28<17:08:02, 21.04s/it] +2025-05-10 22:20:22 - ERROR - stderr - +2025-05-10 22:20:22 - ERROR - stderr - +2025-05-10 22:20:22 - INFO - stdout - {'loss': 0.9447, 'grad_norm': 0.5999793410301208, 'learning_rate': 1.8238154350097103e-05, 'epoch': 0.65} +2025-05-10 22:20:22 - ERROR - stderr - 22%|██▏ | 809/3741 [4:54:28<17:08:02, 21.04s/it] +2025-05-10 22:20:42 - ERROR - stderr - 22%|██▏ | 810/3741 [4:54:48<16:48:13, 20.64s/it] +2025-05-10 22:20:42 - ERROR - stderr - +2025-05-10 22:20:42 - ERROR - stderr - +2025-05-10 22:20:42 - INFO - stdout - {'loss': 0.917, 'grad_norm': 0.6540852785110474, 'learning_rate': 1.8233242669727544e-05, 'epoch': 0.65} +2025-05-10 22:20:42 - ERROR - stderr - 22%|██▏ | 810/3741 [4:54:48<16:48:13, 20.64s/it] +2025-05-10 22:21:01 - ERROR - stderr - 22%|██▏ | 811/3741 [4:55:07<16:29:33, 20.26s/it] +2025-05-10 22:21:01 - ERROR - stderr - +2025-05-10 22:21:01 - ERROR - stderr - +2025-05-10 22:21:01 - INFO - stdout - {'loss': 0.921, 'grad_norm': 0.6192000508308411, 'learning_rate': 1.8228324815792236e-05, 'epoch': 0.65} +2025-05-10 22:21:01 - ERROR - stderr - 22%|██▏ | 811/3741 [4:55:08<16:29:33, 20.26s/it] +2025-05-10 22:21:21 - ERROR - stderr - 22%|██▏ | 812/3741 [4:55:27<16:20:28, 20.08s/it] +2025-05-10 22:21:21 - ERROR - stderr - +2025-05-10 22:21:21 - ERROR - stderr - +2025-05-10 22:21:21 - INFO - stdout - {'loss': 0.9884, 'grad_norm': 0.6083493232727051, 'learning_rate': 1.8223400791978756e-05, 'epoch': 0.65} +2025-05-10 22:21:21 - ERROR - stderr - 22%|██▏ | 812/3741 [4:55:27<16:20:28, 20.08s/it] +2025-05-10 22:21:44 - ERROR - stderr - 22%|██▏ | 813/3741 [4:55:50<17:02:08, 20.95s/it] +2025-05-10 22:21:44 - ERROR - stderr - +2025-05-10 22:21:44 - ERROR - stderr - +2025-05-10 22:21:44 - INFO - stdout - {'loss': 0.9191, 'grad_norm': 0.6045847535133362, 'learning_rate': 1.8218470601979302e-05, 'epoch': 0.65} +2025-05-10 22:21:44 - ERROR - stderr - 22%|██▏ | 813/3741 [4:55:50<17:02:08, 20.95s/it] +2025-05-10 22:22:03 - ERROR - stderr - 22%|██▏ | 814/3741 [4:56:09<16:37:32, 20.45s/it] +2025-05-10 22:22:03 - ERROR - stderr - +2025-05-10 22:22:03 - ERROR - stderr - +2025-05-10 22:22:03 - INFO - stdout - {'loss': 0.9332, 'grad_norm': 0.5809303522109985, 'learning_rate': 1.8213534249490706e-05, 'epoch': 0.65} +2025-05-10 22:22:03 - ERROR - stderr - 22%|██▏ | 814/3741 [4:56:09<16:37:32, 20.45s/it] +2025-05-10 22:22:27 - ERROR - stderr - 22%|██▏ | 815/3741 [4:56:33<17:24:08, 21.41s/it] +2025-05-10 22:22:27 - ERROR - stderr - +2025-05-10 22:22:27 - ERROR - stderr - +2025-05-10 22:22:27 - INFO - stdout - {'loss': 0.9094, 'grad_norm': 0.5929029583930969, 'learning_rate': 1.8208591738214403e-05, 'epoch': 0.65} +2025-05-10 22:22:27 - ERROR - stderr - 22%|██▏ | 815/3741 [4:56:33<17:24:08, 21.41s/it] +2025-05-10 22:22:46 - ERROR - stderr - 22%|██▏ | 816/3741 [4:56:53<16:58:47, 20.90s/it] +2025-05-10 22:22:46 - ERROR - stderr - +2025-05-10 22:22:46 - ERROR - stderr - +2025-05-10 22:22:46 - INFO - stdout - {'loss': 0.9628, 'grad_norm': 0.6310725212097168, 'learning_rate': 1.8203643071856462e-05, 'epoch': 0.65} +2025-05-10 22:22:46 - ERROR - stderr - 22%|██▏ | 816/3741 [4:56:53<16:58:47, 20.90s/it] +2025-05-10 22:23:09 - ERROR - stderr - 22%|██▏ | 817/3741 [4:57:16<17:26:33, 21.48s/it] +2025-05-10 22:23:09 - ERROR - stderr - +2025-05-10 22:23:09 - ERROR - stderr - +2025-05-10 22:23:09 - INFO - stdout - {'loss': 0.9297, 'grad_norm': 0.664486825466156, 'learning_rate': 1.819868825412756e-05, 'epoch': 0.66} +2025-05-10 22:23:09 - ERROR - stderr - 22%|██▏ | 817/3741 [4:57:16<17:26:33, 21.48s/it] +2025-05-10 22:23:29 - ERROR - stderr - 22%|██▏ | 818/3741 [4:57:35<16:57:04, 20.88s/it] +2025-05-10 22:23:29 - ERROR - stderr - +2025-05-10 22:23:29 - ERROR - stderr - +2025-05-10 22:23:29 - INFO - stdout - {'loss': 0.9559, 'grad_norm': 0.6123178601264954, 'learning_rate': 1.8193727288742987e-05, 'epoch': 0.66} +2025-05-10 22:23:29 - ERROR - stderr - 22%|██▏ | 818/3741 [4:57:35<16:57:04, 20.88s/it] +2025-05-10 22:23:48 - ERROR - stderr - 22%|██▏ | 819/3741 [4:57:55<16:39:32, 20.52s/it] +2025-05-10 22:23:48 - ERROR - stderr - +2025-05-10 22:23:48 - ERROR - stderr - +2025-05-10 22:23:48 - INFO - stdout - {'loss': 0.921, 'grad_norm': 0.6100270748138428, 'learning_rate': 1.818876017942265e-05, 'epoch': 0.66} +2025-05-10 22:23:48 - ERROR - stderr - 22%|██▏ | 819/3741 [4:57:55<16:39:32, 20.52s/it] +2025-05-10 22:24:10 - ERROR - stderr - 22%|██▏ | 820/3741 [4:58:16<16:55:21, 20.86s/it] +2025-05-10 22:24:10 - ERROR - stderr - +2025-05-10 22:24:10 - ERROR - stderr - +2025-05-10 22:24:10 - INFO - stdout - {'loss': 0.9472, 'grad_norm': 0.6273778080940247, 'learning_rate': 1.818378692989105e-05, 'epoch': 0.66} +2025-05-10 22:24:10 - ERROR - stderr - 22%|██▏ | 820/3741 [4:58:16<16:55:21, 20.86s/it] +2025-05-10 22:24:29 - ERROR - stderr - 22%|██▏ | 821/3741 [4:58:36<16:32:17, 20.39s/it] +2025-05-10 22:24:29 - ERROR - stderr - +2025-05-10 22:24:29 - ERROR - stderr - +2025-05-10 22:24:29 - INFO - stdout - {'loss': 0.9388, 'grad_norm': 0.6654192805290222, 'learning_rate': 1.8178807543877303e-05, 'epoch': 0.66} +2025-05-10 22:24:29 - ERROR - stderr - 22%|██▏ | 821/3741 [4:58:36<16:32:17, 20.39s/it] +2025-05-10 22:24:53 - ERROR - stderr - 22%|██▏ | 822/3741 [4:59:00<17:24:36, 21.47s/it] +2025-05-10 22:24:53 - ERROR - stderr - +2025-05-10 22:24:53 - ERROR - stderr - +2025-05-10 22:24:53 - INFO - stdout - {'loss': 0.9477, 'grad_norm': 0.6100279688835144, 'learning_rate': 1.817382202511512e-05, 'epoch': 0.66} +2025-05-10 22:24:53 - ERROR - stderr - 22%|██▏ | 822/3741 [4:59:00<17:24:36, 21.47s/it] +2025-05-10 22:25:13 - ERROR - stderr - 22%|██▏ | 823/3741 [4:59:19<16:55:41, 20.88s/it] +2025-05-10 22:25:13 - ERROR - stderr - +2025-05-10 22:25:13 - ERROR - stderr - +2025-05-10 22:25:13 - INFO - stdout - {'loss': 0.9419, 'grad_norm': 0.6125680208206177, 'learning_rate': 1.816883037734281e-05, 'epoch': 0.66} +2025-05-10 22:25:13 - ERROR - stderr - 22%|██▏ | 823/3741 [4:59:19<16:55:41, 20.88s/it] +2025-05-10 22:25:35 - ERROR - stderr - 22%|██▏ | 824/3741 [4:59:42<17:16:22, 21.32s/it] +2025-05-10 22:25:35 - ERROR - stderr - +2025-05-10 22:25:35 - ERROR - stderr - +2025-05-10 22:25:35 - INFO - stdout - {'loss': 1.0237, 'grad_norm': 0.6193715333938599, 'learning_rate': 1.8163832604303284e-05, 'epoch': 0.66} +2025-05-10 22:25:35 - ERROR - stderr - 22%|██▏ | 824/3741 [4:59:42<17:16:22, 21.32s/it] +2025-05-10 22:25:55 - ERROR - stderr - 22%|██▏ | 825/3741 [5:00:01<16:50:13, 20.79s/it] +2025-05-10 22:25:55 - ERROR - stderr - +2025-05-10 22:25:55 - ERROR - stderr - +2025-05-10 22:25:55 - INFO - stdout - {'loss': 0.9228, 'grad_norm': 0.6256586313247681, 'learning_rate': 1.815882870974404e-05, 'epoch': 0.66} +2025-05-10 22:25:55 - ERROR - stderr - 22%|██▏ | 825/3741 [5:00:01<16:50:13, 20.79s/it] +2025-05-10 22:26:14 - ERROR - stderr - 22%|██▏ | 826/3741 [5:00:21<16:30:00, 20.38s/it] +2025-05-10 22:26:14 - ERROR - stderr - +2025-05-10 22:26:14 - ERROR - stderr - +2025-05-10 22:26:14 - INFO - stdout - {'loss': 0.9198, 'grad_norm': 0.6021474599838257, 'learning_rate': 1.8153818697417176e-05, 'epoch': 0.66} +2025-05-10 22:26:14 - ERROR - stderr - 22%|██▏ | 826/3741 [5:00:21<16:30:00, 20.38s/it] +2025-05-10 22:26:35 - ERROR - stderr - 22%|██▏ | 827/3741 [5:00:42<16:41:08, 20.61s/it] +2025-05-10 22:26:35 - ERROR - stderr - +2025-05-10 22:26:35 - ERROR - stderr - +2025-05-10 22:26:35 - INFO - stdout - {'loss': 0.8507, 'grad_norm': 0.5720776319503784, 'learning_rate': 1.814880257107936e-05, 'epoch': 0.66} +2025-05-10 22:26:35 - ERROR - stderr - 22%|██▏ | 827/3741 [5:00:42<16:41:08, 20.61s/it] +2025-05-10 22:26:55 - ERROR - stderr - 22%|██▏ | 828/3741 [5:01:01<16:23:28, 20.26s/it] +2025-05-10 22:26:55 - ERROR - stderr - +2025-05-10 22:26:55 - ERROR - stderr - +2025-05-10 22:26:55 - INFO - stdout - {'loss': 0.9298, 'grad_norm': 0.5865132808685303, 'learning_rate': 1.8143780334491863e-05, 'epoch': 0.66} +2025-05-10 22:26:55 - ERROR - stderr - 22%|██▏ | 828/3741 [5:01:01<16:23:28, 20.26s/it] +2025-05-10 22:27:18 - ERROR - stderr - 22%|██▏ | 829/3741 [5:01:24<17:05:34, 21.13s/it] +2025-05-10 22:27:18 - ERROR - stderr - +2025-05-10 22:27:18 - ERROR - stderr - +2025-05-10 22:27:18 - INFO - stdout - {'loss': 0.8927, 'grad_norm': 0.585963785648346, 'learning_rate': 1.8138751991420524e-05, 'epoch': 0.66} +2025-05-10 22:27:18 - ERROR - stderr - 22%|██▏ | 829/3741 [5:01:24<17:05:34, 21.13s/it] +2025-05-10 22:27:38 - ERROR - stderr - 22%|██▏ | 830/3741 [5:01:44<16:42:45, 20.67s/it] +2025-05-10 22:27:38 - ERROR - stderr - +2025-05-10 22:27:38 - ERROR - stderr - +2025-05-10 22:27:38 - INFO - stdout - {'loss': 0.972, 'grad_norm': 0.6248182058334351, 'learning_rate': 1.8133717545635764e-05, 'epoch': 0.67} +2025-05-10 22:27:38 - ERROR - stderr - 22%|██▏ | 830/3741 [5:01:44<16:42:45, 20.67s/it] +2025-05-10 22:28:01 - ERROR - stderr - 22%|██▏ | 831/3741 [5:02:07<17:19:04, 21.42s/it] +2025-05-10 22:28:01 - ERROR - stderr - +2025-05-10 22:28:01 - ERROR - stderr - +2025-05-10 22:28:01 - INFO - stdout - {'loss': 0.9437, 'grad_norm': 0.6154810190200806, 'learning_rate': 1.812867700091258e-05, 'epoch': 0.67} +2025-05-10 22:28:01 - ERROR - stderr - 22%|██▏ | 831/3741 [5:02:07<17:19:04, 21.42s/it] +2025-05-10 22:28:20 - ERROR - stderr - 22%|██▏ | 832/3741 [5:02:27<16:50:33, 20.84s/it] +2025-05-10 22:28:20 - ERROR - stderr - +2025-05-10 22:28:20 - ERROR - stderr - +2025-05-10 22:28:20 - INFO - stdout - {'loss': 0.8818, 'grad_norm': 0.603408932685852, 'learning_rate': 1.8123630361030557e-05, 'epoch': 0.67} +2025-05-10 22:28:20 - ERROR - stderr - 22%|██▏ | 832/3741 [5:02:27<16:50:33, 20.84s/it] +2025-05-10 22:28:40 - ERROR - stderr - 22%|██▏ | 833/3741 [5:02:46<16:29:39, 20.42s/it] +2025-05-10 22:28:40 - ERROR - stderr - +2025-05-10 22:28:40 - ERROR - stderr - +2025-05-10 22:28:40 - INFO - stdout - {'loss': 0.9342, 'grad_norm': 0.5872328877449036, 'learning_rate': 1.8118577629773824e-05, 'epoch': 0.67} +2025-05-10 22:28:40 - ERROR - stderr - 22%|██▏ | 833/3741 [5:02:46<16:29:39, 20.42s/it] +2025-05-10 22:29:02 - ERROR - stderr - 22%|██▏ | 834/3741 [5:03:08<16:52:07, 20.89s/it] +2025-05-10 22:29:02 - ERROR - stderr - +2025-05-10 22:29:02 - ERROR - stderr - +2025-05-10 22:29:02 - INFO - stdout - {'loss': 0.9535, 'grad_norm': 0.5850470066070557, 'learning_rate': 1.81135188109311e-05, 'epoch': 0.67} +2025-05-10 22:29:02 - ERROR - stderr - 22%|██▏ | 834/3741 [5:03:08<16:52:07, 20.89s/it] +2025-05-10 22:29:21 - ERROR - stderr - 22%|██▏ | 835/3741 [5:03:28<16:32:21, 20.49s/it] +2025-05-10 22:29:21 - ERROR - stderr - +2025-05-10 22:29:21 - ERROR - stderr - +2025-05-10 22:29:21 - INFO - stdout - {'loss': 0.9408, 'grad_norm': 0.6239657402038574, 'learning_rate': 1.8108453908295655e-05, 'epoch': 0.67} +2025-05-10 22:29:21 - ERROR - stderr - 22%|██▏ | 835/3741 [5:03:28<16:32:21, 20.49s/it] +2025-05-10 22:29:44 - ERROR - stderr - 22%|██▏ | 836/3741 [5:03:51<17:08:33, 21.24s/it] +2025-05-10 22:29:44 - ERROR - stderr - +2025-05-10 22:29:44 - ERROR - stderr - +2025-05-10 22:29:44 - INFO - stdout - {'loss': 0.9907, 'grad_norm': 0.6208472847938538, 'learning_rate': 1.8103382925665324e-05, 'epoch': 0.67} +2025-05-10 22:29:44 - ERROR - stderr - 22%|██▏ | 836/3741 [5:03:51<17:08:33, 21.24s/it] +2025-05-10 22:30:04 - ERROR - stderr - 22%|██▏ | 837/3741 [5:04:10<16:43:46, 20.74s/it] +2025-05-10 22:30:04 - ERROR - stderr - +2025-05-10 22:30:04 - ERROR - stderr - +2025-05-10 22:30:04 - INFO - stdout - {'loss': 0.964, 'grad_norm': 0.5864999890327454, 'learning_rate': 1.8098305866842506e-05, 'epoch': 0.67} +2025-05-10 22:30:04 - ERROR - stderr - 22%|██▏ | 837/3741 [5:04:10<16:43:46, 20.74s/it] +2025-05-10 22:30:26 - ERROR - stderr - 22%|██▏ | 838/3741 [5:04:32<17:06:04, 21.21s/it] +2025-05-10 22:30:26 - ERROR - stderr - +2025-05-10 22:30:26 - ERROR - stderr - +2025-05-10 22:30:26 - INFO - stdout - {'loss': 0.969, 'grad_norm': 0.6111268997192383, 'learning_rate': 1.809322273563415e-05, 'epoch': 0.67} +2025-05-10 22:30:26 - ERROR - stderr - 22%|██▏ | 838/3741 [5:04:32<17:06:04, 21.21s/it] +2025-05-10 22:30:46 - ERROR - stderr - 22%|██▏ | 839/3741 [5:04:52<16:46:30, 20.81s/it] +2025-05-10 22:30:46 - ERROR - stderr - +2025-05-10 22:30:46 - ERROR - stderr - +2025-05-10 22:30:46 - INFO - stdout - {'loss': 0.9177, 'grad_norm': 0.6360272169113159, 'learning_rate': 1.8088133535851763e-05, 'epoch': 0.67} +2025-05-10 22:30:46 - ERROR - stderr - 22%|██▏ | 839/3741 [5:04:52<16:46:30, 20.81s/it] +2025-05-10 22:30:46 - INFO - stdout - WARNING: tokenization mismatch: 3183 vs. 3209. (ignored) +2025-05-10 22:30:46 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5807 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 22:30:46 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5807 > 4096). Running this sequence through the model will result in indexing errors +2025-05-10 22:31:06 - ERROR - stderr - 22%|██▏ | 840/3741 [5:05:12<16:28:54, 20.45s/it] +2025-05-10 22:31:06 - ERROR - stderr - +2025-05-10 22:31:06 - ERROR - stderr - +2025-05-10 22:31:06 - INFO - stdout - {'loss': 0.9047, 'grad_norm': 0.6175538897514343, 'learning_rate': 1.80830382713114e-05, 'epoch': 0.67} +2025-05-10 22:31:06 - ERROR - stderr - 22%|██▏ | 840/3741 [5:05:12<16:28:54, 20.45s/it] +2025-05-10 22:31:33 - ERROR - stderr - 22%|██▏ | 841/3741 [5:05:39<18:04:41, 22.44s/it] +2025-05-10 22:31:33 - ERROR - stderr - +2025-05-10 22:31:33 - ERROR - stderr - +2025-05-10 22:31:33 - INFO - stdout - {'loss': 0.9443, 'grad_norm': 0.6100848317146301, 'learning_rate': 1.8077936945833662e-05, 'epoch': 0.67} +2025-05-10 22:31:33 - ERROR - stderr - 22%|██▏ | 841/3741 [5:05:39<18:04:41, 22.44s/it] +2025-05-10 22:31:52 - ERROR - stderr - 23%|██▎ | 842/3741 [5:05:58<17:20:52, 21.54s/it] +2025-05-10 22:31:52 - ERROR - stderr - +2025-05-10 22:31:52 - ERROR - stderr - +2025-05-10 22:31:52 - INFO - stdout - {'loss': 0.9368, 'grad_norm': 0.6124653220176697, 'learning_rate': 1.80728295632437e-05, 'epoch': 0.68} +2025-05-10 22:31:52 - ERROR - stderr - 23%|██▎ | 842/3741 [5:05:58<17:20:52, 21.54s/it] +2025-05-10 22:32:15 - ERROR - stderr - 23%|██▎ | 843/3741 [5:06:22<17:43:21, 22.02s/it] +2025-05-10 22:32:15 - ERROR - stderr - +2025-05-10 22:32:15 - ERROR - stderr - +2025-05-10 22:32:15 - INFO - stdout - {'loss': 0.9087, 'grad_norm': 0.6022012829780579, 'learning_rate': 1.8067716127371197e-05, 'epoch': 0.68} +2025-05-10 22:32:15 - ERROR - stderr - 23%|██▎ | 843/3741 [5:06:22<17:43:21, 22.02s/it] +2025-05-10 22:32:35 - ERROR - stderr - 23%|██▎ | 844/3741 [5:06:41<17:05:06, 21.23s/it] +2025-05-10 22:32:35 - ERROR - stderr - +2025-05-10 22:32:35 - ERROR - stderr - +2025-05-10 22:32:35 - INFO - stdout - {'loss': 0.9418, 'grad_norm': 0.6640161275863647, 'learning_rate': 1.806259664205039e-05, 'epoch': 0.68} +2025-05-10 22:32:35 - ERROR - stderr - 23%|██▎ | 844/3741 [5:06:41<17:05:06, 21.23s/it] +2025-05-10 22:32:58 - ERROR - stderr - 23%|██▎ | 845/3741 [5:07:04<17:31:03, 21.78s/it] +2025-05-10 22:32:58 - ERROR - stderr - +2025-05-10 22:32:58 - ERROR - stderr - +2025-05-10 22:32:58 - INFO - stdout - {'loss': 0.9169, 'grad_norm': 0.5954174995422363, 'learning_rate': 1.805747111112004e-05, 'epoch': 0.68} +2025-05-10 22:32:58 - ERROR - stderr - 23%|██▎ | 845/3741 [5:07:04<17:31:03, 21.78s/it] +2025-05-10 22:33:17 - ERROR - stderr - 23%|██▎ | 846/3741 [5:07:23<16:57:12, 21.08s/it] +2025-05-10 22:33:17 - ERROR - stderr - +2025-05-10 22:33:17 - ERROR - stderr - +2025-05-10 22:33:17 - INFO - stdout - {'loss': 0.9537, 'grad_norm': 0.6202585101127625, 'learning_rate': 1.805233953842344e-05, 'epoch': 0.68} +2025-05-10 22:33:17 - ERROR - stderr - 23%|██▎ | 846/3741 [5:07:23<16:57:12, 21.08s/it] +2025-05-10 22:33:38 - ERROR - stderr - 23%|██▎ | 847/3741 [5:07:44<16:49:41, 20.93s/it] +2025-05-10 22:33:38 - ERROR - stderr - +2025-05-10 22:33:38 - ERROR - stderr - +2025-05-10 22:33:38 - INFO - stdout - {'loss': 0.9279, 'grad_norm': 0.5560839176177979, 'learning_rate': 1.8047201927808423e-05, 'epoch': 0.68} +2025-05-10 22:33:38 - ERROR - stderr - 23%|██▎ | 847/3741 [5:07:44<16:49:41, 20.93s/it] +2025-05-10 22:33:57 - ERROR - stderr - 23%|██▎ | 848/3741 [5:08:04<16:30:08, 20.54s/it] +2025-05-10 22:33:57 - ERROR - stderr - +2025-05-10 22:33:57 - ERROR - stderr - +2025-05-10 22:33:57 - INFO - stdout - {'loss': 0.934, 'grad_norm': 0.6648291945457458, 'learning_rate': 1.8042058283127345e-05, 'epoch': 0.68} +2025-05-10 22:33:57 - ERROR - stderr - 23%|██▎ | 848/3741 [5:08:04<16:30:08, 20.54s/it] +2025-05-10 22:34:17 - ERROR - stderr - 23%|██▎ | 849/3741 [5:08:24<16:24:47, 20.43s/it] +2025-05-10 22:34:18 - ERROR - stderr - +2025-05-10 22:34:18 - ERROR - stderr - +2025-05-10 22:34:18 - INFO - stdout - {'loss': 0.9258, 'grad_norm': 0.7005195021629333, 'learning_rate': 1.8036908608237085e-05, 'epoch': 0.68} +2025-05-10 22:34:18 - ERROR - stderr - 23%|██▎ | 849/3741 [5:08:24<16:24:47, 20.43s/it] +2025-05-10 22:34:41 - ERROR - stderr - 23%|██▎ | 850/3741 [5:08:47<17:07:29, 21.32s/it] +2025-05-10 22:34:41 - ERROR - stderr - +2025-05-10 22:34:41 - ERROR - stderr - +2025-05-10 22:34:41 - INFO - stdout - {'loss': 0.9609, 'grad_norm': 0.6536465883255005, 'learning_rate': 1.803175290699904e-05, 'epoch': 0.68} +2025-05-10 22:34:41 - ERROR - stderr - 23%|██▎ | 850/3741 [5:08:47<17:07:29, 21.32s/it] +2025-05-10 22:35:01 - ERROR - stderr - 23%|██▎ | 851/3741 [5:09:07<16:42:55, 20.82s/it] +2025-05-10 22:35:01 - ERROR - stderr - +2025-05-10 22:35:01 - ERROR - stderr - +2025-05-10 22:35:01 - INFO - stdout - {'loss': 0.9085, 'grad_norm': 0.6565441489219666, 'learning_rate': 1.8026591183279136e-05, 'epoch': 0.68} +2025-05-10 22:35:01 - ERROR - stderr - 23%|██▎ | 851/3741 [5:09:07<16:42:55, 20.82s/it] +2025-05-10 22:35:25 - ERROR - stderr - 23%|██▎ | 852/3741 [5:09:32<17:39:00, 21.99s/it] +2025-05-10 22:35:25 - ERROR - stderr - +2025-05-10 22:35:25 - ERROR - stderr - +2025-05-10 22:35:25 - INFO - stdout - {'loss': 0.9386, 'grad_norm': 0.6199874877929688, 'learning_rate': 1.8021423440947808e-05, 'epoch': 0.68} +2025-05-10 22:35:25 - ERROR - stderr - 23%|██▎ | 852/3741 [5:09:32<17:39:00, 21.99s/it] +2025-05-10 22:35:45 - ERROR - stderr - 23%|██▎ | 853/3741 [5:09:51<17:03:51, 21.27s/it] +2025-05-10 22:35:45 - ERROR - stderr - +2025-05-10 22:35:45 - ERROR - stderr - +2025-05-10 22:35:45 - INFO - stdout - {'loss': 0.9389, 'grad_norm': 0.6430292725563049, 'learning_rate': 1.801624968388e-05, 'epoch': 0.68} +2025-05-10 22:35:45 - ERROR - stderr - 23%|██▎ | 853/3741 [5:09:51<17:03:51, 21.27s/it] +2025-05-10 22:36:08 - ERROR - stderr - 23%|██▎ | 854/3741 [5:10:15<17:37:14, 21.97s/it] +2025-05-10 22:36:08 - ERROR - stderr - +2025-05-10 22:36:08 - ERROR - stderr - +2025-05-10 22:36:08 - INFO - stdout - {'loss': 0.9225, 'grad_norm': 0.601648211479187, 'learning_rate': 1.801106991595518e-05, 'epoch': 0.68} +2025-05-10 22:36:08 - ERROR - stderr - 23%|██▎ | 854/3741 [5:10:15<17:37:14, 21.97s/it] +2025-05-10 22:36:28 - ERROR - stderr - 23%|██▎ | 855/3741 [5:10:35<17:05:08, 21.31s/it] +2025-05-10 22:36:28 - ERROR - stderr - +2025-05-10 22:36:28 - ERROR - stderr - +2025-05-10 22:36:28 - INFO - stdout - {'loss': 0.9545, 'grad_norm': 0.591111421585083, 'learning_rate': 1.800588414105731e-05, 'epoch': 0.69} +2025-05-10 22:36:28 - ERROR - stderr - 23%|██▎ | 855/3741 [5:10:35<17:05:08, 21.31s/it] +2025-05-10 22:36:48 - ERROR - stderr - 23%|██▎ | 856/3741 [5:10:54<16:35:32, 20.70s/it] +2025-05-10 22:36:48 - ERROR - stderr - +2025-05-10 22:36:48 - ERROR - stderr - +2025-05-10 22:36:48 - INFO - stdout - {'loss': 0.942, 'grad_norm': 0.6806792616844177, 'learning_rate': 1.8000692363074862e-05, 'epoch': 0.69} +2025-05-10 22:36:48 - ERROR - stderr - 23%|██▎ | 856/3741 [5:10:54<16:35:32, 20.70s/it] +2025-05-10 22:37:07 - ERROR - stderr - 23%|██▎ | 857/3741 [5:11:13<16:18:36, 20.36s/it] +2025-05-10 22:37:07 - ERROR - stderr - +2025-05-10 22:37:07 - ERROR - stderr - +2025-05-10 22:37:07 - INFO - stdout - {'loss': 0.9303, 'grad_norm': 0.5764021277427673, 'learning_rate': 1.7995494585900802e-05, 'epoch': 0.69} +2025-05-10 22:37:07 - ERROR - stderr - 23%|██▎ | 857/3741 [5:11:13<16:18:36, 20.36s/it] +2025-05-10 22:37:26 - ERROR - stderr - 23%|██▎ | 858/3741 [5:11:33<16:04:18, 20.07s/it] +2025-05-10 22:37:26 - ERROR - stderr - +2025-05-10 22:37:26 - ERROR - stderr - +2025-05-10 22:37:26 - INFO - stdout - {'loss': 0.955, 'grad_norm': 0.6204013228416443, 'learning_rate': 1.7990290813432613e-05, 'epoch': 0.69} +2025-05-10 22:37:26 - ERROR - stderr - 23%|██▎ | 858/3741 [5:11:33<16:04:18, 20.07s/it] +2025-05-10 22:37:49 - ERROR - stderr - 23%|██▎ | 859/3741 [5:11:56<16:41:58, 20.86s/it] +2025-05-10 22:37:49 - ERROR - stderr - +2025-05-10 22:37:49 - ERROR - stderr - +2025-05-10 22:37:49 - INFO - stdout - {'loss': 0.9287, 'grad_norm': 0.618166446685791, 'learning_rate': 1.7985081049572244e-05, 'epoch': 0.69} +2025-05-10 22:37:49 - ERROR - stderr - 23%|██▎ | 859/3741 [5:11:56<16:41:58, 20.86s/it] +2025-05-10 22:38:09 - ERROR - stderr - 23%|██▎ | 860/3741 [5:12:15<16:21:06, 20.43s/it] +2025-05-10 22:38:09 - ERROR - stderr - +2025-05-10 22:38:09 - ERROR - stderr - +2025-05-10 22:38:09 - INFO - stdout - {'loss': 0.9297, 'grad_norm': 0.5855494141578674, 'learning_rate': 1.797986529822617e-05, 'epoch': 0.69} +2025-05-10 22:38:09 - ERROR - stderr - 23%|██▎ | 860/3741 [5:12:15<16:21:06, 20.43s/it] +2025-05-10 22:38:32 - ERROR - stderr - 23%|██▎ | 861/3741 [5:12:38<16:58:57, 21.23s/it] +2025-05-10 22:38:32 - ERROR - stderr - +2025-05-10 22:38:32 - ERROR - stderr - +2025-05-10 22:38:32 - INFO - stdout - {'loss': 0.9884, 'grad_norm': 0.6061149835586548, 'learning_rate': 1.7974643563305326e-05, 'epoch': 0.69} +2025-05-10 22:38:32 - ERROR - stderr - 23%|██▎ | 861/3741 [5:12:38<16:58:57, 21.23s/it] +2025-05-10 22:38:51 - ERROR - stderr - 23%|██▎ | 862/3741 [5:12:58<16:35:05, 20.74s/it] +2025-05-10 22:38:51 - ERROR - stderr - +2025-05-10 22:38:51 - ERROR - stderr - +2025-05-10 22:38:51 - INFO - stdout - {'loss': 0.9607, 'grad_norm': 0.5847954750061035, 'learning_rate': 1.7969415848725155e-05, 'epoch': 0.69} +2025-05-10 22:38:51 - ERROR - stderr - 23%|██▎ | 862/3741 [5:12:58<16:35:05, 20.74s/it] +2025-05-10 22:39:12 - ERROR - stderr - 23%|██▎ | 863/3741 [5:13:18<16:28:46, 20.61s/it] +2025-05-10 22:39:12 - ERROR - stderr - +2025-05-10 22:39:12 - ERROR - stderr - +2025-05-10 22:39:12 - INFO - stdout - {'loss': 0.9519, 'grad_norm': 0.652940034866333, 'learning_rate': 1.7964182158405567e-05, 'epoch': 0.69} +2025-05-10 22:39:12 - ERROR - stderr - 23%|██▎ | 863/3741 [5:13:18<16:28:46, 20.61s/it] +2025-05-10 22:39:31 - ERROR - stderr - 23%|██▎ | 864/3741 [5:13:38<16:14:49, 20.33s/it] +2025-05-10 22:39:31 - ERROR - stderr - +2025-05-10 22:39:31 - ERROR - stderr - +2025-05-10 22:39:31 - INFO - stdout - {'loss': 0.9627, 'grad_norm': 0.6230655908584595, 'learning_rate': 1.795894249627097e-05, 'epoch': 0.69} +2025-05-10 22:39:31 - ERROR - stderr - 23%|██▎ | 864/3741 [5:13:38<16:14:49, 20.33s/it] +2025-05-10 22:39:51 - ERROR - stderr - 23%|██▎ | 865/3741 [5:13:57<16:04:10, 20.11s/it] +2025-05-10 22:39:51 - ERROR - stderr - +2025-05-10 22:39:51 - ERROR - stderr - +2025-05-10 22:39:51 - INFO - stdout - {'loss': 0.8989, 'grad_norm': 0.5886598825454712, 'learning_rate': 1.795369686625024e-05, 'epoch': 0.69} +2025-05-10 22:39:51 - ERROR - stderr - 23%|██▎ | 865/3741 [5:13:57<16:04:10, 20.11s/it] +2025-05-10 22:40:14 - ERROR - stderr - 23%|██▎ | 866/3741 [5:14:21<16:51:38, 21.11s/it] +2025-05-10 22:40:14 - ERROR - stderr - +2025-05-10 22:40:14 - ERROR - stderr - +2025-05-10 22:40:14 - INFO - stdout - {'loss': 0.9438, 'grad_norm': 0.6408997178077698, 'learning_rate': 1.7948445272276727e-05, 'epoch': 0.69} +2025-05-10 22:40:14 - ERROR - stderr - 23%|██▎ | 866/3741 [5:14:21<16:51:38, 21.11s/it] +2025-05-10 22:40:34 - ERROR - stderr - 23%|██▎ | 867/3741 [5:14:40<16:28:21, 20.63s/it] +2025-05-10 22:40:34 - ERROR - stderr - +2025-05-10 22:40:34 - ERROR - stderr - +2025-05-10 22:40:34 - INFO - stdout - {'loss': 0.9283, 'grad_norm': 0.6148324012756348, 'learning_rate': 1.794318771828825e-05, 'epoch': 0.7} +2025-05-10 22:40:34 - ERROR - stderr - 23%|██▎ | 867/3741 [5:14:40<16:28:21, 20.63s/it] +2025-05-10 22:40:57 - ERROR - stderr - 23%|██▎ | 868/3741 [5:15:03<17:00:32, 21.31s/it] +2025-05-10 22:40:57 - ERROR - stderr - +2025-05-10 22:40:57 - ERROR - stderr - +2025-05-10 22:40:57 - INFO - stdout - {'loss': 0.955, 'grad_norm': 0.6214705109596252, 'learning_rate': 1.793792420822711e-05, 'epoch': 0.7} +2025-05-10 22:40:57 - ERROR - stderr - 23%|██▎ | 868/3741 [5:15:03<17:00:32, 21.31s/it] +2025-05-10 22:41:16 - ERROR - stderr - 23%|██▎ | 869/3741 [5:15:23<16:34:55, 20.79s/it] +2025-05-10 22:41:16 - ERROR - stderr - +2025-05-10 22:41:16 - ERROR - stderr - +2025-05-10 22:41:16 - INFO - stdout - {'loss': 0.9252, 'grad_norm': 0.6310122013092041, 'learning_rate': 1.7932654746040063e-05, 'epoch': 0.7} +2025-05-10 22:41:16 - ERROR - stderr - 23%|██▎ | 869/3741 [5:15:23<16:34:55, 20.79s/it] +2025-05-10 22:41:36 - ERROR - stderr - 23%|██▎ | 870/3741 [5:15:42<16:17:39, 20.43s/it] +2025-05-10 22:41:36 - ERROR - stderr - +2025-05-10 22:41:36 - ERROR - stderr - +2025-05-10 22:41:36 - INFO - stdout - {'loss': 0.9219, 'grad_norm': 0.6389340758323669, 'learning_rate': 1.7927379335678333e-05, 'epoch': 0.7} +2025-05-10 22:41:36 - ERROR - stderr - 23%|██▎ | 870/3741 [5:15:42<16:17:39, 20.43s/it] +2025-05-10 22:41:56 - ERROR - stderr - 23%|██▎ | 871/3741 [5:16:02<16:07:38, 20.23s/it] +2025-05-10 22:41:56 - ERROR - stderr - +2025-05-10 22:41:56 - ERROR - stderr - +2025-05-10 22:41:56 - INFO - stdout - {'loss': 0.9396, 'grad_norm': 0.597550630569458, 'learning_rate': 1.7922097981097596e-05, 'epoch': 0.7} +2025-05-10 22:41:56 - ERROR - stderr - 23%|██▎ | 871/3741 [5:16:02<16:07:38, 20.23s/it] +2025-05-10 22:42:16 - ERROR - stderr - 23%|██▎ | 872/3741 [5:16:22<16:03:00, 20.14s/it] +2025-05-10 22:42:16 - ERROR - stderr - +2025-05-10 22:42:16 - ERROR - stderr - +2025-05-10 22:42:16 - INFO - stdout - {'loss': 0.9493, 'grad_norm': 0.5725171566009521, 'learning_rate': 1.7916810686257998e-05, 'epoch': 0.7} +2025-05-10 22:42:16 - ERROR - stderr - 23%|██▎ | 872/3741 [5:16:22<16:03:00, 20.14s/it] +2025-05-10 22:42:39 - ERROR - stderr - 23%|██▎ | 873/3741 [5:16:46<16:54:42, 21.23s/it] +2025-05-10 22:42:39 - ERROR - stderr - +2025-05-10 22:42:39 - ERROR - stderr - +2025-05-10 22:42:39 - INFO - stdout - {'loss': 0.9032, 'grad_norm': 0.5874449014663696, 'learning_rate': 1.791151745512413e-05, 'epoch': 0.7} +2025-05-10 22:42:39 - ERROR - stderr - 23%|██▎ | 873/3741 [5:16:46<16:54:42, 21.23s/it] +2025-05-10 22:42:59 - ERROR - stderr - 23%|██▎ | 874/3741 [5:17:05<16:26:32, 20.65s/it] +2025-05-10 22:42:59 - ERROR - stderr - +2025-05-10 22:42:59 - ERROR - stderr - +2025-05-10 22:42:59 - INFO - stdout - {'loss': 0.8794, 'grad_norm': 0.6453227400779724, 'learning_rate': 1.790621829166504e-05, 'epoch': 0.7} +2025-05-10 22:42:59 - ERROR - stderr - 23%|██▎ | 874/3741 [5:17:05<16:26:32, 20.65s/it] +2025-05-10 22:43:23 - ERROR - stderr - 23%|██▎ | 875/3741 [5:17:29<17:16:40, 21.70s/it] +2025-05-10 22:43:23 - ERROR - stderr - +2025-05-10 22:43:23 - ERROR - stderr - +2025-05-10 22:43:23 - INFO - stdout - {'loss': 0.946, 'grad_norm': 0.5927412509918213, 'learning_rate': 1.7900913199854218e-05, 'epoch': 0.7} +2025-05-10 22:43:23 - ERROR - stderr - 23%|██▎ | 875/3741 [5:17:29<17:16:40, 21.70s/it] +2025-05-10 22:43:43 - ERROR - stderr - 23%|██▎ | 876/3741 [5:17:49<16:49:46, 21.15s/it] +2025-05-10 22:43:43 - ERROR - stderr - +2025-05-10 22:43:43 - ERROR - stderr - +2025-05-10 22:43:43 - INFO - stdout - {'loss': 0.9298, 'grad_norm': 0.6199756860733032, 'learning_rate': 1.7895602183669602e-05, 'epoch': 0.7} +2025-05-10 22:43:43 - ERROR - stderr - 23%|██▎ | 876/3741 [5:17:49<16:49:46, 21.15s/it] +2025-05-10 22:44:05 - ERROR - stderr - 23%|██▎ | 877/3741 [5:18:11<16:59:58, 21.37s/it] +2025-05-10 22:44:05 - ERROR - stderr - +2025-05-10 22:44:05 - ERROR - stderr - +2025-05-10 22:44:05 - INFO - stdout - {'loss': 0.9928, 'grad_norm': 0.6697978377342224, 'learning_rate': 1.7890285247093574e-05, 'epoch': 0.7} +2025-05-10 22:44:05 - ERROR - stderr - 23%|██▎ | 877/3741 [5:18:11<16:59:58, 21.37s/it] +2025-05-10 22:44:24 - ERROR - stderr - 23%|██▎ | 878/3741 [5:18:31<16:36:01, 20.87s/it] +2025-05-10 22:44:24 - ERROR - stderr - +2025-05-10 22:44:24 - ERROR - stderr - +2025-05-10 22:44:24 - INFO - stdout - {'loss': 0.9256, 'grad_norm': 0.6035749912261963, 'learning_rate': 1.7884962394112953e-05, 'epoch': 0.7} +2025-05-10 22:44:24 - ERROR - stderr - 23%|██▎ | 878/3741 [5:18:31<16:36:01, 20.87s/it] +2025-05-10 22:44:44 - ERROR - stderr - 23%|██▎ | 879/3741 [5:18:50<16:17:07, 20.48s/it] +2025-05-10 22:44:44 - ERROR - stderr - +2025-05-10 22:44:44 - ERROR - stderr - +2025-05-10 22:44:44 - INFO - stdout - {'loss': 0.9228, 'grad_norm': 0.6426158547401428, 'learning_rate': 1.7879633628719e-05, 'epoch': 0.7} +2025-05-10 22:44:44 - ERROR - stderr - 23%|██▎ | 879/3741 [5:18:50<16:17:07, 20.48s/it] +2025-05-10 22:45:07 - ERROR - stderr - 24%|██▎ | 880/3741 [5:19:13<16:54:11, 21.27s/it] +2025-05-10 22:45:07 - ERROR - stderr - +2025-05-10 22:45:07 - ERROR - stderr - +2025-05-10 22:45:07 - INFO - stdout - {'loss': 0.9229, 'grad_norm': 0.5659704804420471, 'learning_rate': 1.7874298954907405e-05, 'epoch': 0.71} +2025-05-10 22:45:07 - ERROR - stderr - 24%|██▎ | 880/3741 [5:19:13<16:54:11, 21.27s/it] +2025-05-10 22:45:26 - ERROR - stderr - 24%|██▎ | 881/3741 [5:19:33<16:26:55, 20.70s/it] +2025-05-10 22:45:26 - ERROR - stderr - +2025-05-10 22:45:26 - ERROR - stderr - +2025-05-10 22:45:26 - INFO - stdout - {'loss': 0.9421, 'grad_norm': 0.598106324672699, 'learning_rate': 1.786895837667828e-05, 'epoch': 0.71} +2025-05-10 22:45:26 - ERROR - stderr - 24%|██▎ | 881/3741 [5:19:33<16:26:55, 20.70s/it] +2025-05-10 22:45:50 - ERROR - stderr - 24%|██▎ | 882/3741 [5:19:56<17:07:45, 21.57s/it] +2025-05-10 22:45:50 - ERROR - stderr - +2025-05-10 22:45:50 - ERROR - stderr - +2025-05-10 22:45:50 - INFO - stdout - {'loss': 0.9289, 'grad_norm': 0.5607869029045105, 'learning_rate': 1.7863611898036175e-05, 'epoch': 0.71} +2025-05-10 22:45:50 - ERROR - stderr - 24%|██▎ | 882/3741 [5:19:56<17:07:45, 21.57s/it] +2025-05-10 22:46:09 - ERROR - stderr - 24%|██▎ | 883/3741 [5:20:16<16:36:54, 20.93s/it] +2025-05-10 22:46:09 - ERROR - stderr - +2025-05-10 22:46:09 - ERROR - stderr - +2025-05-10 22:46:09 - INFO - stdout - {'loss': 0.9785, 'grad_norm': 0.6277954578399658, 'learning_rate': 1.7858259522990067e-05, 'epoch': 0.71} +2025-05-10 22:46:09 - ERROR - stderr - 24%|██▎ | 883/3741 [5:20:16<16:36:54, 20.93s/it] +2025-05-10 22:46:32 - ERROR - stderr - 24%|██▎ | 884/3741 [5:20:39<17:06:29, 21.56s/it] +2025-05-10 22:46:32 - ERROR - stderr - +2025-05-10 22:46:32 - ERROR - stderr - +2025-05-10 22:46:32 - INFO - stdout - {'loss': 0.9637, 'grad_norm': 0.7224546670913696, 'learning_rate': 1.7852901255553346e-05, 'epoch': 0.71} +2025-05-10 22:46:32 - ERROR - stderr - 24%|██▎ | 884/3741 [5:20:39<17:06:29, 21.56s/it] +2025-05-10 22:46:52 - ERROR - stderr - 24%|██▎ | 885/3741 [5:20:58<16:36:19, 20.93s/it] +2025-05-10 22:46:52 - ERROR - stderr - +2025-05-10 22:46:52 - ERROR - stderr - +2025-05-10 22:46:52 - INFO - stdout - {'loss': 0.8912, 'grad_norm': 0.5827512145042419, 'learning_rate': 1.7847537099743824e-05, 'epoch': 0.71} +2025-05-10 22:46:52 - ERROR - stderr - 24%|██▎ | 885/3741 [5:20:58<16:36:19, 20.93s/it] +2025-05-10 22:47:12 - ERROR - stderr - 24%|██▎ | 886/3741 [5:21:18<16:22:23, 20.65s/it] +2025-05-10 22:47:12 - ERROR - stderr - +2025-05-10 22:47:12 - ERROR - stderr - +2025-05-10 22:47:12 - INFO - stdout - {'loss': 0.9232, 'grad_norm': 0.6022170186042786, 'learning_rate': 1.7842167059583723e-05, 'epoch': 0.71} +2025-05-10 22:47:12 - ERROR - stderr - 24%|██▎ | 886/3741 [5:21:18<16:22:23, 20.65s/it] +2025-05-10 22:47:31 - ERROR - stderr - 24%|██▎ | 887/3741 [5:21:38<16:03:52, 20.26s/it] +2025-05-10 22:47:31 - ERROR - stderr - +2025-05-10 22:47:31 - ERROR - stderr - +2025-05-10 22:47:31 - INFO - stdout - {'loss': 0.9659, 'grad_norm': 0.6745474934577942, 'learning_rate': 1.783679113909969e-05, 'epoch': 0.71} +2025-05-10 22:47:31 - ERROR - stderr - 24%|██▎ | 887/3741 [5:21:38<16:03:52, 20.26s/it] +2025-05-10 22:47:51 - ERROR - stderr - 24%|██▎ | 888/3741 [5:21:57<15:53:31, 20.05s/it] +2025-05-10 22:47:51 - ERROR - stderr - +2025-05-10 22:47:51 - ERROR - stderr - +2025-05-10 22:47:51 - INFO - stdout - {'loss': 0.9329, 'grad_norm': 0.6338194608688354, 'learning_rate': 1.7831409342322766e-05, 'epoch': 0.71} +2025-05-10 22:47:51 - ERROR - stderr - 24%|██▎ | 888/3741 [5:21:57<15:53:31, 20.05s/it] +2025-05-10 22:48:14 - ERROR - stderr - 24%|██▍ | 889/3741 [5:22:21<16:44:07, 21.12s/it] +2025-05-10 22:48:14 - ERROR - stderr - +2025-05-10 22:48:14 - ERROR - stderr - +2025-05-10 22:48:14 - INFO - stdout - {'loss': 0.9881, 'grad_norm': 0.638043224811554, 'learning_rate': 1.7826021673288413e-05, 'epoch': 0.71} +2025-05-10 22:48:14 - ERROR - stderr - 24%|██▍ | 889/3741 [5:22:21<16:44:07, 21.12s/it] +2025-05-10 22:48:34 - ERROR - stderr - 24%|██▍ | 890/3741 [5:22:41<16:25:17, 20.74s/it] +2025-05-10 22:48:34 - ERROR - stderr - +2025-05-10 22:48:34 - ERROR - stderr - +2025-05-10 22:48:34 - INFO - stdout - {'loss': 0.908, 'grad_norm': 0.5955981016159058, 'learning_rate': 1.7820628136036483e-05, 'epoch': 0.71} +2025-05-10 22:48:34 - ERROR - stderr - 24%|██▍ | 890/3741 [5:22:41<16:25:17, 20.74s/it] +2025-05-10 22:48:59 - ERROR - stderr - 24%|██▍ | 891/3741 [5:23:05<17:20:02, 21.90s/it] +2025-05-10 22:48:59 - ERROR - stderr - +2025-05-10 22:48:59 - ERROR - stderr - +2025-05-10 22:48:59 - INFO - stdout - {'loss': 0.9438, 'grad_norm': 0.6151586771011353, 'learning_rate': 1.7815228734611233e-05, 'epoch': 0.71} +2025-05-10 22:48:59 - ERROR - stderr - 24%|██▍ | 891/3741 [5:23:05<17:20:02, 21.90s/it] +2025-05-10 22:49:18 - ERROR - stderr - 24%|██▍ | 892/3741 [5:23:25<16:43:25, 21.13s/it] +2025-05-10 22:49:18 - ERROR - stderr - +2025-05-10 22:49:18 - ERROR - stderr - +2025-05-10 22:49:18 - INFO - stdout - {'loss': 0.9605, 'grad_norm': 0.6584967970848083, 'learning_rate': 1.7809823473061324e-05, 'epoch': 0.72} +2025-05-10 22:49:18 - ERROR - stderr - 24%|██▍ | 892/3741 [5:23:25<16:43:25, 21.13s/it] +2025-05-10 22:49:40 - ERROR - stderr - 24%|██▍ | 893/3741 [5:23:47<16:57:36, 21.44s/it] +2025-05-10 22:49:40 - ERROR - stderr - +2025-05-10 22:49:40 - ERROR - stderr - +2025-05-10 22:49:40 - INFO - stdout - {'loss': 0.9248, 'grad_norm': 0.6425672769546509, 'learning_rate': 1.7804412355439803e-05, 'epoch': 0.72} +2025-05-10 22:49:40 - ERROR - stderr - 24%|██▍ | 893/3741 [5:23:47<16:57:36, 21.44s/it] +2025-05-10 22:50:00 - ERROR - stderr - 24%|██▍ | 894/3741 [5:24:06<16:31:04, 20.89s/it] +2025-05-10 22:50:00 - ERROR - stderr - +2025-05-10 22:50:00 - ERROR - stderr - +2025-05-10 22:50:00 - INFO - stdout - {'loss': 0.874, 'grad_norm': 0.6150190234184265, 'learning_rate': 1.7798995385804107e-05, 'epoch': 0.72} +2025-05-10 22:50:00 - ERROR - stderr - 24%|██▍ | 894/3741 [5:24:06<16:31:04, 20.89s/it] +2025-05-10 22:50:19 - ERROR - stderr - 24%|██▍ | 895/3741 [5:24:26<16:10:09, 20.45s/it] +2025-05-10 22:50:19 - ERROR - stderr - +2025-05-10 22:50:19 - ERROR - stderr - +2025-05-10 22:50:19 - INFO - stdout - {'loss': 0.9333, 'grad_norm': 0.6137731075286865, 'learning_rate': 1.7793572568216063e-05, 'epoch': 0.72} +2025-05-10 22:50:19 - ERROR - stderr - 24%|██▍ | 895/3741 [5:24:26<16:10:09, 20.45s/it] +2025-05-10 22:50:42 - ERROR - stderr - 24%|██▍ | 896/3741 [5:24:48<16:34:26, 20.97s/it] +2025-05-10 22:50:42 - ERROR - stderr - +2025-05-10 22:50:42 - ERROR - stderr - +2025-05-10 22:50:42 - INFO - stdout - {'loss': 0.9447, 'grad_norm': 0.6165148019790649, 'learning_rate': 1.778814390674189e-05, 'epoch': 0.72} +2025-05-10 22:50:42 - ERROR - stderr - 24%|██▍ | 896/3741 [5:24:48<16:34:26, 20.97s/it] +2025-05-10 22:51:01 - ERROR - stderr - 24%|██▍ | 897/3741 [5:25:08<16:16:44, 20.61s/it] +2025-05-10 22:51:01 - ERROR - stderr - +2025-05-10 22:51:01 - ERROR - stderr - +2025-05-10 22:51:01 - INFO - stdout - {'loss': 0.8696, 'grad_norm': 0.6286599040031433, 'learning_rate': 1.7782709405452184e-05, 'epoch': 0.72} +2025-05-10 22:51:01 - ERROR - stderr - 24%|██▍ | 897/3741 [5:25:08<16:16:44, 20.61s/it] +2025-05-10 22:51:24 - ERROR - stderr - 24%|██▍ | 898/3741 [5:25:31<16:52:15, 21.36s/it] +2025-05-10 22:51:24 - ERROR - stderr - +2025-05-10 22:51:24 - ERROR - stderr - +2025-05-10 22:51:24 - INFO - stdout - {'loss': 0.9514, 'grad_norm': 0.6361931562423706, 'learning_rate': 1.777726906842191e-05, 'epoch': 0.72} +2025-05-10 22:51:24 - ERROR - stderr - 24%|██▍ | 898/3741 [5:25:31<16:52:15, 21.36s/it] +2025-05-10 22:51:44 - ERROR - stderr - 24%|██▍ | 899/3741 [5:25:50<16:27:46, 20.85s/it] +2025-05-10 22:51:44 - ERROR - stderr - +2025-05-10 22:51:44 - ERROR - stderr - +2025-05-10 22:51:44 - INFO - stdout - {'loss': 0.9249, 'grad_norm': 0.7451531887054443, 'learning_rate': 1.777182289973043e-05, 'epoch': 0.72} +2025-05-10 22:51:44 - ERROR - stderr - 24%|██▍ | 899/3741 [5:25:50<16:27:46, 20.85s/it] +2025-05-10 22:52:07 - ERROR - stderr - 24%|██▍ | 900/3741 [5:26:14<16:59:33, 21.53s/it] +2025-05-10 22:52:07 - ERROR - stderr - +2025-05-10 22:52:07 - ERROR - stderr - +2025-05-10 22:52:07 - INFO - stdout - {'loss': 0.8892, 'grad_norm': 0.6656950116157532, 'learning_rate': 1.776637090346146e-05, 'epoch': 0.72} +2025-05-10 22:52:07 - ERROR - stderr - 24%|██▍ | 900/3741 [5:26:14<16:59:33, 21.53s/it] +2025-05-10 22:52:27 - ERROR - stderr - 24%|██▍ | 901/3741 [5:26:33<16:27:31, 20.86s/it] +2025-05-10 22:52:27 - ERROR - stderr - +2025-05-10 22:52:27 - ERROR - stderr - +2025-05-10 22:52:27 - INFO - stdout - {'loss': 0.8965, 'grad_norm': 0.7215845584869385, 'learning_rate': 1.7760913083703088e-05, 'epoch': 0.72} +2025-05-10 22:52:27 - ERROR - stderr - 24%|██▍ | 901/3741 [5:26:33<16:27:31, 20.86s/it] +2025-05-10 22:52:46 - ERROR - stderr - 24%|██▍ | 902/3741 [5:26:53<16:12:42, 20.56s/it] +2025-05-10 22:52:46 - ERROR - stderr - +2025-05-10 22:52:46 - ERROR - stderr - +2025-05-10 22:52:46 - INFO - stdout - {'loss': 0.954, 'grad_norm': 0.6370206475257874, 'learning_rate': 1.7755449444547783e-05, 'epoch': 0.72} +2025-05-10 22:52:46 - ERROR - stderr - 24%|██▍ | 902/3741 [5:26:53<16:12:42, 20.56s/it] +2025-05-10 22:53:07 - ERROR - stderr - 24%|██▍ | 903/3741 [5:27:13<16:07:26, 20.45s/it] +2025-05-10 22:53:07 - ERROR - stderr - +2025-05-10 22:53:07 - ERROR - stderr - +2025-05-10 22:53:07 - INFO - stdout - {'loss': 0.8996, 'grad_norm': 0.6504797339439392, 'learning_rate': 1.7749979990092364e-05, 'epoch': 0.72} +2025-05-10 22:53:07 - ERROR - stderr - 24%|██▍ | 903/3741 [5:27:13<16:07:26, 20.45s/it] +2025-05-10 22:53:26 - ERROR - stderr - 24%|██▍ | 904/3741 [5:27:32<15:54:03, 20.18s/it] +2025-05-10 22:53:26 - ERROR - stderr - +2025-05-10 22:53:26 - ERROR - stderr - +2025-05-10 22:53:26 - INFO - stdout - {'loss': 0.9311, 'grad_norm': 1.0392930507659912, 'learning_rate': 1.774450472443801e-05, 'epoch': 0.72} +2025-05-10 22:53:26 - ERROR - stderr - 24%|██▍ | 904/3741 [5:27:32<15:54:03, 20.18s/it] +2025-05-10 22:53:50 - ERROR - stderr - 24%|██▍ | 905/3741 [5:27:57<16:50:00, 21.37s/it] +2025-05-10 22:53:50 - ERROR - stderr - +2025-05-10 22:53:50 - ERROR - stderr - +2025-05-10 22:53:50 - INFO - stdout - {'loss': 0.9424, 'grad_norm': 0.6109988689422607, 'learning_rate': 1.7739023651690267e-05, 'epoch': 0.73} +2025-05-10 22:53:50 - ERROR - stderr - 24%|██▍ | 905/3741 [5:27:57<16:50:00, 21.37s/it] +2025-05-10 22:54:10 - ERROR - stderr - 24%|██▍ | 906/3741 [5:28:16<16:28:36, 20.92s/it] +2025-05-10 22:54:10 - ERROR - stderr - +2025-05-10 22:54:10 - ERROR - stderr - +2025-05-10 22:54:10 - INFO - stdout - {'loss': 0.9875, 'grad_norm': 0.677939772605896, 'learning_rate': 1.7733536775959027e-05, 'epoch': 0.73} +2025-05-10 22:54:10 - ERROR - stderr - 24%|██▍ | 906/3741 [5:28:16<16:28:36, 20.92s/it] +2025-05-10 22:54:34 - ERROR - stderr - 24%|██▍ | 907/3741 [5:28:40<17:10:37, 21.82s/it] +2025-05-10 22:54:34 - ERROR - stderr - +2025-05-10 22:54:34 - ERROR - stderr - +2025-05-10 22:54:34 - INFO - stdout - {'loss': 0.9088, 'grad_norm': 0.5968044996261597, 'learning_rate': 1.7728044101358538e-05, 'epoch': 0.73} +2025-05-10 22:54:34 - ERROR - stderr - 24%|██▍ | 907/3741 [5:28:40<17:10:37, 21.82s/it] +2025-05-10 22:54:53 - ERROR - stderr - 24%|██▍ | 908/3741 [5:29:00<16:34:32, 21.06s/it] +2025-05-10 22:54:53 - ERROR - stderr - +2025-05-10 22:54:53 - ERROR - stderr - +2025-05-10 22:54:53 - INFO - stdout - {'loss': 0.9306, 'grad_norm': 0.5922111868858337, 'learning_rate': 1.7722545632007394e-05, 'epoch': 0.73} +2025-05-10 22:54:53 - ERROR - stderr - 24%|██▍ | 908/3741 [5:29:00<16:34:32, 21.06s/it] +2025-05-10 22:55:17 - ERROR - stderr - 24%|██▍ | 909/3741 [5:29:23<17:06:06, 21.74s/it] +2025-05-10 22:55:17 - ERROR - stderr - +2025-05-10 22:55:17 - ERROR - stderr - +2025-05-10 22:55:17 - INFO - stdout - {'loss': 0.9302, 'grad_norm': 0.6236370801925659, 'learning_rate': 1.771704137202853e-05, 'epoch': 0.73} +2025-05-10 22:55:17 - ERROR - stderr - 24%|██▍ | 909/3741 [5:29:23<17:06:06, 21.74s/it] +2025-05-10 22:55:36 - ERROR - stderr - 24%|██▍ | 910/3741 [5:29:43<16:34:15, 21.07s/it] +2025-05-10 22:55:36 - ERROR - stderr - +2025-05-10 22:55:36 - ERROR - stderr - +2025-05-10 22:55:36 - INFO - stdout - {'loss': 0.8739, 'grad_norm': 0.5695316195487976, 'learning_rate': 1.771153132554924e-05, 'epoch': 0.73} +2025-05-10 22:55:36 - ERROR - stderr - 24%|██▍ | 910/3741 [5:29:43<16:34:15, 21.07s/it] +2025-05-10 22:55:56 - ERROR - stderr - 24%|██▍ | 911/3741 [5:30:02<16:13:49, 20.65s/it] +2025-05-10 22:55:56 - ERROR - stderr - +2025-05-10 22:55:56 - ERROR - stderr - +2025-05-10 22:55:56 - INFO - stdout - {'loss': 0.8775, 'grad_norm': 0.6526502966880798, 'learning_rate': 1.770601549670113e-05, 'epoch': 0.73} +2025-05-10 22:55:56 - ERROR - stderr - 24%|██▍ | 911/3741 [5:30:02<16:13:49, 20.65s/it] +2025-05-10 22:56:15 - ERROR - stderr - 24%|██▍ | 912/3741 [5:30:22<15:57:06, 20.30s/it] +2025-05-10 22:56:15 - ERROR - stderr - +2025-05-10 22:56:15 - ERROR - stderr - +2025-05-10 22:56:15 - INFO - stdout - {'loss': 0.9219, 'grad_norm': 0.5602655410766602, 'learning_rate': 1.7700493889620163e-05, 'epoch': 0.73} +2025-05-10 22:56:15 - ERROR - stderr - 24%|██▍ | 912/3741 [5:30:22<15:57:06, 20.30s/it] +2025-05-10 22:56:35 - ERROR - stderr - 24%|██▍ | 913/3741 [5:30:41<15:46:07, 20.07s/it] +2025-05-10 22:56:35 - ERROR - stderr - +2025-05-10 22:56:35 - ERROR - stderr - +2025-05-10 22:56:35 - INFO - stdout - {'loss': 0.9495, 'grad_norm': 0.6553357839584351, 'learning_rate': 1.769496650844663e-05, 'epoch': 0.73} +2025-05-10 22:56:35 - ERROR - stderr - 24%|██▍ | 913/3741 [5:30:41<15:46:07, 20.07s/it] +2025-05-10 22:56:58 - ERROR - stderr - 24%|██▍ | 914/3741 [5:31:05<16:35:47, 21.13s/it] +2025-05-10 22:56:59 - ERROR - stderr - +2025-05-10 22:56:59 - ERROR - stderr - +2025-05-10 22:56:59 - INFO - stdout - {'loss': 0.8947, 'grad_norm': 0.6299150586128235, 'learning_rate': 1.768943335732515e-05, 'epoch': 0.73} +2025-05-10 22:56:59 - ERROR - stderr - 24%|██▍ | 914/3741 [5:31:05<16:35:47, 21.13s/it] +2025-05-10 22:57:18 - ERROR - stderr - 24%|██▍ | 915/3741 [5:31:24<16:14:28, 20.69s/it] +2025-05-10 22:57:18 - ERROR - stderr - +2025-05-10 22:57:18 - ERROR - stderr - +2025-05-10 22:57:18 - INFO - stdout - {'loss': 0.9501, 'grad_norm': 0.6235426664352417, 'learning_rate': 1.7683894440404663e-05, 'epoch': 0.73} +2025-05-10 22:57:18 - ERROR - stderr - 24%|██▍ | 915/3741 [5:31:24<16:14:28, 20.69s/it] +2025-05-10 22:57:41 - ERROR - stderr - 24%|██▍ | 916/3741 [5:31:47<16:46:55, 21.39s/it] +2025-05-10 22:57:41 - ERROR - stderr - +2025-05-10 22:57:41 - ERROR - stderr - +2025-05-10 22:57:41 - INFO - stdout - {'loss': 0.9679, 'grad_norm': 0.6619741916656494, 'learning_rate': 1.7678349761838438e-05, 'epoch': 0.73} +2025-05-10 22:57:41 - ERROR - stderr - 24%|██▍ | 916/3741 [5:31:47<16:46:55, 21.39s/it] +2025-05-10 22:58:01 - ERROR - stderr - 25%|██▍ | 917/3741 [5:32:07<16:26:15, 20.95s/it] +2025-05-10 22:58:01 - ERROR - stderr - +2025-05-10 22:58:01 - ERROR - stderr - +2025-05-10 22:58:01 - INFO - stdout - {'loss': 0.9105, 'grad_norm': 0.6198428869247437, 'learning_rate': 1.7672799325784066e-05, 'epoch': 0.74} +2025-05-10 22:58:01 - ERROR - stderr - 25%|██▍ | 917/3741 [5:32:07<16:26:15, 20.95s/it] +2025-05-10 22:58:22 - ERROR - stderr - 25%|██▍ | 918/3741 [5:32:29<16:31:37, 21.08s/it] +2025-05-10 22:58:22 - ERROR - stderr - +2025-05-10 22:58:22 - ERROR - stderr - +2025-05-10 22:58:22 - INFO - stdout - {'loss': 0.8585, 'grad_norm': 2.144275426864624, 'learning_rate': 1.7667243136403455e-05, 'epoch': 0.74} +2025-05-10 22:58:22 - ERROR - stderr - 25%|██▍ | 918/3741 [5:32:29<16:31:37, 21.08s/it] +2025-05-10 22:58:42 - ERROR - stderr - 25%|██▍ | 919/3741 [5:32:48<16:07:06, 20.56s/it] +2025-05-10 22:58:42 - ERROR - stderr - +2025-05-10 22:58:42 - ERROR - stderr - +2025-05-10 22:58:42 - INFO - stdout - {'loss': 0.9773, 'grad_norm': 0.6071990728378296, 'learning_rate': 1.7661681197862823e-05, 'epoch': 0.74} +2025-05-10 22:58:42 - ERROR - stderr - 25%|██▍ | 919/3741 [5:32:48<16:07:06, 20.56s/it] +2025-05-10 22:59:01 - ERROR - stderr - 25%|██▍ | 920/3741 [5:33:08<15:50:51, 20.22s/it] +2025-05-10 22:59:01 - ERROR - stderr - +2025-05-10 22:59:01 - ERROR - stderr - +2025-05-10 22:59:01 - INFO - stdout - {'loss': 0.9344, 'grad_norm': 0.5878254175186157, 'learning_rate': 1.76561135143327e-05, 'epoch': 0.74} +2025-05-10 22:59:01 - ERROR - stderr - 25%|██▍ | 920/3741 [5:33:08<15:50:51, 20.22s/it] +2025-05-10 22:59:25 - ERROR - stderr - 25%|██▍ | 921/3741 [5:33:31<16:38:58, 21.25s/it] +2025-05-10 22:59:25 - ERROR - stderr - +2025-05-10 22:59:25 - ERROR - stderr - +2025-05-10 22:59:25 - INFO - stdout - {'loss': 0.8986, 'grad_norm': 0.616235077381134, 'learning_rate': 1.7650540089987926e-05, 'epoch': 0.74} +2025-05-10 22:59:25 - ERROR - stderr - 25%|██▍ | 921/3741 [5:33:31<16:38:58, 21.25s/it] +2025-05-10 22:59:44 - ERROR - stderr - 25%|██▍ | 922/3741 [5:33:51<16:11:58, 20.69s/it] +2025-05-10 22:59:44 - ERROR - stderr - +2025-05-10 22:59:44 - ERROR - stderr - +2025-05-10 22:59:44 - INFO - stdout - {'loss': 0.9162, 'grad_norm': 0.6328748464584351, 'learning_rate': 1.7644960929007642e-05, 'epoch': 0.74} +2025-05-10 22:59:44 - ERROR - stderr - 25%|██▍ | 922/3741 [5:33:51<16:11:58, 20.69s/it] +2025-05-10 23:00:08 - ERROR - stderr - 25%|██▍ | 923/3741 [5:34:14<16:50:16, 21.51s/it] +2025-05-10 23:00:08 - ERROR - stderr - +2025-05-10 23:00:08 - ERROR - stderr - +2025-05-10 23:00:08 - INFO - stdout - {'loss': 0.9292, 'grad_norm': 0.610571563243866, 'learning_rate': 1.7639376035575296e-05, 'epoch': 0.74} +2025-05-10 23:00:08 - ERROR - stderr - 25%|██▍ | 923/3741 [5:34:14<16:50:16, 21.51s/it] +2025-05-10 23:00:27 - ERROR - stderr - 25%|██▍ | 924/3741 [5:34:33<16:18:51, 20.85s/it] +2025-05-10 23:00:27 - ERROR - stderr - +2025-05-10 23:00:27 - ERROR - stderr - +2025-05-10 23:00:27 - INFO - stdout - {'loss': 0.9503, 'grad_norm': 0.6044664978981018, 'learning_rate': 1.7633785413878634e-05, 'epoch': 0.74} +2025-05-10 23:00:27 - ERROR - stderr - 25%|██▍ | 924/3741 [5:34:33<16:18:51, 20.85s/it] +2025-05-10 23:00:50 - ERROR - stderr - 25%|██▍ | 925/3741 [5:34:56<16:45:29, 21.42s/it] +2025-05-10 23:00:50 - ERROR - stderr - +2025-05-10 23:00:50 - ERROR - stderr - +2025-05-10 23:00:50 - INFO - stdout - {'loss': 0.8839, 'grad_norm': 0.5878413319587708, 'learning_rate': 1.762818906810969e-05, 'epoch': 0.74} +2025-05-10 23:00:50 - ERROR - stderr - 25%|██▍ | 925/3741 [5:34:56<16:45:29, 21.42s/it] +2025-05-10 23:01:09 - ERROR - stderr - 25%|██▍ | 926/3741 [5:35:16<16:20:37, 20.90s/it] +2025-05-10 23:01:09 - ERROR - stderr - +2025-05-10 23:01:09 - ERROR - stderr - +2025-05-10 23:01:09 - INFO - stdout - {'loss': 0.9361, 'grad_norm': 0.99688321352005, 'learning_rate': 1.7622587002464792e-05, 'epoch': 0.74} +2025-05-10 23:01:09 - ERROR - stderr - 25%|██▍ | 926/3741 [5:35:16<16:20:37, 20.90s/it] +2025-05-10 23:01:29 - ERROR - stderr - 25%|██▍ | 927/3741 [5:35:35<15:59:09, 20.45s/it] +2025-05-10 23:01:29 - ERROR - stderr - +2025-05-10 23:01:29 - ERROR - stderr - +2025-05-10 23:01:29 - INFO - stdout - {'loss': 0.9249, 'grad_norm': 6.020138263702393, 'learning_rate': 1.7616979221144565e-05, 'epoch': 0.74} +2025-05-10 23:01:29 - ERROR - stderr - 25%|██▍ | 927/3741 [5:35:35<15:59:09, 20.45s/it] +2025-05-10 23:01:50 - ERROR - stderr - 25%|██▍ | 928/3741 [5:35:57<16:14:27, 20.78s/it] +2025-05-10 23:01:50 - ERROR - stderr - +2025-05-10 23:01:50 - ERROR - stderr - +2025-05-10 23:01:50 - INFO - stdout - {'loss': 0.8932, 'grad_norm': 0.65313321352005, 'learning_rate': 1.7611365728353907e-05, 'epoch': 0.74} +2025-05-10 23:01:50 - ERROR - stderr - 25%|██▍ | 928/3741 [5:35:57<16:14:27, 20.78s/it] +2025-05-10 23:02:10 - ERROR - stderr - 25%|██▍ | 929/3741 [5:36:16<15:54:11, 20.36s/it] +2025-05-10 23:02:10 - ERROR - stderr - +2025-05-10 23:02:10 - ERROR - stderr - +2025-05-10 23:02:10 - INFO - stdout - {'loss': 0.9224, 'grad_norm': 0.6319687962532043, 'learning_rate': 1.7605746528302017e-05, 'epoch': 0.74} +2025-05-10 23:02:10 - ERROR - stderr - 25%|██▍ | 929/3741 [5:36:16<15:54:11, 20.36s/it] +2025-05-10 23:02:33 - ERROR - stderr - 25%|██▍ | 930/3741 [5:36:39<16:30:00, 21.13s/it] +2025-05-10 23:02:33 - ERROR - stderr - +2025-05-10 23:02:33 - ERROR - stderr - +2025-05-10 23:02:33 - INFO - stdout - {'loss': 0.9589, 'grad_norm': 0.6352254152297974, 'learning_rate': 1.760012162520236e-05, 'epoch': 0.75} +2025-05-10 23:02:33 - ERROR - stderr - 25%|██▍ | 930/3741 [5:36:39<16:30:00, 21.13s/it] +2025-05-10 23:02:52 - ERROR - stderr - 25%|██▍ | 931/3741 [5:36:59<16:07:01, 20.65s/it] +2025-05-10 23:02:52 - ERROR - stderr - +2025-05-10 23:02:52 - ERROR - stderr - +2025-05-10 23:02:52 - INFO - stdout - {'loss': 0.9495, 'grad_norm': 0.6238382458686829, 'learning_rate': 1.759449102327267e-05, 'epoch': 0.75} +2025-05-10 23:02:52 - ERROR - stderr - 25%|██▍ | 931/3741 [5:36:59<16:07:01, 20.65s/it] +2025-05-10 23:03:15 - ERROR - stderr - 25%|██▍ | 932/3741 [5:37:21<16:29:37, 21.14s/it] +2025-05-10 23:03:15 - ERROR - stderr - +2025-05-10 23:03:15 - ERROR - stderr - +2025-05-10 23:03:15 - INFO - stdout - {'loss': 0.9395, 'grad_norm': 0.6095400452613831, 'learning_rate': 1.7588854726734974e-05, 'epoch': 0.75} +2025-05-10 23:03:15 - ERROR - stderr - 25%|██▍ | 932/3741 [5:37:21<16:29:37, 21.14s/it] +2025-05-10 23:03:34 - ERROR - stderr - 25%|██▍ | 933/3741 [5:37:40<16:08:09, 20.69s/it] +2025-05-10 23:03:34 - ERROR - stderr - +2025-05-10 23:03:34 - ERROR - stderr - +2025-05-10 23:03:34 - INFO - stdout - {'loss': 0.9041, 'grad_norm': 0.5706982016563416, 'learning_rate': 1.7583212739815555e-05, 'epoch': 0.75} +2025-05-10 23:03:34 - ERROR - stderr - 25%|██▍ | 933/3741 [5:37:41<16:08:09, 20.69s/it] +2025-05-10 23:03:54 - ERROR - stderr - 25%|██▍ | 934/3741 [5:38:00<15:50:44, 20.32s/it] +2025-05-10 23:03:54 - ERROR - stderr - +2025-05-10 23:03:54 - ERROR - stderr - +2025-05-10 23:03:54 - INFO - stdout - {'loss': 0.8945, 'grad_norm': 0.5789833664894104, 'learning_rate': 1.757756506674497e-05, 'epoch': 0.75} +2025-05-10 23:03:54 - ERROR - stderr - 25%|██▍ | 934/3741 [5:38:00<15:50:44, 20.32s/it] +2025-05-10 23:04:17 - ERROR - stderr - 25%|██▍ | 935/3741 [5:38:23<16:33:32, 21.24s/it] +2025-05-10 23:04:17 - ERROR - stderr - +2025-05-10 23:04:17 - ERROR - stderr - +2025-05-10 23:04:17 - INFO - stdout - {'loss': 0.9189, 'grad_norm': 0.5853317975997925, 'learning_rate': 1.7571911711758032e-05, 'epoch': 0.75} +2025-05-10 23:04:17 - ERROR - stderr - 25%|██▍ | 935/3741 [5:38:23<16:33:32, 21.24s/it] +2025-05-10 23:04:37 - ERROR - stderr - 25%|██▌ | 936/3741 [5:38:43<16:12:55, 20.81s/it] +2025-05-10 23:04:37 - ERROR - stderr - +2025-05-10 23:04:37 - ERROR - stderr - +2025-05-10 23:04:37 - INFO - stdout - {'loss': 0.9125, 'grad_norm': 0.6032062768936157, 'learning_rate': 1.7566252679093826e-05, 'epoch': 0.75} +2025-05-10 23:04:37 - ERROR - stderr - 25%|██▌ | 936/3741 [5:38:43<16:12:55, 20.81s/it] +2025-05-10 23:04:59 - ERROR - stderr - 25%|██▌ | 937/3741 [5:39:05<16:30:44, 21.20s/it] +2025-05-10 23:04:59 - ERROR - stderr - +2025-05-10 23:04:59 - ERROR - stderr - +2025-05-10 23:04:59 - INFO - stdout - {'loss': 0.9299, 'grad_norm': 0.6213047504425049, 'learning_rate': 1.7560587972995678e-05, 'epoch': 0.75} +2025-05-10 23:04:59 - ERROR - stderr - 25%|██▌ | 937/3741 [5:39:05<16:30:44, 21.20s/it] +2025-05-10 23:05:18 - ERROR - stderr - 25%|██▌ | 938/3741 [5:39:25<16:06:27, 20.69s/it] +2025-05-10 23:05:18 - ERROR - stderr - +2025-05-10 23:05:18 - ERROR - stderr - +2025-05-10 23:05:18 - INFO - stdout - {'loss': 0.9627, 'grad_norm': 0.68639075756073, 'learning_rate': 1.7554917597711188e-05, 'epoch': 0.75} +2025-05-10 23:05:18 - ERROR - stderr - 25%|██▌ | 938/3741 [5:39:25<16:06:27, 20.69s/it] +2025-05-10 23:05:39 - ERROR - stderr - 25%|██▌ | 939/3741 [5:39:46<16:09:34, 20.76s/it] +2025-05-10 23:05:39 - ERROR - stderr - +2025-05-10 23:05:39 - ERROR - stderr - +2025-05-10 23:05:39 - INFO - stdout - {'loss': 0.9761, 'grad_norm': 0.5955672264099121, 'learning_rate': 1.7549241557492187e-05, 'epoch': 0.75} +2025-05-10 23:05:39 - ERROR - stderr - 25%|██▌ | 939/3741 [5:39:46<16:09:34, 20.76s/it] +2025-05-10 23:05:59 - ERROR - stderr - 25%|██▌ | 940/3741 [5:40:06<15:59:52, 20.56s/it] +2025-05-10 23:05:59 - ERROR - stderr - +2025-05-10 23:05:59 - ERROR - stderr - +2025-05-10 23:05:59 - INFO - stdout - {'loss': 0.9432, 'grad_norm': 0.6289668679237366, 'learning_rate': 1.754355985659477e-05, 'epoch': 0.75} +2025-05-10 23:05:59 - ERROR - stderr - 25%|██▌ | 940/3741 [5:40:06<15:59:52, 20.56s/it] +2025-05-10 23:06:19 - ERROR - stderr - 25%|██▌ | 941/3741 [5:40:25<15:41:31, 20.18s/it] +2025-05-10 23:06:19 - ERROR - stderr - +2025-05-10 23:06:19 - ERROR - stderr - +2025-05-10 23:06:19 - INFO - stdout - {'loss': 0.9221, 'grad_norm': 0.621410071849823, 'learning_rate': 1.7537872499279265e-05, 'epoch': 0.75} +2025-05-10 23:06:19 - ERROR - stderr - 25%|██▌ | 941/3741 [5:40:25<15:41:31, 20.18s/it] +2025-05-10 23:06:41 - ERROR - stderr - 25%|██▌ | 942/3741 [5:40:47<16:08:30, 20.76s/it] +2025-05-10 23:06:41 - ERROR - stderr - +2025-05-10 23:06:41 - ERROR - stderr - +2025-05-10 23:06:41 - INFO - stdout - {'loss': 0.9208, 'grad_norm': 1.4276875257492065, 'learning_rate': 1.753217948981025e-05, 'epoch': 0.76} +2025-05-10 23:06:41 - ERROR - stderr - 25%|██▌ | 942/3741 [5:40:47<16:08:30, 20.76s/it] +2025-05-10 23:07:01 - ERROR - stderr - 25%|██▌ | 943/3741 [5:41:07<15:55:56, 20.50s/it] +2025-05-10 23:07:01 - ERROR - stderr - +2025-05-10 23:07:01 - ERROR - stderr - +2025-05-10 23:07:01 - INFO - stdout - {'loss': 0.9107, 'grad_norm': 0.6688864231109619, 'learning_rate': 1.7526480832456538e-05, 'epoch': 0.76} +2025-05-10 23:07:01 - ERROR - stderr - 25%|██▌ | 943/3741 [5:41:07<15:55:56, 20.50s/it] +2025-05-10 23:07:23 - ERROR - stderr - 25%|██▌ | 944/3741 [5:41:30<16:24:53, 21.13s/it] +2025-05-10 23:07:23 - ERROR - stderr - +2025-05-10 23:07:23 - ERROR - stderr - +2025-05-10 23:07:23 - INFO - stdout - {'loss': 0.9473, 'grad_norm': 0.6159544587135315, 'learning_rate': 1.752077653149117e-05, 'epoch': 0.76} +2025-05-10 23:07:23 - ERROR - stderr - 25%|██▌ | 944/3741 [5:41:30<16:24:53, 21.13s/it] +2025-05-10 23:07:43 - ERROR - stderr - 25%|██▌ | 945/3741 [5:41:49<15:59:20, 20.59s/it] +2025-05-10 23:07:43 - ERROR - stderr - +2025-05-10 23:07:43 - ERROR - stderr - +2025-05-10 23:07:43 - INFO - stdout - {'loss': 0.908, 'grad_norm': 0.6279981732368469, 'learning_rate': 1.751506659119143e-05, 'epoch': 0.76} +2025-05-10 23:07:43 - ERROR - stderr - 25%|██▌ | 945/3741 [5:41:49<15:59:20, 20.59s/it] +2025-05-10 23:08:06 - ERROR - stderr - 25%|██▌ | 946/3741 [5:42:12<16:30:58, 21.27s/it] +2025-05-10 23:08:06 - ERROR - stderr - +2025-05-10 23:08:06 - ERROR - stderr - +2025-05-10 23:08:06 - INFO - stdout - {'loss': 0.9023, 'grad_norm': 0.6103554964065552, 'learning_rate': 1.750935101583883e-05, 'epoch': 0.76} +2025-05-10 23:08:06 - ERROR - stderr - 25%|██▌ | 946/3741 [5:42:12<16:30:58, 21.27s/it] +2025-05-10 23:08:25 - ERROR - stderr - 25%|██▌ | 947/3741 [5:42:31<16:05:21, 20.73s/it] +2025-05-10 23:08:25 - ERROR - stderr - +2025-05-10 23:08:25 - ERROR - stderr - +2025-05-10 23:08:25 - INFO - stdout - {'loss': 0.9256, 'grad_norm': 0.6202152371406555, 'learning_rate': 1.7503629809719095e-05, 'epoch': 0.76} +2025-05-10 23:08:25 - ERROR - stderr - 25%|██▌ | 947/3741 [5:42:31<16:05:21, 20.73s/it] +2025-05-10 23:08:45 - ERROR - stderr - 25%|██▌ | 948/3741 [5:42:51<15:49:38, 20.40s/it] +2025-05-10 23:08:45 - ERROR - stderr - +2025-05-10 23:08:45 - ERROR - stderr - +2025-05-10 23:08:45 - INFO - stdout - {'loss': 0.93, 'grad_norm': 0.7202157378196716, 'learning_rate': 1.749790297712218e-05, 'epoch': 0.76} +2025-05-10 23:08:45 - ERROR - stderr - 25%|██▌ | 948/3741 [5:42:51<15:49:38, 20.40s/it] +2025-05-10 23:09:04 - ERROR - stderr - 25%|██▌ | 949/3741 [5:43:10<15:36:24, 20.12s/it] +2025-05-10 23:09:04 - ERROR - stderr - +2025-05-10 23:09:04 - ERROR - stderr - +2025-05-10 23:09:04 - INFO - stdout - {'loss': 0.9029, 'grad_norm': 0.6290937066078186, 'learning_rate': 1.7492170522342267e-05, 'epoch': 0.76} +2025-05-10 23:09:04 - ERROR - stderr - 25%|██▌ | 949/3741 [5:43:10<15:36:24, 20.12s/it] +2025-05-10 23:09:23 - ERROR - stderr - 25%|██▌ | 950/3741 [5:43:30<15:25:07, 19.89s/it] +2025-05-10 23:09:23 - ERROR - stderr - +2025-05-10 23:09:23 - ERROR - stderr - +2025-05-10 23:09:23 - INFO - stdout - {'loss': 0.9173, 'grad_norm': 0.6004471778869629, 'learning_rate': 1.748643244967774e-05, 'epoch': 0.76} +2025-05-10 23:09:23 - ERROR - stderr - 25%|██▌ | 950/3741 [5:43:30<15:25:07, 19.89s/it] +2025-05-10 23:09:46 - ERROR - stderr - 25%|██▌ | 951/3741 [5:43:52<16:00:23, 20.65s/it] +2025-05-10 23:09:46 - ERROR - stderr - +2025-05-10 23:09:46 - ERROR - stderr - +2025-05-10 23:09:46 - INFO - stdout - {'loss': 0.9121, 'grad_norm': 0.6735373139381409, 'learning_rate': 1.7480688763431203e-05, 'epoch': 0.76} +2025-05-10 23:09:46 - ERROR - stderr - 25%|██▌ | 951/3741 [5:43:52<16:00:23, 20.65s/it] +2025-05-10 23:10:06 - ERROR - stderr - 25%|██▌ | 952/3741 [5:44:12<15:46:28, 20.36s/it] +2025-05-10 23:10:06 - ERROR - stderr - +2025-05-10 23:10:06 - ERROR - stderr - +2025-05-10 23:10:06 - INFO - stdout - {'loss': 0.9696, 'grad_norm': 0.6806999444961548, 'learning_rate': 1.7474939467909468e-05, 'epoch': 0.76} +2025-05-10 23:10:06 - ERROR - stderr - 25%|██▌ | 952/3741 [5:44:12<15:46:28, 20.36s/it] +2025-05-10 23:10:29 - ERROR - stderr - 25%|██▌ | 953/3741 [5:44:36<16:34:29, 21.40s/it] +2025-05-10 23:10:29 - ERROR - stderr - +2025-05-10 23:10:29 - ERROR - stderr - +2025-05-10 23:10:29 - INFO - stdout - {'loss': 0.8985, 'grad_norm': 0.6270620822906494, 'learning_rate': 1.7469184567423548e-05, 'epoch': 0.76} +2025-05-10 23:10:29 - ERROR - stderr - 25%|██▌ | 953/3741 [5:44:36<16:34:29, 21.40s/it] +2025-05-10 23:10:49 - ERROR - stderr - 26%|██▌ | 954/3741 [5:44:55<16:09:22, 20.87s/it] +2025-05-10 23:10:49 - ERROR - stderr - +2025-05-10 23:10:49 - ERROR - stderr - +2025-05-10 23:10:49 - INFO - stdout - {'loss': 0.9334, 'grad_norm': 0.660423994064331, 'learning_rate': 1.7463424066288668e-05, 'epoch': 0.77} +2025-05-10 23:10:49 - ERROR - stderr - 26%|██▌ | 954/3741 [5:44:55<16:09:22, 20.87s/it] +2025-05-10 23:11:12 - ERROR - stderr - 26%|██▌ | 955/3741 [5:45:18<16:39:47, 21.53s/it] +2025-05-10 23:11:12 - ERROR - stderr - +2025-05-10 23:11:12 - ERROR - stderr - +2025-05-10 23:11:12 - INFO - stdout - {'loss': 0.9471, 'grad_norm': 0.6471710205078125, 'learning_rate': 1.745765796882425e-05, 'epoch': 0.77} +2025-05-10 23:11:12 - ERROR - stderr - 26%|██▌ | 955/3741 [5:45:18<16:39:47, 21.53s/it] +2025-05-10 23:11:32 - ERROR - stderr - 26%|██▌ | 956/3741 [5:45:38<16:13:31, 20.97s/it] +2025-05-10 23:11:32 - ERROR - stderr - +2025-05-10 23:11:32 - ERROR - stderr - +2025-05-10 23:11:32 - INFO - stdout - {'loss': 0.8939, 'grad_norm': 0.5963034629821777, 'learning_rate': 1.7451886279353905e-05, 'epoch': 0.77} +2025-05-10 23:11:32 - ERROR - stderr - 26%|██▌ | 956/3741 [5:45:38<16:13:31, 20.97s/it] +2025-05-10 23:11:52 - ERROR - stderr - 26%|██▌ | 957/3741 [5:45:58<15:57:07, 20.63s/it] +2025-05-10 23:11:52 - ERROR - stderr - +2025-05-10 23:11:52 - ERROR - stderr - +2025-05-10 23:11:52 - INFO - stdout - {'loss': 0.9114, 'grad_norm': 0.6383598446846008, 'learning_rate': 1.7446109002205444e-05, 'epoch': 0.77} +2025-05-10 23:11:52 - ERROR - stderr - 26%|██▌ | 957/3741 [5:45:58<15:57:07, 20.63s/it] +2025-05-10 23:12:12 - ERROR - stderr - 26%|██▌ | 958/3741 [5:46:18<15:47:17, 20.42s/it] +2025-05-10 23:12:12 - ERROR - stderr - +2025-05-10 23:12:12 - ERROR - stderr - +2025-05-10 23:12:12 - INFO - stdout - {'loss': 0.9375, 'grad_norm': 0.6523898839950562, 'learning_rate': 1.744032614171087e-05, 'epoch': 0.77} +2025-05-10 23:12:12 - ERROR - stderr - 26%|██▌ | 958/3741 [5:46:18<15:47:17, 20.42s/it] +2025-05-10 23:12:31 - ERROR - stderr - 26%|██▌ | 959/3741 [5:46:37<15:30:21, 20.07s/it] +2025-05-10 23:12:31 - ERROR - stderr - +2025-05-10 23:12:31 - ERROR - stderr - +2025-05-10 23:12:31 - INFO - stdout - {'loss': 0.9317, 'grad_norm': 0.6452939510345459, 'learning_rate': 1.743453770220636e-05, 'epoch': 0.77} +2025-05-10 23:12:31 - ERROR - stderr - 26%|██▌ | 959/3741 [5:46:37<15:30:21, 20.07s/it] +2025-05-10 23:12:51 - ERROR - stderr - 26%|██▌ | 960/3741 [5:46:57<15:29:19, 20.05s/it] +2025-05-10 23:12:51 - ERROR - stderr - +2025-05-10 23:12:51 - ERROR - stderr - +2025-05-10 23:12:51 - INFO - stdout - {'loss': 0.9467, 'grad_norm': 0.6215782165527344, 'learning_rate': 1.7428743688032292e-05, 'epoch': 0.77} +2025-05-10 23:12:51 - ERROR - stderr - 26%|██▌ | 960/3741 [5:46:57<15:29:19, 20.05s/it] +2025-05-10 23:13:10 - ERROR - stderr - 26%|██▌ | 961/3741 [5:47:17<15:22:53, 19.92s/it] +2025-05-10 23:13:10 - ERROR - stderr - +2025-05-10 23:13:10 - ERROR - stderr - +2025-05-10 23:13:10 - INFO - stdout - {'loss': 0.9916, 'grad_norm': 0.6118282675743103, 'learning_rate': 1.7422944103533212e-05, 'epoch': 0.77} +2025-05-10 23:13:10 - ERROR - stderr - 26%|██▌ | 961/3741 [5:47:17<15:22:53, 19.92s/it] +2025-05-10 23:13:33 - ERROR - stderr - 26%|██▌ | 962/3741 [5:47:40<16:02:57, 20.79s/it] +2025-05-10 23:13:33 - ERROR - stderr - +2025-05-10 23:13:33 - ERROR - stderr - +2025-05-10 23:13:33 - INFO - stdout - {'loss': 0.9415, 'grad_norm': 0.6718006730079651, 'learning_rate': 1.7417138953057847e-05, 'epoch': 0.77} +2025-05-10 23:13:33 - ERROR - stderr - 26%|██▌ | 962/3741 [5:47:40<16:02:57, 20.79s/it] +2025-05-10 23:13:53 - ERROR - stderr - 26%|██▌ | 963/3741 [5:47:59<15:46:58, 20.45s/it] +2025-05-10 23:13:53 - ERROR - stderr - +2025-05-10 23:13:53 - ERROR - stderr - +2025-05-10 23:13:53 - INFO - stdout - {'loss': 0.9109, 'grad_norm': 0.6651148200035095, 'learning_rate': 1.7411328240959095e-05, 'epoch': 0.77} +2025-05-10 23:13:53 - ERROR - stderr - 26%|██▌ | 963/3741 [5:47:59<15:46:58, 20.45s/it] +2025-05-10 23:14:16 - ERROR - stderr - 26%|██▌ | 964/3741 [5:48:22<16:23:00, 21.24s/it] +2025-05-10 23:14:16 - ERROR - stderr - +2025-05-10 23:14:16 - ERROR - stderr - +2025-05-10 23:14:16 - INFO - stdout - {'loss': 0.9311, 'grad_norm': 0.6115936636924744, 'learning_rate': 1.7405511971594022e-05, 'epoch': 0.77} +2025-05-10 23:14:16 - ERROR - stderr - 26%|██▌ | 964/3741 [5:48:22<16:23:00, 21.24s/it] +2025-05-10 23:14:36 - ERROR - stderr - 26%|██▌ | 965/3741 [5:48:42<16:00:20, 20.76s/it] +2025-05-10 23:14:36 - ERROR - stderr - +2025-05-10 23:14:36 - ERROR - stderr - +2025-05-10 23:14:36 - INFO - stdout - {'loss': 0.8825, 'grad_norm': 0.6545907855033875, 'learning_rate': 1.739969014932387e-05, 'epoch': 0.77} +2025-05-10 23:14:36 - ERROR - stderr - 26%|██▌ | 965/3741 [5:48:42<16:00:20, 20.76s/it] +2025-05-10 23:14:59 - ERROR - stderr - 26%|██▌ | 966/3741 [5:49:05<16:33:26, 21.48s/it] +2025-05-10 23:14:59 - ERROR - stderr - +2025-05-10 23:14:59 - ERROR - stderr - +2025-05-10 23:14:59 - INFO - stdout - {'loss': 0.9522, 'grad_norm': 0.6140168905258179, 'learning_rate': 1.7393862778514042e-05, 'epoch': 0.77} +2025-05-10 23:14:59 - ERROR - stderr - 26%|██▌ | 966/3741 [5:49:05<16:33:26, 21.48s/it] +2025-05-10 23:15:18 - ERROR - stderr - 26%|██▌ | 967/3741 [5:49:25<16:05:18, 20.88s/it] +2025-05-10 23:15:18 - ERROR - stderr - +2025-05-10 23:15:18 - ERROR - stderr - +2025-05-10 23:15:18 - INFO - stdout - {'loss': 0.8981, 'grad_norm': 0.5883581638336182, 'learning_rate': 1.738802986353409e-05, 'epoch': 0.78} +2025-05-10 23:15:18 - ERROR - stderr - 26%|██▌ | 967/3741 [5:49:25<16:05:18, 20.88s/it] +2025-05-10 23:15:38 - ERROR - stderr - 26%|██▌ | 968/3741 [5:49:44<15:47:43, 20.51s/it] +2025-05-10 23:15:38 - ERROR - stderr - +2025-05-10 23:15:38 - ERROR - stderr - +2025-05-10 23:15:38 - INFO - stdout - {'loss': 0.9418, 'grad_norm': 0.6271137595176697, 'learning_rate': 1.7382191408757744e-05, 'epoch': 0.78} +2025-05-10 23:15:38 - ERROR - stderr - 26%|██▌ | 968/3741 [5:49:44<15:47:43, 20.51s/it] +2025-05-10 23:15:57 - ERROR - stderr - 26%|██▌ | 969/3741 [5:50:04<15:34:03, 20.22s/it] +2025-05-10 23:15:57 - ERROR - stderr - +2025-05-10 23:15:57 - ERROR - stderr - +2025-05-10 23:15:57 - INFO - stdout - {'loss': 0.894, 'grad_norm': 0.6357349157333374, 'learning_rate': 1.7376347418562866e-05, 'epoch': 0.78} +2025-05-10 23:15:57 - ERROR - stderr - 26%|██▌ | 969/3741 [5:50:04<15:34:03, 20.22s/it] +2025-05-10 23:16:17 - ERROR - stderr - 26%|██▌ | 970/3741 [5:50:23<15:21:54, 19.96s/it] +2025-05-10 23:16:17 - ERROR - stderr - +2025-05-10 23:16:17 - ERROR - stderr - +2025-05-10 23:16:17 - INFO - stdout - {'loss': 0.9197, 'grad_norm': 0.6148669123649597, 'learning_rate': 1.7370497897331486e-05, 'epoch': 0.78} +2025-05-10 23:16:17 - ERROR - stderr - 26%|██▌ | 970/3741 [5:50:23<15:21:54, 19.96s/it] +2025-05-10 23:16:40 - ERROR - stderr - 26%|██▌ | 971/3741 [5:50:46<16:02:25, 20.85s/it] +2025-05-10 23:16:40 - ERROR - stderr - +2025-05-10 23:16:40 - ERROR - stderr - +2025-05-10 23:16:40 - INFO - stdout - {'loss': 0.952, 'grad_norm': 0.589157223701477, 'learning_rate': 1.7364642849449767e-05, 'epoch': 0.78} +2025-05-10 23:16:40 - ERROR - stderr - 26%|██▌ | 971/3741 [5:50:46<16:02:25, 20.85s/it] +2025-05-10 23:16:59 - ERROR - stderr - 26%|██▌ | 972/3741 [5:51:05<15:42:29, 20.42s/it] +2025-05-10 23:16:59 - ERROR - stderr - +2025-05-10 23:16:59 - ERROR - stderr - +2025-05-10 23:16:59 - INFO - stdout - {'loss': 0.9722, 'grad_norm': 0.6091080904006958, 'learning_rate': 1.735878227930803e-05, 'epoch': 0.78} +2025-05-10 23:16:59 - ERROR - stderr - 26%|██▌ | 972/3741 [5:51:05<15:42:29, 20.42s/it] +2025-05-10 23:17:23 - ERROR - stderr - 26%|██▌ | 973/3741 [5:51:29<16:27:05, 21.40s/it] +2025-05-10 23:17:23 - ERROR - stderr - +2025-05-10 23:17:23 - ERROR - stderr - +2025-05-10 23:17:23 - INFO - stdout - {'loss': 0.924, 'grad_norm': 0.6389529705047607, 'learning_rate': 1.735291619130073e-05, 'epoch': 0.78} +2025-05-10 23:17:23 - ERROR - stderr - 26%|██▌ | 973/3741 [5:51:29<16:27:05, 21.40s/it] +2025-05-10 23:17:42 - ERROR - stderr - 26%|██▌ | 974/3741 [5:51:49<16:00:18, 20.82s/it] +2025-05-10 23:17:42 - ERROR - stderr - +2025-05-10 23:17:42 - ERROR - stderr - +2025-05-10 23:17:42 - INFO - stdout - {'loss': 0.9491, 'grad_norm': 0.6408534049987793, 'learning_rate': 1.7347044589826455e-05, 'epoch': 0.78} +2025-05-10 23:17:42 - ERROR - stderr - 26%|██▌ | 974/3741 [5:51:49<16:00:18, 20.82s/it] +2025-05-10 23:18:04 - ERROR - stderr - 26%|██▌ | 975/3741 [5:52:10<16:11:28, 21.07s/it] +2025-05-10 23:18:04 - ERROR - stderr - +2025-05-10 23:18:04 - ERROR - stderr - +2025-05-10 23:18:04 - INFO - stdout - {'loss': 0.9298, 'grad_norm': 0.5990006327629089, 'learning_rate': 1.7341167479287934e-05, 'epoch': 0.78} +2025-05-10 23:18:04 - ERROR - stderr - 26%|██▌ | 975/3741 [5:52:10<16:11:28, 21.07s/it] +2025-05-10 23:18:24 - ERROR - stderr - 26%|██▌ | 976/3741 [5:52:30<15:53:20, 20.69s/it] +2025-05-10 23:18:24 - ERROR - stderr - +2025-05-10 23:18:24 - ERROR - stderr - +2025-05-10 23:18:24 - INFO - stdout - {'loss': 0.8903, 'grad_norm': 0.5884609222412109, 'learning_rate': 1.7335284864092024e-05, 'epoch': 0.78} +2025-05-10 23:18:24 - ERROR - stderr - 26%|██▌ | 976/3741 [5:52:30<15:53:20, 20.69s/it] +2025-05-10 23:18:43 - ERROR - stderr - 26%|██▌ | 977/3741 [5:52:49<15:35:46, 20.31s/it] +2025-05-10 23:18:43 - ERROR - stderr - +2025-05-10 23:18:43 - ERROR - stderr - +2025-05-10 23:18:43 - INFO - stdout - {'loss': 0.891, 'grad_norm': 0.5926967859268188, 'learning_rate': 1.732939674864971e-05, 'epoch': 0.78} +2025-05-10 23:18:43 - ERROR - stderr - 26%|██▌ | 977/3741 [5:52:50<15:35:46, 20.31s/it] +2025-05-10 23:19:03 - ERROR - stderr - 26%|██▌ | 978/3741 [5:53:10<15:35:17, 20.31s/it] +2025-05-10 23:19:03 - ERROR - stderr - +2025-05-10 23:19:03 - ERROR - stderr - +2025-05-10 23:19:03 - INFO - stdout - {'loss': 0.968, 'grad_norm': 0.697799801826477, 'learning_rate': 1.7323503137376102e-05, 'epoch': 0.78} +2025-05-10 23:19:03 - ERROR - stderr - 26%|██▌ | 978/3741 [5:53:10<15:35:17, 20.31s/it] +2025-05-10 23:19:23 - ERROR - stderr - 26%|██▌ | 979/3741 [5:53:29<15:23:35, 20.06s/it] +2025-05-10 23:19:23 - ERROR - stderr - +2025-05-10 23:19:23 - ERROR - stderr - +2025-05-10 23:19:23 - INFO - stdout - {'loss': 0.9672, 'grad_norm': 0.6278639435768127, 'learning_rate': 1.7317604034690434e-05, 'epoch': 0.79} +2025-05-10 23:19:23 - ERROR - stderr - 26%|██▌ | 979/3741 [5:53:29<15:23:35, 20.06s/it] +2025-05-10 23:19:46 - ERROR - stderr - 26%|██▌ | 980/3741 [5:53:52<16:03:23, 20.94s/it] +2025-05-10 23:19:46 - ERROR - stderr - +2025-05-10 23:19:46 - ERROR - stderr - +2025-05-10 23:19:46 - INFO - stdout - {'loss': 0.8997, 'grad_norm': 0.6392386555671692, 'learning_rate': 1.7311699445016046e-05, 'epoch': 0.79} +2025-05-10 23:19:46 - ERROR - stderr - 26%|██▌ | 980/3741 [5:53:52<16:03:23, 20.94s/it] +2025-05-10 23:20:05 - ERROR - stderr - 26%|██▌ | 981/3741 [5:54:12<15:40:12, 20.44s/it] +2025-05-10 23:20:05 - ERROR - stderr - +2025-05-10 23:20:05 - ERROR - stderr - +2025-05-10 23:20:05 - INFO - stdout - {'loss': 0.9535, 'grad_norm': 0.5999894738197327, 'learning_rate': 1.730578937278041e-05, 'epoch': 0.79} +2025-05-10 23:20:05 - ERROR - stderr - 26%|██▌ | 981/3741 [5:54:12<15:40:12, 20.44s/it] +2025-05-10 23:20:28 - ERROR - stderr - 26%|██▌ | 982/3741 [5:54:34<16:12:34, 21.15s/it] +2025-05-10 23:20:28 - ERROR - stderr - +2025-05-10 23:20:28 - ERROR - stderr - +2025-05-10 23:20:28 - INFO - stdout - {'loss': 0.892, 'grad_norm': 0.6001031398773193, 'learning_rate': 1.7299873822415093e-05, 'epoch': 0.79} +2025-05-10 23:20:28 - ERROR - stderr - 26%|██▌ | 982/3741 [5:54:34<16:12:34, 21.15s/it] +2025-05-10 23:20:48 - ERROR - stderr - 26%|██▋ | 983/3741 [5:54:54<15:50:47, 20.68s/it] +2025-05-10 23:20:48 - ERROR - stderr - +2025-05-10 23:20:48 - ERROR - stderr - +2025-05-10 23:20:48 - INFO - stdout - {'loss': 0.8658, 'grad_norm': 0.6019670963287354, 'learning_rate': 1.7293952798355776e-05, 'epoch': 0.79} +2025-05-10 23:20:48 - ERROR - stderr - 26%|██▋ | 983/3741 [5:54:54<15:50:47, 20.68s/it] +2025-05-10 23:21:07 - ERROR - stderr - 26%|██▋ | 984/3741 [5:55:14<15:38:02, 20.41s/it] +2025-05-10 23:21:07 - ERROR - stderr - +2025-05-10 23:21:07 - ERROR - stderr - +2025-05-10 23:21:07 - INFO - stdout - {'loss': 0.9091, 'grad_norm': 0.6335600018501282, 'learning_rate': 1.728802630504225e-05, 'epoch': 0.79} +2025-05-10 23:21:07 - ERROR - stderr - 26%|██▋ | 984/3741 [5:55:14<15:38:02, 20.41s/it] +2025-05-10 23:21:27 - ERROR - stderr - 26%|██▋ | 985/3741 [5:55:33<15:22:38, 20.09s/it] +2025-05-10 23:21:27 - ERROR - stderr - +2025-05-10 23:21:27 - ERROR - stderr - +2025-05-10 23:21:27 - INFO - stdout - {'loss': 0.9317, 'grad_norm': 0.5757085680961609, 'learning_rate': 1.7282094346918395e-05, 'epoch': 0.79} +2025-05-10 23:21:27 - ERROR - stderr - 26%|██▋ | 985/3741 [5:55:33<15:22:38, 20.09s/it] +2025-05-10 23:21:46 - ERROR - stderr - 26%|██▋ | 986/3741 [5:55:52<15:11:08, 19.84s/it] +2025-05-10 23:21:46 - ERROR - stderr - +2025-05-10 23:21:46 - ERROR - stderr - +2025-05-10 23:21:46 - INFO - stdout - {'loss': 0.9256, 'grad_norm': 0.6094053387641907, 'learning_rate': 1.72761569284322e-05, 'epoch': 0.79} +2025-05-10 23:21:46 - ERROR - stderr - 26%|██▋ | 986/3741 [5:55:52<15:11:08, 19.84s/it] +2025-05-10 23:22:09 - ERROR - stderr - 26%|██▋ | 987/3741 [5:56:15<15:54:24, 20.79s/it] +2025-05-10 23:22:09 - ERROR - stderr - +2025-05-10 23:22:09 - ERROR - stderr - +2025-05-10 23:22:09 - INFO - stdout - {'loss': 0.9395, 'grad_norm': 0.6195594668388367, 'learning_rate': 1.7270214054035736e-05, 'epoch': 0.79} +2025-05-10 23:22:09 - ERROR - stderr - 26%|██▋ | 987/3741 [5:56:15<15:54:24, 20.79s/it] +2025-05-10 23:22:29 - ERROR - stderr - 26%|██▋ | 988/3741 [5:56:35<15:36:25, 20.41s/it] +2025-05-10 23:22:29 - ERROR - stderr - +2025-05-10 23:22:29 - ERROR - stderr - +2025-05-10 23:22:29 - INFO - stdout - {'loss': 0.8758, 'grad_norm': 0.6129851937294006, 'learning_rate': 1.7264265728185186e-05, 'epoch': 0.79} +2025-05-10 23:22:29 - ERROR - stderr - 26%|██▋ | 988/3741 [5:56:35<15:36:25, 20.41s/it] +2025-05-10 23:22:51 - ERROR - stderr - 26%|██▋ | 989/3741 [5:56:57<16:06:20, 21.07s/it] +2025-05-10 23:22:51 - ERROR - stderr - +2025-05-10 23:22:51 - ERROR - stderr - +2025-05-10 23:22:51 - INFO - stdout - {'loss': 0.9307, 'grad_norm': 0.6170161962509155, 'learning_rate': 1.7258311955340794e-05, 'epoch': 0.79} +2025-05-10 23:22:51 - ERROR - stderr - 26%|██▋ | 989/3741 [5:56:57<16:06:20, 21.07s/it] +2025-05-10 23:23:10 - ERROR - stderr - 26%|██▋ | 990/3741 [5:57:17<15:42:29, 20.56s/it] +2025-05-10 23:23:11 - ERROR - stderr - +2025-05-10 23:23:11 - ERROR - stderr - +2025-05-10 23:23:11 - INFO - stdout - {'loss': 0.9174, 'grad_norm': 0.6150994300842285, 'learning_rate': 1.725235273996691e-05, 'epoch': 0.79} +2025-05-10 23:23:11 - ERROR - stderr - 26%|██▋ | 990/3741 [5:57:17<15:42:29, 20.56s/it] +2025-05-10 23:23:30 - ERROR - stderr - 26%|██▋ | 991/3741 [5:57:36<15:28:47, 20.26s/it] +2025-05-10 23:23:30 - ERROR - stderr - +2025-05-10 23:23:30 - ERROR - stderr - +2025-05-10 23:23:30 - INFO - stdout - {'loss': 0.9244, 'grad_norm': 0.6161699891090393, 'learning_rate': 1.7246388086531953e-05, 'epoch': 0.79} +2025-05-10 23:23:30 - ERROR - stderr - 26%|██▋ | 991/3741 [5:57:36<15:28:47, 20.26s/it] +2025-05-10 23:23:51 - ERROR - stderr - 27%|██▋ | 992/3741 [5:57:58<15:40:47, 20.53s/it] +2025-05-10 23:23:51 - ERROR - stderr - +2025-05-10 23:23:51 - ERROR - stderr - +2025-05-10 23:23:51 - INFO - stdout - {'loss': 0.9147, 'grad_norm': 0.6051169037818909, 'learning_rate': 1.7240417999508424e-05, 'epoch': 0.8} +2025-05-10 23:23:51 - ERROR - stderr - 27%|██▋ | 992/3741 [5:57:58<15:40:47, 20.53s/it] +2025-05-10 23:24:11 - ERROR - stderr - 27%|██▋ | 993/3741 [5:58:17<15:24:53, 20.19s/it] +2025-05-10 23:24:11 - ERROR - stderr - +2025-05-10 23:24:11 - ERROR - stderr - +2025-05-10 23:24:11 - INFO - stdout - {'loss': 0.9861, 'grad_norm': 0.6212258338928223, 'learning_rate': 1.7234442483372894e-05, 'epoch': 0.8} +2025-05-10 23:24:11 - ERROR - stderr - 27%|██▋ | 993/3741 [5:58:17<15:24:53, 20.19s/it] +2025-05-10 23:24:34 - ERROR - stderr - 27%|██▋ | 994/3741 [5:58:40<16:08:45, 21.16s/it] +2025-05-10 23:24:34 - ERROR - stderr - +2025-05-10 23:24:34 - ERROR - stderr - +2025-05-10 23:24:34 - INFO - stdout - {'loss': 0.9064, 'grad_norm': 0.6092191934585571, 'learning_rate': 1.722846154260602e-05, 'epoch': 0.8} +2025-05-10 23:24:34 - ERROR - stderr - 27%|██▋ | 994/3741 [5:58:40<16:08:45, 21.16s/it] +2025-05-10 23:24:54 - ERROR - stderr - 27%|██▋ | 995/3741 [5:59:00<15:45:18, 20.65s/it] +2025-05-10 23:24:54 - ERROR - stderr - +2025-05-10 23:24:54 - ERROR - stderr - +2025-05-10 23:24:54 - INFO - stdout - {'loss': 0.9129, 'grad_norm': 0.6202679872512817, 'learning_rate': 1.72224751816925e-05, 'epoch': 0.8} +2025-05-10 23:24:54 - ERROR - stderr - 27%|██▋ | 995/3741 [5:59:00<15:45:18, 20.65s/it] +2025-05-10 23:25:17 - ERROR - stderr - 27%|██▋ | 996/3741 [5:59:24<16:28:49, 21.61s/it] +2025-05-10 23:25:17 - ERROR - stderr - +2025-05-10 23:25:17 - ERROR - stderr - +2025-05-10 23:25:17 - INFO - stdout - {'loss': 0.9291, 'grad_norm': 0.6190642714500427, 'learning_rate': 1.721648340512112e-05, 'epoch': 0.8} +2025-05-10 23:25:17 - ERROR - stderr - 27%|██▋ | 996/3741 [5:59:24<16:28:49, 21.61s/it] +2025-05-10 23:25:37 - ERROR - stderr - 27%|██▋ | 997/3741 [5:59:43<16:01:40, 21.03s/it] +2025-05-10 23:25:37 - ERROR - stderr - +2025-05-10 23:25:37 - ERROR - stderr - +2025-05-10 23:25:37 - INFO - stdout - {'loss': 0.8931, 'grad_norm': 0.5609696507453918, 'learning_rate': 1.721048621738472e-05, 'epoch': 0.8} +2025-05-10 23:25:37 - ERROR - stderr - 27%|██▋ | 997/3741 [5:59:43<16:01:40, 21.03s/it] +2025-05-10 23:25:56 - ERROR - stderr - 27%|██▋ | 998/3741 [6:00:03<15:39:47, 20.56s/it] +2025-05-10 23:25:57 - ERROR - stderr - +2025-05-10 23:25:57 - ERROR - stderr - +2025-05-10 23:25:57 - INFO - stdout - {'loss': 0.9463, 'grad_norm': 0.6554841995239258, 'learning_rate': 1.720448362298019e-05, 'epoch': 0.8} +2025-05-10 23:25:57 - ERROR - stderr - 27%|██▋ | 998/3741 [6:00:03<15:39:47, 20.56s/it] +2025-05-10 23:26:17 - ERROR - stderr - 27%|██▋ | 999/3741 [6:00:23<15:35:02, 20.46s/it] +2025-05-10 23:26:17 - ERROR - stderr - +2025-05-10 23:26:17 - ERROR - stderr - +2025-05-10 23:26:17 - INFO - stdout - {'loss': 0.9057, 'grad_norm': 0.661469042301178, 'learning_rate': 1.719847562640848e-05, 'epoch': 0.8} +2025-05-10 23:26:17 - ERROR - stderr - 27%|██▋ | 999/3741 [6:00:23<15:35:02, 20.46s/it] +2025-05-10 23:26:36 - ERROR - stderr - 27%|██▋ | 1000/3741 [6:00:42<15:19:16, 20.12s/it] +2025-05-10 23:26:36 - ERROR - stderr - +2025-05-10 23:26:36 - ERROR - stderr - +2025-05-10 23:26:36 - INFO - stdout - {'loss': 0.9095, 'grad_norm': 0.581844687461853, 'learning_rate': 1.7192462232174595e-05, 'epoch': 0.8} +2025-05-10 23:26:36 - ERROR - stderr - 27%|██▋ | 1000/3741 [6:00:42<15:19:16, 20.12s/it] +2025-05-10 23:27:00 - ERROR - stderr - 27%|██▋ | 1001/3741 [6:01:07<16:14:43, 21.34s/it] +2025-05-10 23:27:00 - ERROR - stderr - +2025-05-10 23:27:00 - ERROR - stderr - +2025-05-10 23:27:00 - INFO - stdout - {'loss': 0.8885, 'grad_norm': 0.6142575144767761, 'learning_rate': 1.7186443444787578e-05, 'epoch': 0.8} +2025-05-10 23:27:00 - ERROR - stderr - 27%|██▋ | 1001/3741 [6:01:07<16:14:43, 21.34s/it] +2025-05-10 23:27:20 - ERROR - stderr - 27%|██▋ | 1002/3741 [6:01:26<15:49:57, 20.81s/it] +2025-05-10 23:27:20 - ERROR - stderr - +2025-05-10 23:27:20 - ERROR - stderr - +2025-05-10 23:27:20 - INFO - stdout - {'loss': 0.8893, 'grad_norm': 0.5919050574302673, 'learning_rate': 1.718041926876053e-05, 'epoch': 0.8} +2025-05-10 23:27:20 - ERROR - stderr - 27%|██▋ | 1002/3741 [6:01:26<15:49:57, 20.81s/it] +2025-05-10 23:27:43 - ERROR - stderr - 27%|██▋ | 1003/3741 [6:01:50<16:27:08, 21.63s/it] +2025-05-10 23:27:43 - ERROR - stderr - +2025-05-10 23:27:43 - ERROR - stderr - +2025-05-10 23:27:43 - INFO - stdout - {'loss': 0.923, 'grad_norm': 0.6128547787666321, 'learning_rate': 1.7174389708610565e-05, 'epoch': 0.8} +2025-05-10 23:27:43 - ERROR - stderr - 27%|██▋ | 1003/3741 [6:01:50<16:27:08, 21.63s/it] +2025-05-10 23:28:03 - ERROR - stderr - 27%|██▋ | 1004/3741 [6:02:10<16:05:04, 21.16s/it] +2025-05-10 23:28:03 - ERROR - stderr - +2025-05-10 23:28:03 - ERROR - stderr - +2025-05-10 23:28:03 - INFO - stdout - {'loss': 0.9256, 'grad_norm': 0.5759849548339844, 'learning_rate': 1.716835476885887e-05, 'epoch': 0.81} +2025-05-10 23:28:03 - ERROR - stderr - 27%|██▋ | 1004/3741 [6:02:10<16:05:04, 21.16s/it] +2025-05-10 23:28:23 - ERROR - stderr - 27%|██▋ | 1005/3741 [6:02:30<15:47:45, 20.78s/it] +2025-05-10 23:28:23 - ERROR - stderr - +2025-05-10 23:28:23 - ERROR - stderr - +2025-05-10 23:28:23 - INFO - stdout - {'loss': 0.9334, 'grad_norm': 0.5811640620231628, 'learning_rate': 1.7162314454030644e-05, 'epoch': 0.81} +2025-05-10 23:28:23 - ERROR - stderr - 27%|██▋ | 1005/3741 [6:02:30<15:47:45, 20.78s/it] +2025-05-10 23:28:43 - ERROR - stderr - 27%|██▋ | 1006/3741 [6:02:49<15:29:15, 20.39s/it] +2025-05-10 23:28:43 - ERROR - stderr - +2025-05-10 23:28:43 - ERROR - stderr - +2025-05-10 23:28:43 - INFO - stdout - {'loss': 0.8993, 'grad_norm': 0.6075664758682251, 'learning_rate': 1.7156268768655118e-05, 'epoch': 0.81} +2025-05-10 23:28:43 - ERROR - stderr - 27%|██▋ | 1006/3741 [6:02:49<15:29:15, 20.39s/it] +2025-05-10 23:29:02 - ERROR - stderr - 27%|██▋ | 1007/3741 [6:03:09<15:16:44, 20.12s/it] +2025-05-10 23:29:02 - ERROR - stderr - +2025-05-10 23:29:02 - ERROR - stderr - +2025-05-10 23:29:02 - INFO - stdout - {'loss': 0.9181, 'grad_norm': 0.6393078565597534, 'learning_rate': 1.715021771726555e-05, 'epoch': 0.81} +2025-05-10 23:29:02 - ERROR - stderr - 27%|█��▋ | 1007/3741 [6:03:09<15:16:44, 20.12s/it] +2025-05-10 23:29:25 - ERROR - stderr - 27%|██▋ | 1008/3741 [6:03:32<15:55:15, 20.97s/it] +2025-05-10 23:29:25 - ERROR - stderr - +2025-05-10 23:29:25 - ERROR - stderr - +2025-05-10 23:29:25 - INFO - stdout - {'loss': 0.9329, 'grad_norm': 0.6739677786827087, 'learning_rate': 1.714416130439923e-05, 'epoch': 0.81} +2025-05-10 23:29:25 - ERROR - stderr - 27%|██▋ | 1008/3741 [6:03:32<15:55:15, 20.97s/it] +2025-05-10 23:29:45 - ERROR - stderr - 27%|██▋ | 1009/3741 [6:03:51<15:37:14, 20.58s/it] +2025-05-10 23:29:45 - ERROR - stderr - +2025-05-10 23:29:45 - ERROR - stderr - +2025-05-10 23:29:45 - INFO - stdout - {'loss': 0.9393, 'grad_norm': 0.5906496047973633, 'learning_rate': 1.7138099534597464e-05, 'epoch': 0.81} +2025-05-10 23:29:45 - ERROR - stderr - 27%|██▋ | 1009/3741 [6:03:51<15:37:14, 20.58s/it] +2025-05-10 23:30:09 - ERROR - stderr - 27%|██▋ | 1010/3741 [6:04:15<16:20:22, 21.54s/it] +2025-05-10 23:30:09 - ERROR - stderr - +2025-05-10 23:30:09 - ERROR - stderr - +2025-05-10 23:30:09 - INFO - stdout - {'loss': 0.9145, 'grad_norm': 0.6302242875099182, 'learning_rate': 1.7132032412405565e-05, 'epoch': 0.81} +2025-05-10 23:30:09 - ERROR - stderr - 27%|██▋ | 1010/3741 [6:04:15<16:20:22, 21.54s/it] +2025-05-10 23:30:28 - ERROR - stderr - 27%|██▋ | 1011/3741 [6:04:34<15:51:56, 20.92s/it] +2025-05-10 23:30:28 - ERROR - stderr - +2025-05-10 23:30:28 - ERROR - stderr - +2025-05-10 23:30:28 - INFO - stdout - {'loss': 0.8723, 'grad_norm': 0.6030935645103455, 'learning_rate': 1.7125959942372875e-05, 'epoch': 0.81} +2025-05-10 23:30:28 - ERROR - stderr - 27%|██▋ | 1011/3741 [6:04:35<15:51:56, 20.92s/it] +2025-05-10 23:30:51 - ERROR - stderr - 27%|██▋ | 1012/3741 [6:04:58<16:23:55, 21.63s/it] +2025-05-10 23:30:51 - ERROR - stderr - +2025-05-10 23:30:51 - ERROR - stderr - +2025-05-10 23:30:51 - INFO - stdout - {'loss': 0.8957, 'grad_norm': 0.6145809292793274, 'learning_rate': 1.711988212905274e-05, 'epoch': 0.81} +2025-05-10 23:30:51 - ERROR - stderr - 27%|██▋ | 1012/3741 [6:04:58<16:23:55, 21.63s/it] +2025-05-10 23:31:11 - ERROR - stderr - 27%|██▋ | 1013/3741 [6:05:17<15:53:54, 20.98s/it] +2025-05-10 23:31:11 - ERROR - stderr - +2025-05-10 23:31:11 - ERROR - stderr - +2025-05-10 23:31:11 - INFO - stdout - {'loss': 0.9221, 'grad_norm': 0.5869849324226379, 'learning_rate': 1.7113798977002506e-05, 'epoch': 0.81} +2025-05-10 23:31:11 - ERROR - stderr - 27%|██▋ | 1013/3741 [6:05:17<15:53:54, 20.98s/it] +2025-05-10 23:31:31 - ERROR - stderr - 27%|██▋ | 1014/3741 [6:05:37<15:39:14, 20.67s/it] +2025-05-10 23:31:31 - ERROR - stderr - +2025-05-10 23:31:31 - ERROR - stderr - +2025-05-10 23:31:31 - INFO - stdout - {'loss': 0.9337, 'grad_norm': 0.7540897130966187, 'learning_rate': 1.710771049078353e-05, 'epoch': 0.81} +2025-05-10 23:31:31 - ERROR - stderr - 27%|██▋ | 1014/3741 [6:05:37<15:39:14, 20.67s/it] +2025-05-10 23:31:51 - ERROR - stderr - 27%|██▋ | 1015/3741 [6:05:58<15:34:43, 20.57s/it] +2025-05-10 23:31:51 - ERROR - stderr - +2025-05-10 23:31:51 - ERROR - stderr - +2025-05-10 23:31:51 - INFO - stdout - {'loss': 0.8933, 'grad_norm': 0.6184853911399841, 'learning_rate': 1.7101616674961165e-05, 'epoch': 0.81} +2025-05-10 23:31:51 - ERROR - stderr - 27%|██▋ | 1015/3741 [6:05:58<15:34:43, 20.57s/it] +2025-05-10 23:32:10 - ERROR - stderr - 27%|██▋ | 1016/3741 [6:06:17<15:15:57, 20.17s/it] +2025-05-10 23:32:10 - ERROR - stderr - +2025-05-10 23:32:10 - ERROR - stderr - +2025-05-10 23:32:10 - INFO - stdout - {'loss': 0.8933, 'grad_norm': 0.592350959777832, 'learning_rate': 1.7095517534104762e-05, 'epoch': 0.81} +2025-05-10 23:32:10 - ERROR - stderr - 27%|██▋ | 1016/3741 [6:06:17<15:15:57, 20.17s/it] +2025-05-10 23:32:34 - ERROR - stderr - 27%|██▋ | 1017/3741 [6:06:41<16:05:40, 21.27s/it] +2025-05-10 23:32:34 - ERROR - stderr - +2025-05-10 23:32:34 - ERROR - stderr - +2025-05-10 23:32:34 - INFO - stdout - {'loss': 0.9336, 'grad_norm': 0.5875340104103088, 'learning_rate': 1.7089413072787667e-05, 'epoch': 0.82} +2025-05-10 23:32:34 - ERROR - stderr - 27%|██▋ | 1017/3741 [6:06:41<16:05:40, 21.27s/it] +2025-05-10 23:32:54 - ERROR - stderr - 27%|██▋ | 1018/3741 [6:07:00<15:40:20, 20.72s/it] +2025-05-10 23:32:54 - ERROR - stderr - +2025-05-10 23:32:54 - ERROR - stderr - +2025-05-10 23:32:54 - INFO - stdout - {'loss': 0.8972, 'grad_norm': 0.6324250102043152, 'learning_rate': 1.7083303295587212e-05, 'epoch': 0.82} +2025-05-10 23:32:54 - ERROR - stderr - 27%|██▋ | 1018/3741 [6:07:00<15:40:20, 20.72s/it] +2025-05-10 23:33:17 - ERROR - stderr - 27%|██▋ | 1019/3741 [6:07:23<16:14:01, 21.47s/it] +2025-05-10 23:33:17 - ERROR - stderr - +2025-05-10 23:33:17 - ERROR - stderr - +2025-05-10 23:33:17 - INFO - stdout - {'loss': 0.9375, 'grad_norm': 0.6096128225326538, 'learning_rate': 1.7077188207084712e-05, 'epoch': 0.82} +2025-05-10 23:33:17 - ERROR - stderr - 27%|██▋ | 1019/3741 [6:07:23<16:14:01, 21.47s/it] +2025-05-10 23:33:37 - ERROR - stderr - 27%|██▋ | 1020/3741 [6:07:43<15:48:12, 20.91s/it] +2025-05-10 23:33:37 - ERROR - stderr - +2025-05-10 23:33:37 - ERROR - stderr - +2025-05-10 23:33:37 - INFO - stdout - {'loss': 0.9045, 'grad_norm': 0.6442949771881104, 'learning_rate': 1.7071067811865477e-05, 'epoch': 0.82} +2025-05-10 23:33:37 - ERROR - stderr - 27%|██▋ | 1020/3741 [6:07:43<15:48:12, 20.91s/it] +2025-05-10 23:33:56 - ERROR - stderr - 27%|██▋ | 1021/3741 [6:08:02<15:28:50, 20.49s/it] +2025-05-10 23:33:56 - ERROR - stderr - +2025-05-10 23:33:56 - ERROR - stderr - +2025-05-10 23:33:56 - INFO - stdout - {'loss': 0.9377, 'grad_norm': 0.6120867133140564, 'learning_rate': 1.706494211451878e-05, 'epoch': 0.82} +2025-05-10 23:33:56 - ERROR - stderr - 27%|██▋ | 1021/3741 [6:08:02<15:28:50, 20.49s/it] +2025-05-10 23:34:17 - ERROR - stderr - 27%|██▋ | 1022/3741 [6:08:23<15:34:32, 20.62s/it] +2025-05-10 23:34:17 - ERROR - stderr - +2025-05-10 23:34:17 - ERROR - stderr - +2025-05-10 23:34:17 - INFO - stdout - {'loss': 0.9255, 'grad_norm': 0.630803644657135, 'learning_rate': 1.7058811119637878e-05, 'epoch': 0.82} +2025-05-10 23:34:17 - ERROR - stderr - 27%|██▋ | 1022/3741 [6:08:23<15:34:32, 20.62s/it] +2025-05-10 23:34:37 - ERROR - stderr - 27%|██▋ | 1023/3741 [6:08:43<15:20:32, 20.32s/it] +2025-05-10 23:34:37 - ERROR - stderr - +2025-05-10 23:34:37 - ERROR - stderr - +2025-05-10 23:34:37 - INFO - stdout - {'loss': 0.9195, 'grad_norm': 0.5878363251686096, 'learning_rate': 1.7052674831820008e-05, 'epoch': 0.82} +2025-05-10 23:34:37 - ERROR - stderr - 27%|██▋ | 1023/3741 [6:08:43<15:20:32, 20.32s/it] +2025-05-10 23:35:00 - ERROR - stderr - 27%|██▋ | 1024/3741 [6:09:06<16:03:46, 21.28s/it] +2025-05-10 23:35:00 - ERROR - stderr - +2025-05-10 23:35:00 - ERROR - stderr - +2025-05-10 23:35:00 - INFO - stdout - {'loss': 0.9699, 'grad_norm': 0.6276422739028931, 'learning_rate': 1.704653325566636e-05, 'epoch': 0.82} +2025-05-10 23:35:00 - ERROR - stderr - 27%|██▋ | 1024/3741 [6:09:06<16:03:46, 21.28s/it] +2025-05-10 23:35:20 - ERROR - stderr - 27%|██▋ | 1025/3741 [6:09:26<15:40:12, 20.77s/it] +2025-05-10 23:35:20 - ERROR - stderr - +2025-05-10 23:35:20 - ERROR - stderr - +2025-05-10 23:35:20 - INFO - stdout - {'loss': 0.8794, 'grad_norm': 0.5793137550354004, 'learning_rate': 1.7040386395782093e-05, 'epoch': 0.82} +2025-05-10 23:35:20 - ERROR - stderr - 27%|██▋ | 1025/3741 [6:09:26<15:40:12, 20.77s/it] +2025-05-10 23:35:42 - ERROR - stderr - 27%|██▋ | 1026/3741 [6:09:49<16:04:34, 21.32s/it] +2025-05-10 23:35:42 - ERROR - stderr - +2025-05-10 23:35:42 - ERROR - stderr - +2025-05-10 23:35:42 - INFO - stdout - {'loss': 0.8908, 'grad_norm': 0.6176061630249023, 'learning_rate': 1.703423425677634e-05, 'epoch': 0.82} +2025-05-10 23:35:42 - ERROR - stderr - 27%|██▋ | 1026/3741 [6:09:49<16:04:34, 21.32s/it] +2025-05-10 23:36:02 - ERROR - stderr - 27%|██▋ | 1027/3741 [6:10:08<15:38:53, 20.76s/it] +2025-05-10 23:36:02 - ERROR - stderr - +2025-05-10 23:36:02 - ERROR - stderr - +2025-05-10 23:36:02 - INFO - stdout - {'loss': 0.9506, 'grad_norm': 0.616875946521759, 'learning_rate': 1.7028076843262185e-05, 'epoch': 0.82} +2025-05-10 23:36:02 - ERROR - stderr - 27%|██▋ | 1027/3741 [6:10:08<15:38:53, 20.76s/it] +2025-05-10 23:36:21 - ERROR - stderr - 27%|██▋ | 1028/3741 [6:10:27<15:20:31, 20.36s/it] +2025-05-10 23:36:21 - ERROR - stderr - +2025-05-10 23:36:21 - ERROR - stderr - +2025-05-10 23:36:21 - INFO - stdout - {'loss': 0.9218, 'grad_norm': 0.5971503257751465, 'learning_rate': 1.7021914159856664e-05, 'epoch': 0.82} +2025-05-10 23:36:21 - ERROR - stderr - 27%|██▋ | 1028/3741 [6:10:28<15:20:31, 20.36s/it] +2025-05-10 23:36:43 - ERROR - stderr - 28%|██▊ | 1029/3741 [6:10:49<15:41:05, 20.82s/it] +2025-05-10 23:36:43 - ERROR - stderr - +2025-05-10 23:36:43 - ERROR - stderr - +2025-05-10 23:36:43 - INFO - stdout - {'loss': 0.9296, 'grad_norm': 0.6316090226173401, 'learning_rate': 1.701574621118076e-05, 'epoch': 0.83} +2025-05-10 23:36:43 - ERROR - stderr - 28%|██▊ | 1029/3741 [6:10:49<15:41:05, 20.82s/it] +2025-05-10 23:37:02 - ERROR - stderr - 28%|██▊ | 1030/3741 [6:11:09<15:21:08, 20.39s/it] +2025-05-10 23:37:02 - ERROR - stderr - +2025-05-10 23:37:02 - ERROR - stderr - +2025-05-10 23:37:02 - INFO - stdout - {'loss': 0.895, 'grad_norm': 0.6042530536651611, 'learning_rate': 1.700957300185942e-05, 'epoch': 0.83} +2025-05-10 23:37:02 - ERROR - stderr - 28%|██▊ | 1030/3741 [6:11:09<15:21:08, 20.39s/it] +2025-05-10 23:37:26 - ERROR - stderr - 28%|██▊ | 1031/3741 [6:11:33<16:10:04, 21.48s/it] +2025-05-10 23:37:26 - ERROR - stderr - +2025-05-10 23:37:26 - ERROR - stderr - +2025-05-10 23:37:26 - INFO - stdout - {'loss': 0.9031, 'grad_norm': 0.6263911128044128, 'learning_rate': 1.7003394536521525e-05, 'epoch': 0.83} +2025-05-10 23:37:26 - ERROR - stderr - 28%|██▊ | 1031/3741 [6:11:33<16:10:04, 21.48s/it] +2025-05-10 23:37:46 - ERROR - stderr - 28%|██▊ | 1032/3741 [6:11:53<15:46:10, 20.96s/it] +2025-05-10 23:37:46 - ERROR - stderr - +2025-05-10 23:37:46 - ERROR - stderr - +2025-05-10 23:37:46 - INFO - stdout - {'loss': 0.8886, 'grad_norm': 0.5868535041809082, 'learning_rate': 1.6997210819799894e-05, 'epoch': 0.83} +2025-05-10 23:37:46 - ERROR - stderr - 28%|██▊ | 1032/3741 [6:11:53<15:46:10, 20.96s/it] +2025-05-10 23:38:10 - ERROR - stderr - 28%|██▊ | 1033/3741 [6:12:16<16:21:34, 21.75s/it] +2025-05-10 23:38:10 - ERROR - stderr - +2025-05-10 23:38:10 - ERROR - stderr - +2025-05-10 23:38:10 - INFO - stdout - {'loss': 0.9142, 'grad_norm': 0.681711733341217, 'learning_rate': 1.6991021856331297e-05, 'epoch': 0.83} +2025-05-10 23:38:10 - ERROR - stderr - 28%|██▊ | 1033/3741 [6:12:16<16:21:34, 21.75s/it] +2025-05-10 23:38:30 - ERROR - stderr - 28%|██▊ | 1034/3741 [6:12:36<15:56:17, 21.20s/it] +2025-05-10 23:38:30 - ERROR - stderr - +2025-05-10 23:38:30 - ERROR - stderr - +2025-05-10 23:38:30 - INFO - stdout - {'loss': 0.886, 'grad_norm': 0.6533603072166443, 'learning_rate': 1.698482765075642e-05, 'epoch': 0.83} +2025-05-10 23:38:30 - ERROR - stderr - 28%|██▊ | 1034/3741 [6:12:36<15:56:17, 21.20s/it] +2025-05-10 23:38:50 - ERROR - stderr - 28%|██▊ | 1035/3741 [6:12:56<15:39:55, 20.84s/it] +2025-05-10 23:38:50 - ERROR - stderr - +2025-05-10 23:38:50 - ERROR - stderr - +2025-05-10 23:38:50 - INFO - stdout - {'loss': 0.8767, 'grad_norm': 0.6320748925209045, 'learning_rate': 1.6978628207719892e-05, 'epoch': 0.83} +2025-05-10 23:38:50 - ERROR - stderr - 28%|██▊ | 1035/3741 [6:12:56<15:39:55, 20.84s/it] +2025-05-10 23:39:11 - ERROR - stderr - 28%|██▊ | 1036/3741 [6:13:17<15:39:37, 20.84s/it] +2025-05-10 23:39:11 - ERROR - stderr - +2025-05-10 23:39:11 - ERROR - stderr - +2025-05-10 23:39:11 - INFO - stdout - {'loss': 0.9081, 'grad_norm': 0.6173900365829468, 'learning_rate': 1.6972423531870273e-05, 'epoch': 0.83} +2025-05-10 23:39:11 - ERROR - stderr - 28%|██▊ | 1036/3741 [6:13:17<15:39:37, 20.84s/it] +2025-05-10 23:39:30 - ERROR - stderr - 28%|██▊ | 1037/3741 [6:13:37<15:26:12, 20.55s/it] +2025-05-10 23:39:30 - ERROR - stderr - +2025-05-10 23:39:30 - ERROR - stderr - +2025-05-10 23:39:30 - INFO - stdout - {'loss': 0.9403, 'grad_norm': 0.6053596138954163, 'learning_rate': 1.696621362786003e-05, 'epoch': 0.83} +2025-05-10 23:39:30 - ERROR - stderr - 28%|██▊ | 1037/3741 [6:13:37<15:26:12, 20.55s/it] +2025-05-10 23:39:53 - ERROR - stderr - 28%|██▊ | 1038/3741 [6:13:59<15:54:00, 21.18s/it] +2025-05-10 23:39:53 - ERROR - stderr - +2025-05-10 23:39:53 - ERROR - stderr - +2025-05-10 23:39:53 - INFO - stdout - {'loss': 0.9318, 'grad_norm': 0.5761358141899109, 'learning_rate': 1.6959998500345572e-05, 'epoch': 0.83} +2025-05-10 23:39:53 - ERROR - stderr - 28%|██▊ | 1038/3741 [6:13:59<15:54:00, 21.18s/it] +2025-05-10 23:40:12 - ERROR - stderr - 28%|██▊ | 1039/3741 [6:14:19<15:26:55, 20.58s/it] +2025-05-10 23:40:12 - ERROR - stderr - +2025-05-10 23:40:12 - ERROR - stderr - +2025-05-10 23:40:12 - INFO - stdout - {'loss': 0.9407, 'grad_norm': 0.6473420262336731, 'learning_rate': 1.6953778153987205e-05, 'epoch': 0.83} +2025-05-10 23:40:12 - ERROR - stderr - 28%|██▊ | 1039/3741 [6:14:19<15:26:55, 20.58s/it] +2025-05-10 23:40:36 - ERROR - stderr - 28%|██▊ | 1040/3741 [6:14:42<16:06:03, 21.46s/it] +2025-05-10 23:40:36 - ERROR - stderr - +2025-05-10 23:40:36 - ERROR - stderr - +2025-05-10 23:40:36 - INFO - stdout - {'loss': 0.8952, 'grad_norm': 0.5871930122375488, 'learning_rate': 1.6947552593449154e-05, 'epoch': 0.83} +2025-05-10 23:40:36 - ERROR - stderr - 28%|██▊ | 1040/3741 [6:14:42<16:06:03, 21.46s/it] +2025-05-10 23:40:55 - ERROR - stderr - 28%|██▊ | 1041/3741 [6:15:02<15:40:31, 20.90s/it] +2025-05-10 23:40:55 - ERROR - stderr - +2025-05-10 23:40:55 - ERROR - stderr - +2025-05-10 23:40:55 - INFO - stdout - {'loss': 0.8676, 'grad_norm': 0.5863208770751953, 'learning_rate': 1.6941321823399567e-05, 'epoch': 0.83} +2025-05-10 23:40:55 - ERROR - stderr - 28%|██▊ | 1041/3741 [6:15:02<15:40:31, 20.90s/it] +2025-05-10 23:41:16 - ERROR - stderr - 28%|██▊ | 1042/3741 [6:15:22<15:29:48, 20.67s/it] +2025-05-10 23:41:16 - ERROR - stderr - +2025-05-10 23:41:16 - ERROR - stderr - +2025-05-10 23:41:16 - INFO - stdout - {'loss': 0.884, 'grad_norm': 0.6157333254814148, 'learning_rate': 1.6935085848510476e-05, 'epoch': 0.84} +2025-05-10 23:41:16 - ERROR - stderr - 28%|██▊ | 1042/3741 [6:15:22<15:29:48, 20.67s/it] +2025-05-10 23:41:35 - ERROR - stderr - 28%|██▊ | 1043/3741 [6:15:41<15:13:51, 20.32s/it] +2025-05-10 23:41:35 - ERROR - stderr - +2025-05-10 23:41:35 - ERROR - stderr - +2025-05-10 23:41:35 - INFO - stdout - {'loss': 0.9337, 'grad_norm': 0.6380476951599121, 'learning_rate': 1.6928844673457838e-05, 'epoch': 0.84} +2025-05-10 23:41:35 - ERROR - stderr - 28%|██▊ | 1043/3741 [6:15:41<15:13:51, 20.32s/it] +2025-05-10 23:41:55 - ERROR - stderr - 28%|██▊ | 1044/3741 [6:16:01<15:10:50, 20.26s/it] +2025-05-10 23:41:55 - ERROR - stderr - +2025-05-10 23:41:55 - ERROR - stderr - +2025-05-10 23:41:55 - INFO - stdout - {'loss': 0.9369, 'grad_norm': 0.6176585555076599, 'learning_rate': 1.692259830292149e-05, 'epoch': 0.84} +2025-05-10 23:41:55 - ERROR - stderr - 28%|██▊ | 1044/3741 [6:16:01<15:10:50, 20.26s/it] +2025-05-10 23:42:19 - ERROR - stderr - 28%|██▊ | 1045/3741 [6:16:25<15:54:17, 21.24s/it] +2025-05-10 23:42:19 - ERROR - stderr - +2025-05-10 23:42:19 - ERROR - stderr - +2025-05-10 23:42:19 - INFO - stdout - {'loss': 0.9504, 'grad_norm': 0.6162835359573364, 'learning_rate': 1.691634674158518e-05, 'epoch': 0.84} +2025-05-10 23:42:19 - ERROR - stderr - 28%|██▊ | 1045/3741 [6:16:25<15:54:17, 21.24s/it] +2025-05-10 23:42:38 - ERROR - stderr - 28%|██▊ | 1046/3741 [6:16:44<15:30:34, 20.72s/it] +2025-05-10 23:42:38 - ERROR - stderr - +2025-05-10 23:42:38 - ERROR - stderr - +2025-05-10 23:42:38 - INFO - stdout - {'loss': 0.9074, 'grad_norm': 0.6078632473945618, 'learning_rate': 1.6910089994136535e-05, 'epoch': 0.84} +2025-05-10 23:42:38 - ERROR - stderr - 28%|██▊ | 1046/3741 [6:16:44<15:30:34, 20.72s/it] +2025-05-10 23:43:01 - ERROR - stderr - 28%|██▊ | 1047/3741 [6:17:08<16:02:31, 21.44s/it] +2025-05-10 23:43:01 - ERROR - stderr - +2025-05-10 23:43:01 - ERROR - stderr - +2025-05-10 23:43:01 - INFO - stdout - {'loss': 0.9469, 'grad_norm': 0.5939008593559265, 'learning_rate': 1.6903828065267083e-05, 'epoch': 0.84} +2025-05-10 23:43:01 - ERROR - stderr - 28%|██▊ | 1047/3741 [6:17:08<16:02:31, 21.44s/it] +2025-05-10 23:43:21 - ERROR - stderr - 28%|██▊ | 1048/3741 [6:17:27<15:35:10, 20.84s/it] +2025-05-10 23:43:21 - ERROR - stderr - +2025-05-10 23:43:21 - ERROR - stderr - +2025-05-10 23:43:21 - INFO - stdout - {'loss': 0.8914, 'grad_norm': 0.6359356641769409, 'learning_rate': 1.6897560959672232e-05, 'epoch': 0.84} +2025-05-10 23:43:21 - ERROR - stderr - 28%|██▊ | 1048/3741 [6:17:27<15:35:10, 20.84s/it] +2025-05-10 23:43:40 - ERROR - stderr - 28%|██▊ | 1049/3741 [6:17:47<15:17:49, 20.46s/it] +2025-05-10 23:43:40 - ERROR - stderr - +2025-05-10 23:43:40 - ERROR - stderr - +2025-05-10 23:43:40 - INFO - stdout - {'loss': 0.938, 'grad_norm': 0.6040184497833252, 'learning_rate': 1.6891288682051264e-05, 'epoch': 0.84} +2025-05-10 23:43:40 - ERROR - stderr - 28%|██▊ | 1049/3741 [6:17:47<15:17:49, 20.46s/it] +2025-05-10 23:44:01 - ERROR - stderr - 28%|██▊ | 1050/3741 [6:18:07<15:14:28, 20.39s/it] +2025-05-10 23:44:01 - ERROR - stderr - +2025-05-10 23:44:01 - ERROR - stderr - +2025-05-10 23:44:01 - INFO - stdout - {'loss': 0.8751, 'grad_norm': 0.6027700901031494, 'learning_rate': 1.6885011237107353e-05, 'epoch': 0.84} +2025-05-10 23:44:01 - ERROR - stderr - 28%|██▊ | 1050/3741 [6:18:07<15:14:28, 20.39s/it] +2025-05-10 23:44:20 - ERROR - stderr - 28%|██▊ | 1051/3741 [6:18:26<15:02:29, 20.13s/it] +2025-05-10 23:44:20 - ERROR - stderr - +2025-05-10 23:44:20 - ERROR - stderr - +2025-05-10 23:44:20 - INFO - stdout - {'loss': 0.9169, 'grad_norm': 0.5934613943099976, 'learning_rate': 1.6878728629547536e-05, 'epoch': 0.84} +2025-05-10 23:44:20 - ERROR - stderr - 28%|██▊ | 1051/3741 [6:18:26<15:02:29, 20.13s/it] +2025-05-10 23:44:43 - ERROR - stderr - 28%|██▊ | 1052/3741 [6:18:50<15:43:31, 21.05s/it] +2025-05-10 23:44:43 - ERROR - stderr - +2025-05-10 23:44:43 - ERROR - stderr - +2025-05-10 23:44:43 - INFO - stdout - {'loss': 0.9461, 'grad_norm': 0.6678500771522522, 'learning_rate': 1.6872440864082732e-05, 'epoch': 0.84} +2025-05-10 23:44:43 - ERROR - stderr - 28%|██▊ | 1052/3741 [6:18:50<15:43:31, 21.05s/it] +2025-05-10 23:45:03 - ERROR - stderr - 28%|██▊ | 1053/3741 [6:19:09<15:22:20, 20.59s/it] +2025-05-10 23:45:03 - ERROR - stderr - +2025-05-10 23:45:03 - ERROR - stderr - +2025-05-10 23:45:03 - INFO - stdout - {'loss': 0.9198, 'grad_norm': 0.6098446249961853, 'learning_rate': 1.686614794542772e-05, 'epoch': 0.84} +2025-05-10 23:45:03 - ERROR - stderr - 28%|██▊ | 1053/3741 [6:19:09<15:22:20, 20.59s/it] +2025-05-10 23:45:26 - ERROR - stderr - 28%|██▊ | 1054/3741 [6:19:33<16:01:09, 21.46s/it] +2025-05-10 23:45:26 - ERROR - stderr - +2025-05-10 23:45:26 - ERROR - stderr - +2025-05-10 23:45:26 - INFO - stdout - {'loss': 0.9057, 'grad_norm': 0.5894660949707031, 'learning_rate': 1.685984987830114e-05, 'epoch': 0.85} +2025-05-10 23:45:26 - ERROR - stderr - 28%|██▊ | 1054/3741 [6:19:33<16:01:09, 21.46s/it] +2025-05-10 23:45:46 - ERROR - stderr - 28%|██▊ | 1055/3741 [6:19:52<15:39:21, 20.98s/it] +2025-05-10 23:45:46 - ERROR - stderr - +2025-05-10 23:45:46 - ERROR - stderr - +2025-05-10 23:45:46 - INFO - stdout - {'loss': 0.9392, 'grad_norm': 0.6063706874847412, 'learning_rate': 1.68535466674255e-05, 'epoch': 0.85} +2025-05-10 23:45:46 - ERROR - stderr - 28%|██▊ | 1055/3741 [6:19:52<15:39:21, 20.98s/it] +2025-05-10 23:46:07 - ERROR - stderr - 28%|██▊ | 1056/3741 [6:20:13<15:31:14, 20.81s/it] +2025-05-10 23:46:07 - ERROR - stderr - +2025-05-10 23:46:07 - ERROR - stderr - +2025-05-10 23:46:07 - INFO - stdout - {'loss': 0.9146, 'grad_norm': 0.6084437966346741, 'learning_rate': 1.6847238317527167e-05, 'epoch': 0.85} +2025-05-10 23:46:07 - ERROR - stderr - 28%|██▊ | 1056/3741 [6:20:13<15:31:14, 20.81s/it] +2025-05-10 23:46:26 - ERROR - stderr - 28%|██▊ | 1057/3741 [6:20:32<15:12:26, 20.40s/it] +2025-05-10 23:46:26 - ERROR - stderr - +2025-05-10 23:46:26 - ERROR - stderr - +2025-05-10 23:46:26 - INFO - stdout - {'loss': 0.9152, 'grad_norm': 0.5813028812408447, 'learning_rate': 1.684092483333635e-05, 'epoch': 0.85} +2025-05-10 23:46:26 - ERROR - stderr - 28%|██▊ | 1057/3741 [6:20:32<15:12:26, 20.40s/it] +2025-05-10 23:46:45 - ERROR - stderr - 28%|██▊ | 1058/3741 [6:20:52<14:59:35, 20.12s/it] +2025-05-10 23:46:45 - ERROR - stderr - +2025-05-10 23:46:45 - ERROR - stderr - +2025-05-10 23:46:45 - INFO - stdout - {'loss': 0.8822, 'grad_norm': 0.6176558136940002, 'learning_rate': 1.6834606219587114e-05, 'epoch': 0.85} +2025-05-10 23:46:45 - ERROR - stderr - 28%|██▊ | 1058/3741 [6:20:52<14:59:35, 20.12s/it] +2025-05-10 23:47:09 - ERROR - stderr - 28%|██▊ | 1059/3741 [6:21:15<15:46:59, 21.19s/it] +2025-05-10 23:47:09 - ERROR - stderr - +2025-05-10 23:47:09 - ERROR - stderr - +2025-05-10 23:47:09 - INFO - stdout - {'loss': 0.9067, 'grad_norm': 0.5906162858009338, 'learning_rate': 1.682828248101738e-05, 'epoch': 0.85} +2025-05-10 23:47:09 - ERROR - stderr - 28%|██▊ | 1059/3741 [6:21:15<15:46:59, 21.19s/it] +2025-05-10 23:47:29 - ERROR - stderr - 28%|██▊ | 1060/3741 [6:21:36<15:33:00, 20.88s/it] +2025-05-10 23:47:29 - ERROR - stderr - +2025-05-10 23:47:29 - ERROR - stderr - +2025-05-10 23:47:29 - INFO - stdout - {'loss': 0.931, 'grad_norm': 0.5896495580673218, 'learning_rate': 1.682195362236889e-05, 'epoch': 0.85} +2025-05-10 23:47:29 - ERROR - stderr - 28%|██▊ | 1060/3741 [6:21:36<15:33:00, 20.88s/it] +2025-05-10 23:47:52 - ERROR - stderr - 28%|██▊ | 1061/3741 [6:21:58<15:58:25, 21.46s/it] +2025-05-10 23:47:52 - ERROR - stderr - +2025-05-10 23:47:52 - ERROR - stderr - +2025-05-10 23:47:52 - INFO - stdout - {'loss': 0.9665, 'grad_norm': 0.5951011776924133, 'learning_rate': 1.681561964838725e-05, 'epoch': 0.85} +2025-05-10 23:47:52 - ERROR - stderr - 28%|██▊ | 1061/3741 [6:21:58<15:58:25, 21.46s/it] +2025-05-10 23:48:12 - ERROR - stderr - 28%|██▊ | 1062/3741 [6:22:18<15:32:57, 20.89s/it] +2025-05-10 23:48:12 - ERROR - stderr - +2025-05-10 23:48:12 - ERROR - stderr - +2025-05-10 23:48:12 - INFO - stdout - {'loss': 0.8821, 'grad_norm': 0.6564264297485352, 'learning_rate': 1.6809280563821878e-05, 'epoch': 0.85} +2025-05-10 23:48:12 - ERROR - stderr - 28%|██▊ | 1062/3741 [6:22:18<15:32:57, 20.89s/it] +2025-05-10 23:48:31 - ERROR - stderr - 28%|██▊ | 1063/3741 [6:22:38<15:14:54, 20.50s/it] +2025-05-10 23:48:31 - ERROR - stderr - +2025-05-10 23:48:31 - ERROR - stderr - +2025-05-10 23:48:31 - INFO - stdout - {'loss': 0.8951, 'grad_norm': 0.5982756018638611, 'learning_rate': 1.6802936373426045e-05, 'epoch': 0.85} +2025-05-10 23:48:31 - ERROR - stderr - 28%|██▊ | 1063/3741 [6:22:38<15:14:54, 20.50s/it] +2025-05-10 23:48:52 - ERROR - stderr - 28%|██▊ | 1064/3741 [6:22:58<15:12:15, 20.45s/it] +2025-05-10 23:48:52 - ERROR - stderr - +2025-05-10 23:48:52 - ERROR - stderr - +2025-05-10 23:48:52 - INFO - stdout - {'loss': 0.9748, 'grad_norm': 0.6046779155731201, 'learning_rate': 1.6796587081956833e-05, 'epoch': 0.85} +2025-05-10 23:48:52 - ERROR - stderr - 28%|██▊ | 1064/3741 [6:22:58<15:12:15, 20.45s/it] +2025-05-10 23:49:11 - ERROR - stderr - 28%|██▊ | 1065/3741 [6:23:18<15:03:12, 20.25s/it] +2025-05-10 23:49:11 - ERROR - stderr - +2025-05-10 23:49:11 - ERROR - stderr - +2025-05-10 23:49:11 - INFO - stdout - {'loss': 0.8921, 'grad_norm': 0.5632441639900208, 'learning_rate': 1.6790232694175164e-05, 'epoch': 0.85} +2025-05-10 23:49:11 - ERROR - stderr - 28%|██▊ | 1065/3741 [6:23:18<15:03:12, 20.25s/it] +2025-05-10 23:49:35 - ERROR - stderr - 28%|██▊ | 1066/3741 [6:23:41<15:46:31, 21.23s/it] +2025-05-10 23:49:35 - ERROR - stderr - +2025-05-10 23:49:35 - ERROR - stderr - +2025-05-10 23:49:35 - INFO - stdout - {'loss': 0.9368, 'grad_norm': 0.5854066610336304, 'learning_rate': 1.678387321484577e-05, 'epoch': 0.85} +2025-05-10 23:49:35 - ERROR - stderr - 28%|██▊ | 1066/3741 [6:23:41<15:46:31, 21.23s/it] +2025-05-10 23:49:55 - ERROR - stderr - 29%|██▊ | 1067/3741 [6:24:01<15:29:33, 20.86s/it] +2025-05-10 23:49:55 - ERROR - stderr - +2025-05-10 23:49:55 - ERROR - stderr - +2025-05-10 23:49:55 - INFO - stdout - {'loss': 0.9264, 'grad_norm': 0.6365918517112732, 'learning_rate': 1.6777508648737203e-05, 'epoch': 0.86} +2025-05-10 23:49:55 - ERROR - stderr - 29%|██▊ | 1067/3741 [6:24:01<15:29:33, 20.86s/it] +2025-05-10 23:50:18 - ERROR - stderr - 29%|██▊ | 1068/3741 [6:24:24<15:57:19, 21.49s/it] +2025-05-10 23:50:18 - ERROR - stderr - +2025-05-10 23:50:18 - ERROR - stderr - +2025-05-10 23:50:18 - INFO - stdout - {'loss': 0.9111, 'grad_norm': 0.5902692675590515, 'learning_rate': 1.677113900062184e-05, 'epoch': 0.86} +2025-05-10 23:50:18 - ERROR - stderr - 29%|██▊ | 1068/3741 [6:24:24<15:57:19, 21.49s/it] +2025-05-10 23:50:38 - ERROR - stderr - 29%|██▊ | 1069/3741 [6:24:44<15:35:02, 21.00s/it] +2025-05-10 23:50:38 - ERROR - stderr - +2025-05-10 23:50:38 - ERROR - stderr - +2025-05-10 23:50:38 - INFO - stdout - {'loss': 0.9626, 'grad_norm': 0.6386597752571106, 'learning_rate': 1.6764764275275852e-05, 'epoch': 0.86} +2025-05-10 23:50:38 - ERROR - stderr - 29%|██▊ | 1069/3741 [6:24:44<15:35:02, 21.00s/it] +2025-05-10 23:50:59 - ERROR - stderr - 29%|██▊ | 1070/3741 [6:25:05<15:33:33, 20.97s/it] +2025-05-10 23:50:59 - ERROR - stderr - +2025-05-10 23:50:59 - ERROR - stderr - +2025-05-10 23:50:59 - INFO - stdout - {'loss': 0.9684, 'grad_norm': 0.6048006415367126, 'learning_rate': 1.675838447747923e-05, 'epoch': 0.86} +2025-05-10 23:50:59 - ERROR - stderr - 29%|██▊ | 1070/3741 [6:25:05<15:33:33, 20.97s/it] +2025-05-10 23:51:18 - ERROR - stderr - 29%|██▊ | 1071/3741 [6:25:24<15:13:11, 20.52s/it] +2025-05-10 23:51:18 - ERROR - stderr - +2025-05-10 23:51:18 - ERROR - stderr - +2025-05-10 23:51:18 - INFO - stdout - {'loss': 0.9235, 'grad_norm': 0.5801950693130493, 'learning_rate': 1.675199961201576e-05, 'epoch': 0.86} +2025-05-10 23:51:18 - ERROR - stderr - 29%|██▊ | 1071/3741 [6:25:24<15:13:11, 20.52s/it] +2025-05-10 23:51:38 - ERROR - stderr - 29%|██▊ | 1072/3741 [6:25:44<15:03:11, 20.30s/it] +2025-05-10 23:51:38 - ERROR - stderr - +2025-05-10 23:51:38 - ERROR - stderr - +2025-05-10 23:51:38 - INFO - stdout - {'loss': 0.9174, 'grad_norm': 0.599275529384613, 'learning_rate': 1.6745609683673034e-05, 'epoch': 0.86} +2025-05-10 23:51:38 - ERROR - stderr - 29%|██▊ | 1072/3741 [6:25:44<15:03:11, 20.30s/it] +2025-05-10 23:52:01 - ERROR - stderr - 29%|██▊ | 1073/3741 [6:26:08<15:44:51, 21.25s/it] +2025-05-10 23:52:01 - ERROR - stderr - +2025-05-10 23:52:01 - ERROR - stderr - +2025-05-10 23:52:01 - INFO - stdout - {'loss': 0.9221, 'grad_norm': 0.6032297015190125, 'learning_rate': 1.6739214697242437e-05, 'epoch': 0.86} +2025-05-10 23:52:01 - ERROR - stderr - 29%|██▊ | 1073/3741 [6:26:08<15:44:51, 21.25s/it] +2025-05-10 23:52:21 - ERROR - stderr - 29%|██▊ | 1074/3741 [6:26:27<15:21:09, 20.72s/it] +2025-05-10 23:52:21 - ERROR - stderr - +2025-05-10 23:52:21 - ERROR - stderr - +2025-05-10 23:52:21 - INFO - stdout - {'loss': 0.9157, 'grad_norm': 0.6280431151390076, 'learning_rate': 1.6732814657519146e-05, 'epoch': 0.86} +2025-05-10 23:52:21 - ERROR - stderr - 29%|██▊ | 1074/3741 [6:26:27<15:21:09, 20.72s/it] +2025-05-10 23:52:44 - ERROR - stderr - 29%|██▊ | 1075/3741 [6:26:51<15:58:01, 21.56s/it] +2025-05-10 23:52:44 - ERROR - stderr - +2025-05-10 23:52:44 - ERROR - stderr - +2025-05-10 23:52:44 - INFO - stdout - {'loss': 0.9094, 'grad_norm': 0.7165436744689941, 'learning_rate': 1.6726409569302134e-05, 'epoch': 0.86} +2025-05-10 23:52:44 - ERROR - stderr - 29%|██▊ | 1075/3741 [6:26:51<15:58:01, 21.56s/it] +2025-05-10 23:53:04 - ERROR - stderr - 29%|██▉ | 1076/3741 [6:27:10<15:30:36, 20.95s/it] +2025-05-10 23:53:04 - ERROR - stderr - +2025-05-10 23:53:04 - ERROR - stderr - +2025-05-10 23:53:04 - INFO - stdout - {'loss': 0.8972, 'grad_norm': 0.6161721348762512, 'learning_rate': 1.6719999437394146e-05, 'epoch': 0.86} +2025-05-10 23:53:04 - ERROR - stderr - 29%|██▉ | 1076/3741 [6:27:10<15:30:36, 20.95s/it] +2025-05-10 23:53:25 - ERROR - stderr - 29%|██▉ | 1077/3741 [6:27:31<15:30:57, 20.97s/it] +2025-05-10 23:53:25 - ERROR - stderr - +2025-05-10 23:53:25 - ERROR - stderr - +2025-05-10 23:53:25 - INFO - stdout - {'loss': 0.9607, 'grad_norm': 0.5874865055084229, 'learning_rate': 1.6713584266601728e-05, 'epoch': 0.86} +2025-05-10 23:53:25 - ERROR - stderr - 29%|██▉ | 1077/3741 [6:27:31<15:30:57, 20.97s/it] +2025-05-10 23:53:44 - ERROR - stderr - 29%|██▉ | 1078/3741 [6:27:51<15:11:11, 20.53s/it] +2025-05-10 23:53:44 - ERROR - stderr - +2025-05-10 23:53:44 - ERROR - stderr - +2025-05-10 23:53:44 - INFO - stdout - {'loss': 0.8646, 'grad_norm': 0.5720422863960266, 'learning_rate': 1.6707164061735183e-05, 'epoch': 0.86} +2025-05-10 23:53:44 - ERROR - stderr - 29%|██▉ | 1078/3741 [6:27:51<15:11:11, 20.53s/it] +2025-05-10 23:54:04 - ERROR - stderr - 29%|██▉ | 1079/3741 [6:28:10<14:55:27, 20.18s/it] +2025-05-10 23:54:04 - ERROR - stderr - +2025-05-10 23:54:04 - ERROR - stderr - +2025-05-10 23:54:04 - INFO - stdout - {'loss': 0.8971, 'grad_norm': 0.6087173223495483, 'learning_rate': 1.6700738827608606e-05, 'epoch': 0.87} +2025-05-10 23:54:04 - ERROR - stderr - 29%|██▉ | 1079/3741 [6:28:10<14:55:27, 20.18s/it] +2025-05-10 23:54:26 - ERROR - stderr - 29%|██▉ | 1080/3741 [6:28:32<15:17:13, 20.68s/it] +2025-05-10 23:54:26 - ERROR - stderr - +2025-05-10 23:54:26 - ERROR - stderr - +2025-05-10 23:54:26 - INFO - stdout - {'loss': 0.9118, 'grad_norm': 0.5989866256713867, 'learning_rate': 1.6694308569039853e-05, 'epoch': 0.87} +2025-05-10 23:54:26 - ERROR - stderr - 29%|██▉ | 1080/3741 [6:28:32<15:17:13, 20.68s/it] +2025-05-10 23:54:46 - ERROR - stderr - 29%|██▉ | 1081/3741 [6:28:52<15:10:42, 20.54s/it] +2025-05-10 23:54:46 - ERROR - stderr - +2025-05-10 23:54:46 - ERROR - stderr - +2025-05-10 23:54:46 - INFO - stdout - {'loss': 0.9619, 'grad_norm': 0.6564970016479492, 'learning_rate': 1.6687873290850554e-05, 'epoch': 0.87} +2025-05-10 23:54:46 - ERROR - stderr - 29%|██▉ | 1081/3741 [6:28:52<15:10:42, 20.54s/it] +2025-05-10 23:55:10 - ERROR - stderr - 29%|██▉ | 1082/3741 [6:29:16<15:53:41, 21.52s/it] +2025-05-10 23:55:10 - ERROR - stderr - +2025-05-10 23:55:10 - ERROR - stderr - +2025-05-10 23:55:10 - INFO - stdout - {'loss': 0.9252, 'grad_norm': 0.6022130250930786, 'learning_rate': 1.6681432997866097e-05, 'epoch': 0.87} +2025-05-10 23:55:10 - ERROR - stderr - 29%|██▉ | 1082/3741 [6:29:16<15:53:41, 21.52s/it] +2025-05-10 23:55:29 - ERROR - stderr - 29%|██▉ | 1083/3741 [6:29:35<15:22:50, 20.83s/it] +2025-05-10 23:55:29 - ERROR - stderr - +2025-05-10 23:55:29 - ERROR - stderr - +2025-05-10 23:55:29 - INFO - stdout - {'loss': 0.9207, 'grad_norm': 0.5839381217956543, 'learning_rate': 1.667498769491563e-05, 'epoch': 0.87} +2025-05-10 23:55:29 - ERROR - stderr - 29%|██▉ | 1083/3741 [6:29:35<15:22:50, 20.83s/it] +2025-05-10 23:55:53 - ERROR - stderr - 29%|██▉ | 1084/3741 [6:30:00<16:10:33, 21.92s/it] +2025-05-10 23:55:53 - ERROR - stderr - +2025-05-10 23:55:53 - ERROR - stderr - +2025-05-10 23:55:53 - INFO - stdout - {'loss': 0.9339, 'grad_norm': 0.6378865242004395, 'learning_rate': 1.666853738683207e-05, 'epoch': 0.87} +2025-05-10 23:55:53 - ERROR - stderr - 29%|██▉ | 1084/3741 [6:30:00<16:10:33, 21.92s/it] +2025-05-10 23:56:13 - ERROR - stderr - 29%|██▉ | 1085/3741 [6:30:19<15:39:35, 21.23s/it] +2025-05-10 23:56:13 - ERROR - stderr - +2025-05-10 23:56:13 - ERROR - stderr - +2025-05-10 23:56:13 - INFO - stdout - {'loss': 0.9323, 'grad_norm': 0.667452335357666, 'learning_rate': 1.6662082078452068e-05, 'epoch': 0.87} +2025-05-10 23:56:13 - ERROR - stderr - 29%|██▉ | 1085/3741 [6:30:19<15:39:35, 21.23s/it] +2025-05-10 23:56:32 - ERROR - stderr - 29%|██▉ | 1086/3741 [6:30:39<15:17:14, 20.73s/it] +2025-05-10 23:56:32 - ERROR - stderr - +2025-05-10 23:56:32 - ERROR - stderr - +2025-05-10 23:56:32 - INFO - stdout - {'loss': 0.8857, 'grad_norm': 0.5752742886543274, 'learning_rate': 1.665562177461604e-05, 'epoch': 0.87} +2025-05-10 23:56:32 - ERROR - stderr - 29%|██▉ | 1086/3741 [6:30:39<15:17:14, 20.73s/it] +2025-05-10 23:56:52 - ERROR - stderr - 29%|██▉ | 1087/3741 [6:30:58<15:00:39, 20.36s/it] +2025-05-10 23:56:52 - ERROR - stderr - +2025-05-10 23:56:52 - ERROR - stderr - +2025-05-10 23:56:52 - INFO - stdout - {'loss': 0.9146, 'grad_norm': 0.6381446719169617, 'learning_rate': 1.6649156480168137e-05, 'epoch': 0.87} +2025-05-10 23:56:52 - ERROR - stderr - 29%|██▉ | 1087/3741 [6:30:58<15:00:39, 20.36s/it] +2025-05-10 23:57:11 - ERROR - stderr - 29%|██▉ | 1088/3741 [6:31:18<14:47:26, 20.07s/it] +2025-05-10 23:57:11 - ERROR - stderr - +2025-05-10 23:57:11 - ERROR - stderr - +2025-05-10 23:57:11 - INFO - stdout - {'loss': 0.9048, 'grad_norm': 0.6204044818878174, 'learning_rate': 1.6642686199956263e-05, 'epoch': 0.87} +2025-05-10 23:57:11 - ERROR - stderr - 29%|██▉ | 1088/3741 [6:31:18<14:47:26, 20.07s/it] +2025-05-10 23:57:35 - ERROR - stderr - 29%|██▉ | 1089/3741 [6:31:41<15:28:43, 21.01s/it] +2025-05-10 23:57:35 - ERROR - stderr - +2025-05-10 23:57:35 - ERROR - stderr - +2025-05-10 23:57:35 - INFO - stdout - {'loss': 0.9792, 'grad_norm': 0.6212306618690491, 'learning_rate': 1.6636210938832053e-05, 'epoch': 0.87} +2025-05-10 23:57:35 - ERROR - stderr - 29%|██▉ | 1089/3741 [6:31:41<15:28:43, 21.01s/it] +2025-05-10 23:57:54 - ERROR - stderr - 29%|██▉ | 1090/3741 [6:32:01<15:12:28, 20.65s/it] +2025-05-10 23:57:54 - ERROR - stderr - +2025-05-10 23:57:54 - ERROR - stderr - +2025-05-10 23:57:54 - INFO - stdout - {'loss': 0.9181, 'grad_norm': 0.5908262133598328, 'learning_rate': 1.662973070165088e-05, 'epoch': 0.87} +2025-05-10 23:57:54 - ERROR - stderr - 29%|██▉ | 1090/3741 [6:32:01<15:12:28, 20.65s/it] +2025-05-10 23:58:18 - ERROR - stderr - 29%|██▉ | 1091/3741 [6:32:24<15:46:25, 21.43s/it] +2025-05-10 23:58:18 - ERROR - stderr - +2025-05-10 23:58:18 - ERROR - stderr - +2025-05-10 23:58:18 - INFO - stdout - {'loss': 0.953, 'grad_norm': 0.6047478318214417, 'learning_rate': 1.6623245493271832e-05, 'epoch': 0.87} +2025-05-10 23:58:18 - ERROR - stderr - 29%|██▉ | 1091/3741 [6:32:24<15:46:25, 21.43s/it] +2025-05-10 23:58:37 - ERROR - stderr - 29%|██▉ | 1092/3741 [6:32:43<15:20:17, 20.84s/it] +2025-05-10 23:58:37 - ERROR - stderr - +2025-05-10 23:58:37 - ERROR - stderr - +2025-05-10 23:58:37 - INFO - stdout - {'loss': 0.9327, 'grad_norm': 0.5832977294921875, 'learning_rate': 1.6616755318557758e-05, 'epoch': 0.88} +2025-05-10 23:58:37 - ERROR - stderr - 29%|██▉ | 1092/3741 [6:32:43<15:20:17, 20.84s/it] +2025-05-10 23:58:57 - ERROR - stderr - 29%|██▉ | 1093/3741 [6:33:03<15:03:46, 20.48s/it] +2025-05-10 23:58:57 - ERROR - stderr - +2025-05-10 23:58:57 - ERROR - stderr - +2025-05-10 23:58:57 - INFO - stdout - {'loss': 0.9074, 'grad_norm': 0.6228951811790466, 'learning_rate': 1.6610260182375202e-05, 'epoch': 0.88} +2025-05-10 23:58:57 - ERROR - stderr - 29%|██▉ | 1093/3741 [6:33:03<15:03:46, 20.48s/it] +2025-05-10 23:59:16 - ERROR - stderr - 29%|██▉ | 1094/3741 [6:33:22<14:48:02, 20.13s/it] +2025-05-10 23:59:16 - ERROR - stderr - +2025-05-10 23:59:16 - ERROR - stderr - +2025-05-10 23:59:16 - INFO - stdout - {'loss': 0.9113, 'grad_norm': 0.5714309811592102, 'learning_rate': 1.660376008959444e-05, 'epoch': 0.88} +2025-05-10 23:59:16 - ERROR - stderr - 29%|██▉ | 1094/3741 [6:33:22<14:48:02, 20.13s/it] +2025-05-10 23:59:36 - ERROR - stderr - 29%|██▉ | 1095/3741 [6:33:42<14:43:57, 20.04s/it] +2025-05-10 23:59:36 - ERROR - stderr - +2025-05-10 23:59:36 - ERROR - stderr - +2025-05-10 23:59:36 - INFO - stdout - {'loss': 0.8875, 'grad_norm': 0.57155442237854, 'learning_rate': 1.6597255045089466e-05, 'epoch': 0.88} +2025-05-10 23:59:36 - ERROR - stderr - 29%|██▉ | 1095/3741 [6:33:42<14:43:57, 20.04s/it] +2025-05-10 23:59:59 - ERROR - stderr - 29%|██▉ | 1096/3741 [6:34:06<15:26:59, 21.03s/it] +2025-05-10 23:59:59 - ERROR - stderr - +2025-05-10 23:59:59 - ERROR - stderr - +2025-05-10 23:59:59 - INFO - stdout - {'loss': 0.9663, 'grad_norm': 0.6274538040161133, 'learning_rate': 1.6590745053737986e-05, 'epoch': 0.88} +2025-05-10 23:59:59 - ERROR - stderr - 29%|██▉ | 1096/3741 [6:34:06<15:26:59, 21.03s/it] +2025-05-11 00:00:18 - ERROR - stderr - 29%|██▉ | 1097/3741 [6:34:25<15:03:22, 20.50s/it] +2025-05-11 00:00:18 - ERROR - stderr - +2025-05-11 00:00:18 - ERROR - stderr - +2025-05-11 00:00:18 - INFO - stdout - {'loss': 0.9048, 'grad_norm': 0.5771580934524536, 'learning_rate': 1.65842301204214e-05, 'epoch': 0.88} +2025-05-11 00:00:18 - ERROR - stderr - 29%|██▉ | 1097/3741 [6:34:25<15:03:22, 20.50s/it] +2025-05-11 00:00:41 - ERROR - stderr - 29%|██▉ | 1098/3741 [6:34:48<15:35:02, 21.23s/it] +2025-05-11 00:00:41 - ERROR - stderr - +2025-05-11 00:00:41 - ERROR - stderr - +2025-05-11 00:00:41 - INFO - stdout - {'loss': 0.9001, 'grad_norm': 0.5920909643173218, 'learning_rate': 1.657771025002484e-05, 'epoch': 0.88} +2025-05-11 00:00:41 - ERROR - stderr - 29%|██▉ | 1098/3741 [6:34:48<15:35:02, 21.23s/it] +2025-05-11 00:01:01 - ERROR - stderr - 29%|██▉ | 1099/3741 [6:35:07<15:11:07, 20.69s/it] +2025-05-11 00:01:01 - ERROR - stderr - +2025-05-11 00:01:01 - ERROR - stderr - +2025-05-11 00:01:01 - INFO - stdout - {'loss': 0.921, 'grad_norm': 0.5597774386405945, 'learning_rate': 1.657118544743712e-05, 'epoch': 0.88} +2025-05-11 00:01:01 - ERROR - stderr - 29%|██▉ | 1099/3741 [6:35:07<15:11:07, 20.69s/it] +2025-05-11 00:01:21 - ERROR - stderr - 29%|██▉ | 1100/3741 [6:35:27<14:58:29, 20.41s/it] +2025-05-11 00:01:21 - ERROR - stderr - +2025-05-11 00:01:21 - ERROR - stderr - +2025-05-11 00:01:21 - INFO - stdout - {'loss': 0.9508, 'grad_norm': 0.5892897248268127, 'learning_rate': 1.6564655717550766e-05, 'epoch': 0.88} +2025-05-11 00:01:21 - ERROR - stderr - 29%|██▉ | 1100/3741 [6:35:27<14:58:29, 20.41s/it] +2025-05-11 00:01:41 - ERROR - stderr - 29%|██▉ | 1101/3741 [6:35:48<15:02:27, 20.51s/it] +2025-05-11 00:01:41 - ERROR - stderr - +2025-05-11 00:01:41 - ERROR - stderr - +2025-05-11 00:01:41 - INFO - stdout - {'loss': 0.9015, 'grad_norm': 0.641369640827179, 'learning_rate': 1.6558121065261982e-05, 'epoch': 0.88} +2025-05-11 00:01:41 - ERROR - stderr - 29%|██▉ | 1101/3741 [6:35:48<15:02:27, 20.51s/it] +2025-05-11 00:02:01 - ERROR - stderr - 29%|██▉ | 1102/3741 [6:36:07<14:47:35, 20.18s/it] +2025-05-11 00:02:01 - ERROR - stderr - +2025-05-11 00:02:01 - ERROR - stderr - +2025-05-11 00:02:01 - INFO - stdout - {'loss': 0.8589, 'grad_norm': 0.5892067551612854, 'learning_rate': 1.6551581495470683e-05, 'epoch': 0.88} +2025-05-11 00:02:01 - ERROR - stderr - 29%|██▉ | 1102/3741 [6:36:07<14:47:35, 20.18s/it] +2025-05-11 00:02:25 - ERROR - stderr - 29%|██▉ | 1103/3741 [6:36:31<15:39:20, 21.37s/it] +2025-05-11 00:02:25 - ERROR - stderr - +2025-05-11 00:02:25 - ERROR - stderr - +2025-05-11 00:02:25 - INFO - stdout - {'loss': 0.9548, 'grad_norm': 0.5978425145149231, 'learning_rate': 1.6545037013080455e-05, 'epoch': 0.88} +2025-05-11 00:02:25 - ERROR - stderr - 29%|██▉ | 1103/3741 [6:36:31<15:39:20, 21.37s/it] +2025-05-11 00:02:44 - ERROR - stderr - 30%|██▉ | 1104/3741 [6:36:51<15:14:27, 20.81s/it] +2025-05-11 00:02:44 - ERROR - stderr - +2025-05-11 00:02:44 - ERROR - stderr - +2025-05-11 00:02:44 - INFO - stdout - {'loss': 0.9711, 'grad_norm': 0.6504059433937073, 'learning_rate': 1.6538487622998576e-05, 'epoch': 0.89} +2025-05-11 00:02:44 - ERROR - stderr - 30%|██▉ | 1104/3741 [6:36:51<15:14:27, 20.81s/it] +2025-05-11 00:03:09 - ERROR - stderr - 30%|██▉ | 1105/3741 [6:37:15<16:01:59, 21.90s/it] +2025-05-11 00:03:09 - ERROR - stderr - +2025-05-11 00:03:09 - ERROR - stderr - +2025-05-11 00:03:09 - INFO - stdout - {'loss': 0.9147, 'grad_norm': 0.6230421662330627, 'learning_rate': 1.6531933330136e-05, 'epoch': 0.89} +2025-05-11 00:03:09 - ERROR - stderr - 30%|██▉ | 1105/3741 [6:37:15<16:01:59, 21.90s/it] +2025-05-11 00:03:29 - ERROR - stderr - 30%|██▉ | 1106/3741 [6:37:35<15:40:45, 21.42s/it] +2025-05-11 00:03:29 - ERROR - stderr - +2025-05-11 00:03:29 - ERROR - stderr - +2025-05-11 00:03:29 - INFO - stdout - {'loss': 0.963, 'grad_norm': 0.6024095416069031, 'learning_rate': 1.652537413940736e-05, 'epoch': 0.89} +2025-05-11 00:03:29 - ERROR - stderr - 30%|██▉ | 1106/3741 [6:37:35<15:40:45, 21.42s/it] +2025-05-11 00:03:50 - ERROR - stderr - 30%|██▉ | 1107/3741 [6:37:56<15:27:24, 21.13s/it] +2025-05-11 00:03:50 - ERROR - stderr - +2025-05-11 00:03:50 - ERROR - stderr - +2025-05-11 00:03:50 - INFO - stdout - {'loss': 0.9197, 'grad_norm': 0.643290102481842, 'learning_rate': 1.6518810055730962e-05, 'epoch': 0.89} +2025-05-11 00:03:50 - ERROR - stderr - 30%|██▉ | 1107/3741 [6:37:56<15:27:24, 21.13s/it] +2025-05-11 00:04:09 - ERROR - stderr - 30%|██▉ | 1108/3741 [6:38:15<15:06:24, 20.65s/it] +2025-05-11 00:04:09 - ERROR - stderr - +2025-05-11 00:04:09 - ERROR - stderr - +2025-05-11 00:04:09 - INFO - stdout - {'loss': 0.9211, 'grad_norm': 0.6359246373176575, 'learning_rate': 1.6512241084028775e-05, 'epoch': 0.89} +2025-05-11 00:04:09 - ERROR - stderr - 30%|██▉ | 1108/3741 [6:38:15<15:06:24, 20.65s/it] +2025-05-11 00:04:29 - ERROR - stderr - 30%|██▉ | 1109/3741 [6:38:35<14:53:40, 20.37s/it] +2025-05-11 00:04:29 - ERROR - stderr - +2025-05-11 00:04:29 - ERROR - stderr - +2025-05-11 00:04:29 - INFO - stdout - {'loss': 0.8995, 'grad_norm': 0.5819621682167053, 'learning_rate': 1.6505667229226445e-05, 'epoch': 0.89} +2025-05-11 00:04:29 - ERROR - stderr - 30%|██▉ | 1109/3741 [6:38:35<14:53:40, 20.37s/it] +2025-05-11 00:04:53 - ERROR - stderr - 30%|██▉ | 1110/3741 [6:38:59<15:41:21, 21.47s/it] +2025-05-11 00:04:53 - ERROR - stderr - +2025-05-11 00:04:53 - ERROR - stderr - +2025-05-11 00:04:53 - INFO - stdout - {'loss': 0.901, 'grad_norm': 0.624454140663147, 'learning_rate': 1.6499088496253266e-05, 'epoch': 0.89} +2025-05-11 00:04:53 - ERROR - stderr - 30%|██▉ | 1110/3741 [6:38:59<15:41:21, 21.47s/it] +2025-05-11 00:05:13 - ERROR - stderr - 30%|██▉ | 1111/3741 [6:39:19<15:19:39, 20.98s/it] +2025-05-11 00:05:13 - ERROR - stderr - +2025-05-11 00:05:13 - ERROR - stderr - +2025-05-11 00:05:13 - INFO - stdout - {'loss': 0.8551, 'grad_norm': 0.608256459236145, 'learning_rate': 1.6492504890042196e-05, 'epoch': 0.89} +2025-05-11 00:05:13 - ERROR - stderr - 30%|██▉ | 1111/3741 [6:39:19<15:19:39, 20.98s/it] +2025-05-11 00:05:37 - ERROR - stderr - 30%|██▉ | 1112/3741 [6:39:43<15:59:52, 21.91s/it] +2025-05-11 00:05:37 - ERROR - stderr - +2025-05-11 00:05:37 - ERROR - stderr - +2025-05-11 00:05:37 - INFO - stdout - {'loss': 0.9358, 'grad_norm': 0.6560264825820923, 'learning_rate': 1.6485916415529852e-05, 'epoch': 0.89} +2025-05-11 00:05:37 - ERROR - stderr - 30%|██▉ | 1112/3741 [6:39:43<15:59:52, 21.91s/it] +2025-05-11 00:05:37 - INFO - stdout - WARNING: tokenization mismatch: 3227 vs. 3245. (ignored) +2025-05-11 00:05:57 - ERROR - stderr - 30%|██▉ | 1113/3741 [6:40:03<15:38:33, 21.43s/it] +2025-05-11 00:05:57 - ERROR - stderr - +2025-05-11 00:05:57 - ERROR - stderr - +2025-05-11 00:05:57 - INFO - stdout - {'loss': 0.9347, 'grad_norm': 0.5924005508422852, 'learning_rate': 1.6479323077656492e-05, 'epoch': 0.89} +2025-05-11 00:05:57 - ERROR - stderr - 30%|██▉ | 1113/3741 [6:40:03<15:38:33, 21.43s/it] +2025-05-11 00:06:17 - ERROR - stderr - 30%|██▉ | 1114/3741 [6:40:23<15:18:04, 20.97s/it] +2025-05-11 00:06:17 - ERROR - stderr - +2025-05-11 00:06:17 - ERROR - stderr - +2025-05-11 00:06:17 - INFO - stdout - {'loss': 0.9396, 'grad_norm': 0.6272872686386108, 'learning_rate': 1.647272488136603e-05, 'epoch': 0.89} +2025-05-11 00:06:17 - ERROR - stderr - 30%|██▉ | 1114/3741 [6:40:23<15:18:04, 20.97s/it] +2025-05-11 00:06:37 - ERROR - stderr - 30%|██▉ | 1115/3741 [6:40:43<15:03:44, 20.65s/it] +2025-05-11 00:06:37 - ERROR - stderr - +2025-05-11 00:06:37 - ERROR - stderr - +2025-05-11 00:06:37 - INFO - stdout - {'loss': 0.9505, 'grad_norm': 0.5873216986656189, 'learning_rate': 1.6466121831606013e-05, 'epoch': 0.89} +2025-05-11 00:06:37 - ERROR - stderr - 30%|██▉ | 1115/3741 [6:40:43<15:03:44, 20.65s/it] +2025-05-11 00:06:56 - ERROR - stderr - 30%|██▉ | 1116/3741 [6:41:03<14:48:00, 20.30s/it] +2025-05-11 00:06:56 - ERROR - stderr - +2025-05-11 00:06:56 - ERROR - stderr - +2025-05-11 00:06:56 - INFO - stdout - {'loss': 0.9651, 'grad_norm': 0.5705021023750305, 'learning_rate': 1.6459513933327637e-05, 'epoch': 0.89} +2025-05-11 00:06:56 - ERROR - stderr - 30%|██▉ | 1116/3741 [6:41:03<14:48:00, 20.30s/it] +2025-05-11 00:07:20 - ERROR - stderr - 30%|██▉ | 1117/3741 [6:41:26<15:31:25, 21.30s/it] +2025-05-11 00:07:20 - ERROR - stderr - +2025-05-11 00:07:20 - ERROR - stderr - +2025-05-11 00:07:20 - INFO - stdout - {'loss': 0.8757, 'grad_norm': 0.6147488951683044, 'learning_rate': 1.6452901191485725e-05, 'epoch': 0.9} +2025-05-11 00:07:20 - ERROR - stderr - 30%|██▉ | 1117/3741 [6:41:26<15:31:25, 21.30s/it] +2025-05-11 00:07:39 - ERROR - stderr - 30%|██▉ | 1118/3741 [6:41:46<15:06:26, 20.73s/it] +2025-05-11 00:07:39 - ERROR - stderr - +2025-05-11 00:07:39 - ERROR - stderr - +2025-05-11 00:07:39 - INFO - stdout - {'loss': 0.9019, 'grad_norm': 0.589171826839447, 'learning_rate': 1.6446283611038735e-05, 'epoch': 0.9} +2025-05-11 00:07:39 - ERROR - stderr - 30%|██▉ | 1118/3741 [6:41:46<15:06:26, 20.73s/it] +2025-05-11 00:08:03 - ERROR - stderr - 30%|██▉ | 1119/3741 [6:42:09<15:41:18, 21.54s/it] +2025-05-11 00:08:03 - ERROR - stderr - +2025-05-11 00:08:03 - ERROR - stderr - +2025-05-11 00:08:03 - INFO - stdout - {'loss': 0.9234, 'grad_norm': 0.5974717736244202, 'learning_rate': 1.643966119694876e-05, 'epoch': 0.9} +2025-05-11 00:08:03 - ERROR - stderr - 30%|██▉ | 1119/3741 [6:42:09<15:41:18, 21.54s/it] +2025-05-11 00:08:22 - ERROR - stderr - 30%|██▉ | 1120/3741 [6:42:29<15:15:09, 20.95s/it] +2025-05-11 00:08:22 - ERROR - stderr - +2025-05-11 00:08:22 - ERROR - stderr - +2025-05-11 00:08:22 - INFO - stdout - {'loss': 0.9127, 'grad_norm': 0.5791064500808716, 'learning_rate': 1.643303395418151e-05, 'epoch': 0.9} +2025-05-11 00:08:22 - ERROR - stderr - 30%|██▉ | 1120/3741 [6:42:29<15:15:09, 20.95s/it] +2025-05-11 00:08:42 - ERROR - stderr - 30%|██▉ | 1121/3741 [6:42:48<14:54:30, 20.48s/it] +2025-05-11 00:08:42 - ERROR - stderr - +2025-05-11 00:08:42 - ERROR - stderr - +2025-05-11 00:08:42 - INFO - stdout - {'loss': 0.8784, 'grad_norm': 0.6018791198730469, 'learning_rate': 1.642640188770632e-05, 'epoch': 0.9} +2025-05-11 00:08:42 - ERROR - stderr - 30%|██▉ | 1121/3741 [6:42:48<14:54:30, 20.48s/it] +2025-05-11 00:09:03 - ERROR - stderr - 30%|██▉ | 1122/3741 [6:43:09<15:04:18, 20.72s/it] +2025-05-11 00:09:03 - ERROR - stderr - +2025-05-11 00:09:03 - ERROR - stderr - +2025-05-11 00:09:03 - INFO - stdout - {'loss': 0.9173, 'grad_norm': 0.5726858973503113, 'learning_rate': 1.641976500249613e-05, 'epoch': 0.9} +2025-05-11 00:09:03 - ERROR - stderr - 30%|██▉ | 1122/3741 [6:43:09<15:04:18, 20.72s/it] +2025-05-11 00:09:22 - ERROR - stderr - 30%|███ | 1123/3741 [6:43:29<14:45:19, 20.29s/it] +2025-05-11 00:09:22 - ERROR - stderr - +2025-05-11 00:09:22 - ERROR - stderr - +2025-05-11 00:09:22 - INFO - stdout - {'loss': 0.9295, 'grad_norm': 0.6228300333023071, 'learning_rate': 1.641312330352751e-05, 'epoch': 0.9} +2025-05-11 00:09:22 - ERROR - stderr - 30%|███ | 1123/3741 [6:43:29<14:45:19, 20.29s/it] +2025-05-11 00:09:46 - ERROR - stderr - 30%|███ | 1124/3741 [6:43:53<15:31:51, 21.36s/it] +2025-05-11 00:09:46 - ERROR - stderr - +2025-05-11 00:09:46 - ERROR - stderr - +2025-05-11 00:09:46 - INFO - stdout - {'loss': 0.9149, 'grad_norm': 0.5762906670570374, 'learning_rate': 1.6406476795780634e-05, 'epoch': 0.9} +2025-05-11 00:09:46 - ERROR - stderr - 30%|███ | 1124/3741 [6:43:53<15:31:51, 21.36s/it] +2025-05-11 00:10:06 - ERROR - stderr - 30%|███ | 1125/3741 [6:44:12<15:06:40, 20.80s/it] +2025-05-11 00:10:06 - ERROR - stderr - +2025-05-11 00:10:06 - ERROR - stderr - +2025-05-11 00:10:06 - INFO - stdout - {'loss': 0.962, 'grad_norm': 0.6083019375801086, 'learning_rate': 1.639982548423927e-05, 'epoch': 0.9} +2025-05-11 00:10:06 - ERROR - stderr - 30%|███ | 1125/3741 [6:44:12<15:06:40, 20.80s/it] +2025-05-11 00:10:29 - ERROR - stderr - 30%|███ | 1126/3741 [6:44:35<15:39:12, 21.55s/it] +2025-05-11 00:10:29 - ERROR - stderr - +2025-05-11 00:10:29 - ERROR - stderr - +2025-05-11 00:10:29 - INFO - stdout - {'loss': 0.9129, 'grad_norm': 0.6070680022239685, 'learning_rate': 1.6393169373890805e-05, 'epoch': 0.9} +2025-05-11 00:10:29 - ERROR - stderr - 30%|███ | 1126/3741 [6:44:35<15:39:12, 21.55s/it] +2025-05-11 00:10:49 - ERROR - stderr - 30%|███ | 1127/3741 [6:44:55<15:15:03, 21.00s/it] +2025-05-11 00:10:49 - ERROR - stderr - +2025-05-11 00:10:49 - ERROR - stderr - +2025-05-11 00:10:49 - INFO - stdout - {'loss': 0.9209, 'grad_norm': 0.5900879502296448, 'learning_rate': 1.6386508469726215e-05, 'epoch': 0.9} +2025-05-11 00:10:49 - ERROR - stderr - 30%|███ | 1127/3741 [6:44:55<15:15:03, 21.00s/it] +2025-05-11 00:11:09 - ERROR - stderr - 30%|███ | 1128/3741 [6:45:15<15:01:33, 20.70s/it] +2025-05-11 00:11:09 - ERROR - stderr - +2025-05-11 00:11:09 - ERROR - stderr - +2025-05-11 00:11:09 - INFO - stdout - {'loss': 0.9232, 'grad_norm': 0.5943062901496887, 'learning_rate': 1.637984277674008e-05, 'epoch': 0.9} +2025-05-11 00:11:09 - ERROR - stderr - 30%|███ | 1128/3741 [6:45:15<15:01:33, 20.70s/it] +2025-05-11 00:11:28 - ERROR - stderr - 30%|███ | 1129/3741 [6:45:35<14:45:35, 20.34s/it] +2025-05-11 00:11:28 - ERROR - stderr - +2025-05-11 00:11:28 - ERROR - stderr - +2025-05-11 00:11:28 - INFO - stdout - {'loss': 0.9191, 'grad_norm': 0.6167227029800415, 'learning_rate': 1.6373172299930553e-05, 'epoch': 0.91} +2025-05-11 00:11:28 - ERROR - stderr - 30%|███ | 1129/3741 [6:45:35<14:45:35, 20.34s/it] +2025-05-11 00:11:48 - ERROR - stderr - 30%|███ | 1130/3741 [6:45:54<14:34:35, 20.10s/it] +2025-05-11 00:11:48 - ERROR - stderr - +2025-05-11 00:11:48 - ERROR - stderr - +2025-05-11 00:11:48 - INFO - stdout - {'loss': 0.8991, 'grad_norm': 0.5882411003112793, 'learning_rate': 1.636649704429939e-05, 'epoch': 0.91} +2025-05-11 00:11:48 - ERROR - stderr - 30%|███ | 1130/3741 [6:45:54<14:34:35, 20.10s/it] +2025-05-11 00:12:11 - ERROR - stderr - 30%|███ | 1131/3741 [6:46:17<15:13:41, 21.00s/it] +2025-05-11 00:12:11 - ERROR - stderr - +2025-05-11 00:12:11 - ERROR - stderr - +2025-05-11 00:12:11 - INFO - stdout - {'loss': 0.9584, 'grad_norm': 0.5918989777565002, 'learning_rate': 1.6359817014851925e-05, 'epoch': 0.91} +2025-05-11 00:12:11 - ERROR - stderr - 30%|███ | 1131/3741 [6:46:17<15:13:41, 21.00s/it] +2025-05-11 00:12:30 - ERROR - stderr - 30%|███ | 1132/3741 [6:46:37<14:54:10, 20.56s/it] +2025-05-11 00:12:30 - ERROR - stderr - +2025-05-11 00:12:30 - ERROR - stderr - +2025-05-11 00:12:30 - INFO - stdout - {'loss': 0.9231, 'grad_norm': 0.5983660817146301, 'learning_rate': 1.635313221659707e-05, 'epoch': 0.91} +2025-05-11 00:12:30 - ERROR - stderr - 30%|███ | 1132/3741 [6:46:37<14:54:10, 20.56s/it] +2025-05-11 00:12:54 - ERROR - stderr - 30%|███ | 1133/3741 [6:47:00<15:27:29, 21.34s/it] +2025-05-11 00:12:54 - ERROR - stderr - +2025-05-11 00:12:54 - ERROR - stderr - +2025-05-11 00:12:54 - INFO - stdout - {'loss': 0.9037, 'grad_norm': 0.5728667378425598, 'learning_rate': 1.6346442654547314e-05, 'epoch': 0.91} +2025-05-11 00:12:54 - ERROR - stderr - 30%|███ | 1133/3741 [6:47:00<15:27:29, 21.34s/it] +2025-05-11 00:13:13 - ERROR - stderr - 30%|███ | 1134/3741 [6:47:20<15:06:21, 20.86s/it] +2025-05-11 00:13:13 - ERROR - stderr - +2025-05-11 00:13:13 - ERROR - stderr - +2025-05-11 00:13:13 - INFO - stdout - {'loss': 0.8928, 'grad_norm': 0.6043873429298401, 'learning_rate': 1.633974833371872e-05, 'epoch': 0.91} +2025-05-11 00:13:13 - ERROR - stderr - 30%|███ | 1134/3741 [6:47:20<15:06:21, 20.86s/it] +2025-05-11 00:13:33 - ERROR - stderr - 30%|███ | 1135/3741 [6:47:39<14:52:55, 20.56s/it] +2025-05-11 00:13:33 - ERROR - stderr - +2025-05-11 00:13:33 - ERROR - stderr - +2025-05-11 00:13:33 - INFO - stdout - {'loss': 0.9516, 'grad_norm': 0.604079008102417, 'learning_rate': 1.633304925913092e-05, 'epoch': 0.91} +2025-05-11 00:13:33 - ERROR - stderr - 30%|███ | 1135/3741 [6:47:39<14:52:55, 20.56s/it] +2025-05-11 00:13:53 - ERROR - stderr - 30%|███ | 1136/3741 [6:47:59<14:43:03, 20.34s/it] +2025-05-11 00:13:53 - ERROR - stderr - +2025-05-11 00:13:53 - ERROR - stderr - +2025-05-11 00:13:53 - INFO - stdout - {'loss': 0.942, 'grad_norm': 0.611089825630188, 'learning_rate': 1.6326345435807104e-05, 'epoch': 0.91} +2025-05-11 00:13:53 - ERROR - stderr - 30%|███ | 1136/3741 [6:47:59<14:43:03, 20.34s/it] +2025-05-11 00:14:13 - ERROR - stderr - 30%|███ | 1137/3741 [6:48:19<14:35:01, 20.16s/it] +2025-05-11 00:14:13 - ERROR - stderr - +2025-05-11 00:14:13 - ERROR - stderr - +2025-05-11 00:14:13 - INFO - stdout - {'loss': 0.9315, 'grad_norm': 0.61098313331604, 'learning_rate': 1.631963686877403e-05, 'epoch': 0.91} +2025-05-11 00:14:13 - ERROR - stderr - 30%|███ | 1137/3741 [6:48:19<14:35:01, 20.16s/it] +2025-05-11 00:14:36 - ERROR - stderr - 30%|███ | 1138/3741 [6:48:42<15:16:33, 21.13s/it] +2025-05-11 00:14:36 - ERROR - stderr - +2025-05-11 00:14:36 - ERROR - stderr - +2025-05-11 00:14:36 - INFO - stdout - {'loss': 0.8947, 'grad_norm': 0.597474217414856, 'learning_rate': 1.6312923563062008e-05, 'epoch': 0.91} +2025-05-11 00:14:36 - ERROR - stderr - 30%|███ | 1138/3741 [6:48:42<15:16:33, 21.13s/it] +2025-05-11 00:14:56 - ERROR - stderr - 30%|███ | 1139/3741 [6:49:02<14:58:29, 20.72s/it] +2025-05-11 00:14:56 - ERROR - stderr - +2025-05-11 00:14:56 - ERROR - stderr - +2025-05-11 00:14:56 - INFO - stdout - {'loss': 0.9241, 'grad_norm': 0.6015665531158447, 'learning_rate': 1.6306205523704903e-05, 'epoch': 0.91} +2025-05-11 00:14:56 - ERROR - stderr - 30%|███ | 1139/3741 [6:49:02<14:58:29, 20.72s/it] +2025-05-11 00:15:19 - ERROR - stderr - 30%|███ | 1140/3741 [6:49:25<15:26:57, 21.38s/it] +2025-05-11 00:15:19 - ERROR - stderr - +2025-05-11 00:15:19 - ERROR - stderr - +2025-05-11 00:15:19 - INFO - stdout - {'loss': 0.9079, 'grad_norm': 0.559998095035553, 'learning_rate': 1.6299482755740132e-05, 'epoch': 0.91} +2025-05-11 00:15:19 - ERROR - stderr - 30%|███ | 1140/3741 [6:49:25<15:26:57, 21.38s/it] +2025-05-11 00:15:38 - ERROR - stderr - 30%|███ | 1141/3741 [6:49:44<15:00:30, 20.78s/it] +2025-05-11 00:15:38 - ERROR - stderr - +2025-05-11 00:15:38 - ERROR - stderr - +2025-05-11 00:15:38 - INFO - stdout - {'loss': 0.9465, 'grad_norm': 0.5764912962913513, 'learning_rate': 1.6292755264208656e-05, 'epoch': 0.91} +2025-05-11 00:15:38 - ERROR - stderr - 30%|███ | 1141/3741 [6:49:45<15:00:30, 20.78s/it] +2025-05-11 00:15:58 - ERROR - stderr - 31%|███ | 1142/3741 [6:50:04<14:42:47, 20.38s/it] +2025-05-11 00:15:58 - ERROR - stderr - +2025-05-11 00:15:58 - ERROR - stderr - +2025-05-11 00:15:58 - INFO - stdout - {'loss': 0.9198, 'grad_norm': 0.6615179181098938, 'learning_rate': 1.6286023054154973e-05, 'epoch': 0.92} +2025-05-11 00:15:58 - ERROR - stderr - 31%|███ | 1142/3741 [6:50:04<14:42:47, 20.38s/it] +2025-05-11 00:16:17 - ERROR - stderr - 31%|███ | 1143/3741 [6:50:24<14:35:33, 20.22s/it] +2025-05-11 00:16:17 - ERROR - stderr - +2025-05-11 00:16:17 - ERROR - stderr - +2025-05-11 00:16:17 - INFO - stdout - {'loss': 0.9332, 'grad_norm': 0.6102979183197021, 'learning_rate': 1.6279286130627124e-05, 'epoch': 0.92} +2025-05-11 00:16:17 - ERROR - stderr - 31%|███ | 1143/3741 [6:50:24<14:35:33, 20.22s/it] +2025-05-11 00:16:37 - ERROR - stderr - 31%|███ | 1144/3741 [6:50:43<14:25:22, 19.99s/it] +2025-05-11 00:16:37 - ERROR - stderr - +2025-05-11 00:16:37 - ERROR - stderr - +2025-05-11 00:16:37 - INFO - stdout - {'loss': 0.9494, 'grad_norm': 0.5873243808746338, 'learning_rate': 1.627254449867669e-05, 'epoch': 0.92} +2025-05-11 00:16:37 - ERROR - stderr - 31%|███ | 1144/3741 [6:50:43<14:25:22, 19.99s/it] +2025-05-11 00:17:00 - ERROR - stderr - 31%|███ | 1145/3741 [6:51:06<14:58:23, 20.76s/it] +2025-05-11 00:17:00 - ERROR - stderr - +2025-05-11 00:17:00 - ERROR - stderr - +2025-05-11 00:17:00 - INFO - stdout - {'loss': 0.8697, 'grad_norm': 0.5706033110618591, 'learning_rate': 1.626579816335877e-05, 'epoch': 0.92} +2025-05-11 00:17:00 - ERROR - stderr - 31%|███ | 1145/3741 [6:51:06<14:58:23, 20.76s/it] +2025-05-11 00:17:20 - ERROR - stderr - 31%|███ | 1146/3741 [6:51:26<14:52:29, 20.64s/it] +2025-05-11 00:17:20 - ERROR - stderr - +2025-05-11 00:17:20 - ERROR - stderr - +2025-05-11 00:17:20 - INFO - stdout - {'loss': 0.9287, 'grad_norm': 0.6418749094009399, 'learning_rate': 1.6259047129731996e-05, 'epoch': 0.92} +2025-05-11 00:17:20 - ERROR - stderr - 31%|███ | 1146/3741 [6:51:26<14:52:29, 20.64s/it] +2025-05-11 00:17:41 - ERROR - stderr - 31%|███ | 1147/3741 [6:51:48<15:03:35, 20.90s/it] +2025-05-11 00:17:41 - ERROR - stderr - +2025-05-11 00:17:41 - ERROR - stderr - +2025-05-11 00:17:41 - INFO - stdout - {'loss': 0.9095, 'grad_norm': 0.6301258206367493, 'learning_rate': 1.6252291402858525e-05, 'epoch': 0.92} +2025-05-11 00:17:41 - ERROR - stderr - 31%|███ | 1147/3741 [6:51:48<15:03:35, 20.90s/it] +2025-05-11 00:18:01 - ERROR - stderr - 31%|███ | 1148/3741 [6:52:07<14:44:12, 20.46s/it] +2025-05-11 00:18:01 - ERROR - stderr - +2025-05-11 00:18:01 - ERROR - stderr - +2025-05-11 00:18:01 - INFO - stdout - {'loss': 0.9062, 'grad_norm': 0.6077032685279846, 'learning_rate': 1.6245530987804034e-05, 'epoch': 0.92} +2025-05-11 00:18:01 - ERROR - stderr - 31%|███ | 1148/3741 [6:52:07<14:44:12, 20.46s/it] +2025-05-11 00:18:20 - ERROR - stderr - 31%|█���█ | 1149/3741 [6:52:27<14:32:02, 20.19s/it] +2025-05-11 00:18:20 - ERROR - stderr - +2025-05-11 00:18:20 - ERROR - stderr - +2025-05-11 00:18:20 - INFO - stdout - {'loss': 0.9294, 'grad_norm': 0.6020398139953613, 'learning_rate': 1.6238765889637704e-05, 'epoch': 0.92} +2025-05-11 00:18:20 - ERROR - stderr - 31%|███ | 1149/3741 [6:52:27<14:32:02, 20.19s/it] +2025-05-11 00:18:43 - ERROR - stderr - 31%|███ | 1150/3741 [6:52:50<15:06:24, 20.99s/it] +2025-05-11 00:18:43 - ERROR - stderr - +2025-05-11 00:18:43 - ERROR - stderr - +2025-05-11 00:18:43 - INFO - stdout - {'loss': 0.9235, 'grad_norm': 0.6611399054527283, 'learning_rate': 1.6231996113432242e-05, 'epoch': 0.92} +2025-05-11 00:18:43 - ERROR - stderr - 31%|███ | 1150/3741 [6:52:50<15:06:24, 20.99s/it] +2025-05-11 00:19:02 - ERROR - stderr - 31%|███ | 1151/3741 [6:53:09<14:43:39, 20.47s/it] +2025-05-11 00:19:02 - ERROR - stderr - +2025-05-11 00:19:02 - ERROR - stderr - +2025-05-11 00:19:02 - INFO - stdout - {'loss': 0.9033, 'grad_norm': 0.6157788634300232, 'learning_rate': 1.6225221664263857e-05, 'epoch': 0.92} +2025-05-11 00:19:02 - ERROR - stderr - 31%|███ | 1151/3741 [6:53:09<14:43:39, 20.47s/it] +2025-05-11 00:19:26 - ERROR - stderr - 31%|███ | 1152/3741 [6:53:32<15:18:28, 21.29s/it] +2025-05-11 00:19:26 - ERROR - stderr - +2025-05-11 00:19:26 - ERROR - stderr - +2025-05-11 00:19:26 - INFO - stdout - {'loss': 0.8995, 'grad_norm': 0.59830242395401, 'learning_rate': 1.6218442547212265e-05, 'epoch': 0.92} +2025-05-11 00:19:26 - ERROR - stderr - 31%|███ | 1152/3741 [6:53:32<15:18:28, 21.29s/it] +2025-05-11 00:19:45 - ERROR - stderr - 31%|███ | 1153/3741 [6:53:52<14:56:25, 20.78s/it] +2025-05-11 00:19:45 - ERROR - stderr - +2025-05-11 00:19:45 - ERROR - stderr - +2025-05-11 00:19:45 - INFO - stdout - {'loss': 0.9215, 'grad_norm': 0.626473069190979, 'learning_rate': 1.6211658767360667e-05, 'epoch': 0.92} +2025-05-11 00:19:45 - ERROR - stderr - 31%|███ | 1153/3741 [6:53:52<14:56:25, 20.78s/it] +2025-05-11 00:20:05 - ERROR - stderr - 31%|███ | 1154/3741 [6:54:12<14:46:58, 20.57s/it] +2025-05-11 00:20:05 - ERROR - stderr - +2025-05-11 00:20:05 - ERROR - stderr - +2025-05-11 00:20:05 - INFO - stdout - {'loss': 0.9305, 'grad_norm': 0.5951080322265625, 'learning_rate': 1.620487032979578e-05, 'epoch': 0.93} +2025-05-11 00:20:05 - ERROR - stderr - 31%|███ | 1154/3741 [6:54:12<14:46:58, 20.57s/it] +2025-05-11 00:20:27 - ERROR - stderr - 31%|███ | 1155/3741 [6:54:33<14:59:52, 20.88s/it] +2025-05-11 00:20:27 - ERROR - stderr - +2025-05-11 00:20:27 - ERROR - stderr - +2025-05-11 00:20:27 - INFO - stdout - {'loss': 0.9093, 'grad_norm': 0.6206769943237305, 'learning_rate': 1.619807723960781e-05, 'epoch': 0.93} +2025-05-11 00:20:27 - ERROR - stderr - 31%|███ | 1155/3741 [6:54:33<14:59:52, 20.88s/it] +2025-05-11 00:20:46 - ERROR - stderr - 31%|███ | 1156/3741 [6:54:53<14:41:35, 20.46s/it] +2025-05-11 00:20:46 - ERROR - stderr - +2025-05-11 00:20:46 - ERROR - stderr - +2025-05-11 00:20:46 - INFO - stdout - {'loss': 0.9339, 'grad_norm': 0.6188283562660217, 'learning_rate': 1.619127950189044e-05, 'epoch': 0.93} +2025-05-11 00:20:46 - ERROR - stderr - 31%|███ | 1156/3741 [6:54:53<14:41:35, 20.46s/it] +2025-05-11 00:21:10 - ERROR - stderr - 31%|███ | 1157/3741 [6:55:17<15:27:30, 21.54s/it] +2025-05-11 00:21:10 - ERROR - stderr - +2025-05-11 00:21:10 - ERROR - stderr - +2025-05-11 00:21:10 - INFO - stdout - {'loss': 0.8635, 'grad_norm': 0.5791252851486206, 'learning_rate': 1.6184477121740848e-05, 'epoch': 0.93} +2025-05-11 00:21:10 - ERROR - stderr - 31%|███ | 1157/3741 [6:55:17<15:27:30, 21.54s/it] +2025-05-11 00:21:30 - ERROR - stderr - 31%|███ | 1158/3741 [6:55:37<15:05:11, 21.03s/it] +2025-05-11 00:21:30 - ERROR - stderr - +2025-05-11 00:21:30 - ERROR - stderr - +2025-05-11 00:21:30 - INFO - stdout - {'loss': 0.8821, 'grad_norm': 0.5923981666564941, 'learning_rate': 1.6177670104259694e-05, 'epoch': 0.93} +2025-05-11 00:21:30 - ERROR - stderr - 31%|███ | 1158/3741 [6:55:37<15:05:11, 21.03s/it] +2025-05-11 00:21:53 - ERROR - stderr - 31%|███ | 1159/3741 [6:55:59<15:26:05, 21.52s/it] +2025-05-11 00:21:53 - ERROR - stderr - +2025-05-11 00:21:53 - ERROR - stderr - +2025-05-11 00:21:53 - INFO - stdout - {'loss': 0.8967, 'grad_norm': 0.5693655610084534, 'learning_rate': 1.61708584545511e-05, 'epoch': 0.93} +2025-05-11 00:21:53 - ERROR - stderr - 31%|███ | 1159/3741 [6:55:59<15:26:05, 21.52s/it] +2025-05-11 00:22:12 - ERROR - stderr - 31%|███ | 1160/3741 [6:56:19<14:58:59, 20.90s/it] +2025-05-11 00:22:12 - ERROR - stderr - +2025-05-11 00:22:12 - ERROR - stderr - +2025-05-11 00:22:12 - INFO - stdout - {'loss': 0.9091, 'grad_norm': 0.6008737087249756, 'learning_rate': 1.616404217772269e-05, 'epoch': 0.93} +2025-05-11 00:22:12 - ERROR - stderr - 31%|███ | 1160/3741 [6:56:19<14:58:59, 20.90s/it] +2025-05-11 00:22:32 - ERROR - stderr - 31%|███ | 1161/3741 [6:56:39<14:46:15, 20.61s/it] +2025-05-11 00:22:32 - ERROR - stderr - +2025-05-11 00:22:32 - ERROR - stderr - +2025-05-11 00:22:32 - INFO - stdout - {'loss': 0.9188, 'grad_norm': 0.5927824974060059, 'learning_rate': 1.6157221278885523e-05, 'epoch': 0.93} +2025-05-11 00:22:32 - ERROR - stderr - 31%|███ | 1161/3741 [6:56:39<14:46:15, 20.61s/it] +2025-05-11 00:22:53 - ERROR - stderr - 31%|███ | 1162/3741 [6:57:00<14:48:41, 20.68s/it] +2025-05-11 00:22:53 - ERROR - stderr - +2025-05-11 00:22:53 - ERROR - stderr - +2025-05-11 00:22:53 - INFO - stdout - {'loss': 0.901, 'grad_norm': 0.6398462653160095, 'learning_rate': 1.615039576315415e-05, 'epoch': 0.93} +2025-05-11 00:22:53 - ERROR - stderr - 31%|███ | 1162/3741 [6:57:00<14:48:41, 20.68s/it] +2025-05-11 00:23:13 - ERROR - stderr - 31%|███ | 1163/3741 [6:57:19<14:34:32, 20.35s/it] +2025-05-11 00:23:13 - ERROR - stderr - +2025-05-11 00:23:13 - ERROR - stderr - +2025-05-11 00:23:13 - INFO - stdout - {'loss': 0.9274, 'grad_norm': 0.6090993285179138, 'learning_rate': 1.6143565635646575e-05, 'epoch': 0.93} +2025-05-11 00:23:13 - ERROR - stderr - 31%|███ | 1163/3741 [6:57:19<14:34:32, 20.35s/it] +2025-05-11 00:23:37 - ERROR - stderr - 31%|███ | 1164/3741 [6:57:43<15:19:23, 21.41s/it] +2025-05-11 00:23:37 - ERROR - stderr - +2025-05-11 00:23:37 - ERROR - stderr - +2025-05-11 00:23:37 - INFO - stdout - {'loss': 0.9281, 'grad_norm': 0.6457433700561523, 'learning_rate': 1.6136730901484267e-05, 'epoch': 0.93} +2025-05-11 00:23:37 - ERROR - stderr - 31%|███ | 1164/3741 [6:57:43<15:19:23, 21.41s/it] +2025-05-11 00:23:56 - ERROR - stderr - 31%|███ | 1165/3741 [6:58:03<14:56:17, 20.88s/it] +2025-05-11 00:23:56 - ERROR - stderr - +2025-05-11 00:23:56 - ERROR - stderr - +2025-05-11 00:23:56 - INFO - stdout - {'loss': 0.9133, 'grad_norm': 0.627136766910553, 'learning_rate': 1.612989156579213e-05, 'epoch': 0.93} +2025-05-11 00:23:56 - ERROR - stderr - 31%|███ | 1165/3741 [6:58:03<14:56:17, 20.88s/it] +2025-05-11 00:24:19 - ERROR - stderr - 31%|███ | 1166/3741 [6:58:26<15:23:18, 21.51s/it] +2025-05-11 00:24:19 - ERROR - stderr - +2025-05-11 00:24:19 - ERROR - stderr - +2025-05-11 00:24:19 - INFO - stdout - {'loss': 0.8857, 'grad_norm': 0.6137925982475281, 'learning_rate': 1.612304763369853e-05, 'epoch': 0.94} +2025-05-11 00:24:19 - ERROR - stderr - 31%|███ | 1166/3741 [6:58:26<15:23:18, 21.51s/it] +2025-05-11 00:24:39 - ERROR - stderr - 31%|███ | 1167/3741 [6:58:45<14:56:29, 20.90s/it] +2025-05-11 00:24:39 - ERROR - stderr - +2025-05-11 00:24:39 - ERROR - stderr - +2025-05-11 00:24:39 - INFO - stdout - {'loss': 0.9099, 'grad_norm': 0.6183207035064697, 'learning_rate': 1.6116199110335295e-05, 'epoch': 0.94} +2025-05-11 00:24:39 - ERROR - stderr - 31%|███ | 1167/3741 [6:58:45<14:56:29, 20.90s/it] +2025-05-11 00:24:59 - ERROR - stderr - 31%|███ | 1168/3741 [6:59:05<14:44:45, 20.63s/it] +2025-05-11 00:24:59 - ERROR - stderr - +2025-05-11 00:24:59 - ERROR - stderr - +2025-05-11 00:24:59 - INFO - stdout - {'loss': 0.9584, 'grad_norm': 0.6730118989944458, 'learning_rate': 1.610934600083767e-05, 'epoch': 0.94} +2025-05-11 00:24:59 - ERROR - stderr - 31%|███ | 1168/3741 [6:59:05<14:44:45, 20.63s/it] +2025-05-11 00:25:20 - ERROR - stderr - 31%|███ | 1169/3741 [6:59:27<14:57:43, 20.94s/it] +2025-05-11 00:25:20 - ERROR - stderr - +2025-05-11 00:25:20 - ERROR - stderr - +2025-05-11 00:25:20 - INFO - stdout - {'loss': 0.9138, 'grad_norm': 0.6072790622711182, 'learning_rate': 1.610248831034435e-05, 'epoch': 0.94} +2025-05-11 00:25:20 - ERROR - stderr - 31%|███ | 1169/3741 [6:59:27<14:57:43, 20.94s/it] +2025-05-11 00:25:40 - ERROR - stderr - 31%|███▏ | 1170/3741 [6:59:46<14:37:51, 20.49s/it] +2025-05-11 00:25:40 - ERROR - stderr - +2025-05-11 00:25:40 - ERROR - stderr - +2025-05-11 00:25:40 - INFO - stdout - {'loss': 0.938, 'grad_norm': 0.6239385008811951, 'learning_rate': 1.609562604399747e-05, 'epoch': 0.94} +2025-05-11 00:25:40 - ERROR - stderr - 31%|███▏ | 1170/3741 [6:59:46<14:37:51, 20.49s/it] +2025-05-11 00:26:03 - ERROR - stderr - 31%|███▏ | 1171/3741 [7:00:10<15:17:28, 21.42s/it] +2025-05-11 00:26:03 - ERROR - stderr - +2025-05-11 00:26:03 - ERROR - stderr - +2025-05-11 00:26:03 - INFO - stdout - {'loss': 0.8756, 'grad_norm': 0.6454656720161438, 'learning_rate': 1.6088759206942586e-05, 'epoch': 0.94} +2025-05-11 00:26:03 - ERROR - stderr - 31%|███▏ | 1171/3741 [7:00:10<15:17:28, 21.42s/it] +2025-05-11 00:26:23 - ERROR - stderr - 31%|███▏ | 1172/3741 [7:00:29<14:53:14, 20.86s/it] +2025-05-11 00:26:23 - ERROR - stderr - +2025-05-11 00:26:23 - ERROR - stderr - +2025-05-11 00:26:23 - INFO - stdout - {'loss': 0.9057, 'grad_norm': 0.6884939074516296, 'learning_rate': 1.6081887804328687e-05, 'epoch': 0.94} +2025-05-11 00:26:23 - ERROR - stderr - 31%|███▏ | 1172/3741 [7:00:29<14:53:14, 20.86s/it] +2025-05-11 00:26:46 - ERROR - stderr - 31%|███▏ | 1173/3741 [7:00:53<15:24:16, 21.60s/it] +2025-05-11 00:26:46 - ERROR - stderr - +2025-05-11 00:26:46 - ERROR - stderr - +2025-05-11 00:26:46 - INFO - stdout - {'loss': 0.9183, 'grad_norm': 0.6258487105369568, 'learning_rate': 1.607501184130819e-05, 'epoch': 0.94} +2025-05-11 00:26:46 - ERROR - stderr - 31%|███▏ | 1173/3741 [7:00:53<15:24:16, 21.60s/it] +2025-05-11 00:27:06 - ERROR - stderr - 31%|███▏ | 1174/3741 [7:01:12<14:58:21, 21.00s/it] +2025-05-11 00:27:06 - ERROR - stderr - +2025-05-11 00:27:06 - ERROR - stderr - +2025-05-11 00:27:06 - INFO - stdout - {'loss': 0.9145, 'grad_norm': 0.576998770236969, 'learning_rate': 1.606813132303692e-05, 'epoch': 0.94} +2025-05-11 00:27:06 - ERROR - stderr - 31%|███▏ | 1174/3741 [7:01:12<14:58:21, 21.00s/it] +2025-05-11 00:27:25 - ERROR - stderr - 31%|███▏ | 1175/3741 [7:01:32<14:38:04, 20.53s/it] +2025-05-11 00:27:25 - ERROR - stderr - +2025-05-11 00:27:25 - ERROR - stderr - +2025-05-11 00:27:25 - INFO - stdout - {'loss': 0.9295, 'grad_norm': 0.6313689351081848, 'learning_rate': 1.606124625467413e-05, 'epoch': 0.94} +2025-05-11 00:27:25 - ERROR - stderr - 31%|███▏ | 1175/3741 [7:01:32<14:38:04, 20.53s/it] +2025-05-11 00:27:47 - ERROR - stderr - 31%|███▏ | 1176/3741 [7:01:53<14:47:06, 20.75s/it] +2025-05-11 00:27:47 - ERROR - stderr - +2025-05-11 00:27:47 - ERROR - stderr - +2025-05-11 00:27:47 - INFO - stdout - {'loss': 0.8613, 'grad_norm': 0.6496961116790771, 'learning_rate': 1.605435664138247e-05, 'epoch': 0.94} +2025-05-11 00:27:47 - ERROR - stderr - 31%|███▏ | 1176/3741 [7:01:53<14:47:06, 20.75s/it] +2025-05-11 00:28:06 - ERROR - stderr - 31%|███▏ | 1177/3741 [7:02:12<14:28:05, 20.31s/it] +2025-05-11 00:28:06 - ERROR - stderr - +2025-05-11 00:28:06 - ERROR - stderr - +2025-05-11 00:28:06 - INFO - stdout - {'loss': 0.942, 'grad_norm': 0.6497421264648438, 'learning_rate': 1.6047462488328017e-05, 'epoch': 0.94} +2025-05-11 00:28:06 - ERROR - stderr - 31%|███▏ | 1177/3741 [7:02:12<14:28:05, 20.31s/it] +2025-05-11 00:28:29 - ERROR - stderr - 31%|███▏ | 1178/3741 [7:02:35<14:58:19, 21.03s/it] +2025-05-11 00:28:29 - ERROR - stderr - +2025-05-11 00:28:29 - ERROR - stderr - +2025-05-11 00:28:29 - INFO - stdout - {'loss': 0.9454, 'grad_norm': 0.6210437417030334, 'learning_rate': 1.604056380068023e-05, 'epoch': 0.94} +2025-05-11 00:28:29 - ERROR - stderr - 31%|███▏ | 1178/3741 [7:02:35<14:58:19, 21.03s/it] +2025-05-11 00:28:48 - ERROR - stderr - 32%|███▏ | 1179/3741 [7:02:54<14:37:57, 20.56s/it] +2025-05-11 00:28:48 - ERROR - stderr - +2025-05-11 00:28:48 - ERROR - stderr - +2025-05-11 00:28:48 - INFO - stdout - {'loss': 0.8651, 'grad_norm': 0.5845626592636108, 'learning_rate': 1.6033660583611988e-05, 'epoch': 0.95} +2025-05-11 00:28:48 - ERROR - stderr - 32%|███▏ | 1179/3741 [7:02:54<14:37:57, 20.56s/it] +2025-05-11 00:29:11 - ERROR - stderr - 32%|███▏ | 1180/3741 [7:03:18<15:13:07, 21.39s/it] +2025-05-11 00:29:11 - ERROR - stderr - +2025-05-11 00:29:11 - ERROR - stderr - +2025-05-11 00:29:11 - INFO - stdout - {'loss': 0.963, 'grad_norm': 0.643485426902771, 'learning_rate': 1.6026752842299564e-05, 'epoch': 0.95} +2025-05-11 00:29:11 - ERROR - stderr - 32%|███▏ | 1180/3741 [7:03:18<15:13:07, 21.39s/it] +2025-05-11 00:29:31 - ERROR - stderr - 32%|███▏ | 1181/3741 [7:03:37<14:49:18, 20.84s/it] +2025-05-11 00:29:31 - ERROR - stderr - +2025-05-11 00:29:31 - ERROR - stderr - +2025-05-11 00:29:31 - INFO - stdout - {'loss': 0.9274, 'grad_norm': 0.6154069900512695, 'learning_rate': 1.6019840581922604e-05, 'epoch': 0.95} +2025-05-11 00:29:31 - ERROR - stderr - 32%|███▏ | 1181/3741 [7:03:37<14:49:18, 20.84s/it] +2025-05-11 00:29:51 - ERROR - stderr - 32%|███▏ | 1182/3741 [7:03:57<14:34:47, 20.51s/it] +2025-05-11 00:29:51 - ERROR - stderr - +2025-05-11 00:29:51 - ERROR - stderr - +2025-05-11 00:29:51 - INFO - stdout - {'loss': 0.936, 'grad_norm': 0.6406455039978027, 'learning_rate': 1.6012923807664164e-05, 'epoch': 0.95} +2025-05-11 00:29:51 - ERROR - stderr - 32%|███▏ | 1182/3741 [7:03:57<14:34:47, 20.51s/it] +2025-05-11 00:30:13 - ERROR - stderr - 32%|███▏ | 1183/3741 [7:04:19<14:57:41, 21.06s/it] +2025-05-11 00:30:13 - ERROR - stderr - +2025-05-11 00:30:13 - ERROR - stderr - +2025-05-11 00:30:13 - INFO - stdout - {'loss': 0.8924, 'grad_norm': 0.6149894595146179, 'learning_rate': 1.6006002524710674e-05, 'epoch': 0.95} +2025-05-11 00:30:13 - ERROR - stderr - 32%|███▏ | 1183/3741 [7:04:19<14:57:41, 21.06s/it] +2025-05-11 00:30:33 - ERROR - stderr - 32%|███▏ | 1184/3741 [7:04:39<14:38:25, 20.61s/it] +2025-05-11 00:30:33 - ERROR - stderr - +2025-05-11 00:30:33 - ERROR - stderr - +2025-05-11 00:30:33 - INFO - stdout - {'loss': 0.9235, 'grad_norm': 0.5921556353569031, 'learning_rate': 1.599907673825195e-05, 'epoch': 0.95} +2025-05-11 00:30:33 - ERROR - stderr - 32%|███▏ | 1184/3741 [7:04:39<14:38:25, 20.61s/it] +2025-05-11 00:30:56 - ERROR - stderr - 32%|███▏ | 1185/3741 [7:05:02<15:13:45, 21.45s/it] +2025-05-11 00:30:56 - ERROR - stderr - +2025-05-11 00:30:56 - ERROR - stderr - +2025-05-11 00:30:56 - INFO - stdout - {'loss': 0.8885, 'grad_norm': 0.6019387245178223, 'learning_rate': 1.599214645348118e-05, 'epoch': 0.95} +2025-05-11 00:30:56 - ERROR - stderr - 32%|███▏ | 1185/3741 [7:05:02<15:13:45, 21.45s/it] +2025-05-11 00:31:15 - ERROR - stderr - 32%|███▏ | 1186/3741 [7:05:22<14:47:30, 20.84s/it] +2025-05-11 00:31:15 - ERROR - stderr - +2025-05-11 00:31:15 - ERROR - stderr - +2025-05-11 00:31:15 - INFO - stdout - {'loss': 0.9241, 'grad_norm': 0.6550159454345703, 'learning_rate': 1.5985211675594933e-05, 'epoch': 0.95} +2025-05-11 00:31:15 - ERROR - stderr - 32%|███▏ | 1186/3741 [7:05:22<14:47:30, 20.84s/it] +2025-05-11 00:31:39 - ERROR - stderr - 32%|███▏ | 1187/3741 [7:05:45<15:16:01, 21.52s/it] +2025-05-11 00:31:39 - ERROR - stderr - +2025-05-11 00:31:39 - ERROR - stderr - +2025-05-11 00:31:39 - INFO - stdout - {'loss': 0.965, 'grad_norm': 0.639525294303894, 'learning_rate': 1.5978272409793136e-05, 'epoch': 0.95} +2025-05-11 00:31:39 - ERROR - stderr - 32%|███▏ | 1187/3741 [7:05:45<15:16:01, 21.52s/it] +2025-05-11 00:31:58 - ERROR - stderr - 32%|███▏ | 1188/3741 [7:06:05<14:51:31, 20.95s/it] +2025-05-11 00:31:58 - ERROR - stderr - +2025-05-11 00:31:58 - ERROR - stderr - +2025-05-11 00:31:58 - INFO - stdout - {'loss': 0.8969, 'grad_norm': 0.6293081045150757, 'learning_rate': 1.597132866127909e-05, 'epoch': 0.95} +2025-05-11 00:31:58 - ERROR - stderr - 32%|███▏ | 1188/3741 [7:06:05<14:51:31, 20.95s/it] +2025-05-11 00:32:18 - ERROR - stderr - 32%|███▏ | 1189/3741 [7:06:24<14:36:48, 20.61s/it] +2025-05-11 00:32:18 - ERROR - stderr - +2025-05-11 00:32:18 - ERROR - stderr - +2025-05-11 00:32:18 - INFO - stdout - {'loss': 0.869, 'grad_norm': 0.6015990376472473, 'learning_rate': 1.5964380435259448e-05, 'epoch': 0.95} +2025-05-11 00:32:18 - ERROR - stderr - 32%|███▏ | 1189/3741 [7:06:24<14:36:48, 20.61s/it] +2025-05-11 00:32:40 - ERROR - stderr - 32%|███▏ | 1190/3741 [7:06:47<14:59:05, 21.15s/it] +2025-05-11 00:32:40 - ERROR - stderr - +2025-05-11 00:32:40 - ERROR - stderr - +2025-05-11 00:32:40 - INFO - stdout - {'loss': 0.9152, 'grad_norm': 0.5990379452705383, 'learning_rate': 1.595742773694424e-05, 'epoch': 0.95} +2025-05-11 00:32:40 - ERROR - stderr - 32%|███▏ | 1190/3741 [7:06:47<14:59:05, 21.15s/it] +2025-05-11 00:33:00 - ERROR - stderr - 32%|███▏ | 1191/3741 [7:07:06<14:40:11, 20.71s/it] +2025-05-11 00:33:00 - ERROR - stderr - +2025-05-11 00:33:00 - ERROR - stderr - +2025-05-11 00:33:00 - INFO - stdout - {'loss': 0.9237, 'grad_norm': 0.6142351031303406, 'learning_rate': 1.5950470571546818e-05, 'epoch': 0.96} +2025-05-11 00:33:00 - ERROR - stderr - 32%|███▏ | 1191/3741 [7:07:06<14:40:11, 20.71s/it] +2025-05-11 00:33:24 - ERROR - stderr - 32%|███▏ | 1192/3741 [7:07:30<15:14:27, 21.53s/it] +2025-05-11 00:33:24 - ERROR - stderr - +2025-05-11 00:33:24 - ERROR - stderr - +2025-05-11 00:33:24 - INFO - stdout - {'loss': 0.8922, 'grad_norm': 0.6138330101966858, 'learning_rate': 1.5943508944283916e-05, 'epoch': 0.96} +2025-05-11 00:33:24 - ERROR - stderr - 32%|███▏ | 1192/3741 [7:07:30<15:14:27, 21.53s/it] +2025-05-11 00:33:43 - ERROR - stderr - 32%|███▏ | 1193/3741 [7:07:49<14:48:18, 20.92s/it] +2025-05-11 00:33:43 - ERROR - stderr - +2025-05-11 00:33:43 - ERROR - stderr - +2025-05-11 00:33:43 - INFO - stdout - {'loss': 0.9292, 'grad_norm': 0.6090849041938782, 'learning_rate': 1.5936542860375594e-05, 'epoch': 0.96} +2025-05-11 00:33:43 - ERROR - stderr - 32%|███▏ | 1193/3741 [7:07:49<14:48:18, 20.92s/it] +2025-05-11 00:34:08 - ERROR - stderr - 32%|███▏ | 1194/3741 [7:08:14<15:35:42, 22.04s/it] +2025-05-11 00:34:08 - ERROR - stderr - +2025-05-11 00:34:08 - ERROR - stderr - +2025-05-11 00:34:08 - INFO - stdout - {'loss': 0.8901, 'grad_norm': 0.6265794634819031, 'learning_rate': 1.592957232504526e-05, 'epoch': 0.96} +2025-05-11 00:34:08 - ERROR - stderr - 32%|███▏ | 1194/3741 [7:08:14<15:35:42, 22.04s/it] +2025-05-11 00:34:28 - ERROR - stderr - 32%|███▏ | 1195/3741 [7:08:34<15:07:09, 21.38s/it] +2025-05-11 00:34:28 - ERROR - stderr - +2025-05-11 00:34:28 - ERROR - stderr - +2025-05-11 00:34:28 - INFO - stdout - {'loss': 0.8742, 'grad_norm': 0.6023232936859131, 'learning_rate': 1.5922597343519654e-05, 'epoch': 0.96} +2025-05-11 00:34:28 - ERROR - stderr - 32%|███▏ | 1195/3741 [7:08:34<15:07:09, 21.38s/it] +2025-05-11 00:34:47 - ERROR - stderr - 32%|███▏ | 1196/3741 [7:08:54<14:48:54, 20.96s/it] +2025-05-11 00:34:48 - ERROR - stderr - +2025-05-11 00:34:48 - ERROR - stderr - +2025-05-11 00:34:48 - INFO - stdout - {'loss': 0.8904, 'grad_norm': 0.5652976632118225, 'learning_rate': 1.591561792102886e-05, 'epoch': 0.96} +2025-05-11 00:34:48 - ERROR - stderr - 32%|███▏ | 1196/3741 [7:08:54<14:48:54, 20.96s/it] +2025-05-11 00:35:08 - ERROR - stderr - 32%|███▏ | 1197/3741 [7:09:15<14:45:42, 20.89s/it] +2025-05-11 00:35:08 - ERROR - stderr - +2025-05-11 00:35:08 - ERROR - stderr - +2025-05-11 00:35:08 - INFO - stdout - {'loss': 0.9088, 'grad_norm': 0.6332113742828369, 'learning_rate': 1.5908634062806285e-05, 'epoch': 0.96} +2025-05-11 00:35:08 - ERROR - stderr - 32%|███▏ | 1197/3741 [7:09:15<14:45:42, 20.89s/it] +2025-05-11 00:35:28 - ERROR - stderr - 32%|███▏ | 1198/3741 [7:09:34<14:30:47, 20.55s/it] +2025-05-11 00:35:28 - ERROR - stderr - +2025-05-11 00:35:28 - ERROR - stderr - +2025-05-11 00:35:28 - INFO - stdout - {'loss': 0.8891, 'grad_norm': 0.6024392247200012, 'learning_rate': 1.5901645774088662e-05, 'epoch': 0.96} +2025-05-11 00:35:28 - ERROR - stderr - 32%|███▏ | 1198/3741 [7:09:34<14:30:47, 20.55s/it] +2025-05-11 00:35:50 - ERROR - stderr - 32%|███▏ | 1199/3741 [7:09:56<14:50:08, 21.01s/it] +2025-05-11 00:35:50 - ERROR - stderr - +2025-05-11 00:35:50 - ERROR - stderr - +2025-05-11 00:35:50 - INFO - stdout - {'loss': 0.9047, 'grad_norm': 0.6040472984313965, 'learning_rate': 1.5894653060116053e-05, 'epoch': 0.96} +2025-05-11 00:35:50 - ERROR - stderr - 32%|███▏ | 1199/3741 [7:09:56<14:50:08, 21.01s/it] +2025-05-11 00:36:09 - ERROR - stderr - 32%|███▏ | 1200/3741 [7:10:16<14:28:33, 20.51s/it] +2025-05-11 00:36:09 - ERROR - stderr - +2025-05-11 00:36:09 - ERROR - stderr - +2025-05-11 00:36:09 - INFO - stdout - {'loss': 0.9191, 'grad_norm': 0.5790461301803589, 'learning_rate': 1.5887655926131832e-05, 'epoch': 0.96} +2025-05-11 00:36:09 - ERROR - stderr - 32%|███▏ | 1200/3741 [7:10:16<14:28:33, 20.51s/it] +2025-05-11 00:36:32 - ERROR - stderr - 32%|███▏ | 1201/3741 [7:10:39<14:57:46, 21.21s/it] +2025-05-11 00:36:32 - ERROR - stderr - +2025-05-11 00:36:32 - ERROR - stderr - +2025-05-11 00:36:32 - INFO - stdout - {'loss': 0.9197, 'grad_norm': 0.6125912666320801, 'learning_rate': 1.588065437738268e-05, 'epoch': 0.96} +2025-05-11 00:36:32 - ERROR - stderr - 32%|███▏ | 1201/3741 [7:10:39<14:57:46, 21.21s/it] +2025-05-11 00:36:52 - ERROR - stderr - 32%|███▏ | 1202/3741 [7:10:58<14:34:22, 20.66s/it] +2025-05-11 00:36:52 - ERROR - stderr - +2025-05-11 00:36:52 - ERROR - stderr - +2025-05-11 00:36:52 - INFO - stdout - {'loss': 0.879, 'grad_norm': 0.5846207141876221, 'learning_rate': 1.587364841911861e-05, 'epoch': 0.96} +2025-05-11 00:36:52 - ERROR - stderr - 32%|███▏ | 1202/3741 [7:10:58<14:34:22, 20.66s/it] +2025-05-11 00:37:12 - ERROR - stderr - 32%|███▏ | 1203/3741 [7:11:18<14:25:58, 20.47s/it] +2025-05-11 00:37:12 - ERROR - stderr - +2025-05-11 00:37:12 - ERROR - stderr - +2025-05-11 00:37:12 - INFO - stdout - {'loss': 0.9328, 'grad_norm': 0.5989664196968079, 'learning_rate': 1.5866638056592916e-05, 'epoch': 0.96} +2025-05-11 00:37:12 - ERROR - stderr - 32%|███▏ | 1203/3741 [7:11:18<14:25:58, 20.47s/it] +2025-05-11 00:37:31 - ERROR - stderr - 32%|███▏ | 1204/3741 [7:11:37<14:12:43, 20.17s/it] +2025-05-11 00:37:31 - ERROR - stderr - +2025-05-11 00:37:31 - ERROR - stderr - +2025-05-11 00:37:31 - INFO - stdout - {'loss': 0.8551, 'grad_norm': 0.5732477307319641, 'learning_rate': 1.5859623295062215e-05, 'epoch': 0.97} +2025-05-11 00:37:31 - ERROR - stderr - 32%|███▏ | 1204/3741 [7:11:37<14:12:43, 20.17s/it] +2025-05-11 00:37:51 - ERROR - stderr - 32%|███▏ | 1205/3741 [7:11:57<14:07:22, 20.05s/it] +2025-05-11 00:37:51 - ERROR - stderr - +2025-05-11 00:37:51 - ERROR - stderr - +2025-05-11 00:37:51 - INFO - stdout - {'loss': 0.9435, 'grad_norm': 0.6074816584587097, 'learning_rate': 1.585260413978641e-05, 'epoch': 0.97} +2025-05-11 00:37:51 - ERROR - stderr - 32%|███▏ | 1205/3741 [7:11:57<14:07:22, 20.05s/it] +2025-05-11 00:38:14 - ERROR - stderr - 32%|███▏ | 1206/3741 [7:12:20<14:42:28, 20.89s/it] +2025-05-11 00:38:14 - ERROR - stderr - +2025-05-11 00:38:14 - ERROR - stderr - +2025-05-11 00:38:14 - INFO - stdout - {'loss': 0.9607, 'grad_norm': 0.6296406388282776, 'learning_rate': 1.5845580596028697e-05, 'epoch': 0.97} +2025-05-11 00:38:14 - ERROR - stderr - 32%|███▏ | 1206/3741 [7:12:20<14:42:28, 20.89s/it] +2025-05-11 00:38:34 - ERROR - stderr - 32%|███▏ | 1207/3741 [7:12:40<14:28:21, 20.56s/it] +2025-05-11 00:38:34 - ERROR - stderr - +2025-05-11 00:38:34 - ERROR - stderr - +2025-05-11 00:38:34 - INFO - stdout - {'loss': 0.9418, 'grad_norm': 3.621976375579834, 'learning_rate': 1.583855266905558e-05, 'epoch': 0.97} +2025-05-11 00:38:34 - ERROR - stderr - 32%|███▏ | 1207/3741 [7:12:40<14:28:21, 20.56s/it] +2025-05-11 00:38:57 - ERROR - stderr - 32%|███▏ | 1208/3741 [7:13:03<15:05:52, 21.46s/it] +2025-05-11 00:38:57 - ERROR - stderr - +2025-05-11 00:38:57 - ERROR - stderr - +2025-05-11 00:38:57 - INFO - stdout - {'loss': 0.9094, 'grad_norm': 0.6303392648696899, 'learning_rate': 1.5831520364136835e-05, 'epoch': 0.97} +2025-05-11 00:38:57 - ERROR - stderr - 32%|███▏ | 1208/3741 [7:13:03<15:05:52, 21.46s/it] +2025-05-11 00:39:16 - ERROR - stderr - 32%|███▏ | 1209/3741 [7:13:23<14:39:39, 20.84s/it] +2025-05-11 00:39:17 - ERROR - stderr - +2025-05-11 00:39:17 - ERROR - stderr - +2025-05-11 00:39:17 - INFO - stdout - {'loss': 0.9088, 'grad_norm': 0.5891981720924377, 'learning_rate': 1.5824483686545517e-05, 'epoch': 0.97} +2025-05-11 00:39:17 - ERROR - stderr - 32%|███▏ | 1209/3741 [7:13:23<14:39:39, 20.84s/it] +2025-05-11 00:39:36 - ERROR - stderr - 32%|███▏ | 1210/3741 [7:13:43<14:28:03, 20.58s/it] +2025-05-11 00:39:36 - ERROR - stderr - +2025-05-11 00:39:36 - ERROR - stderr - +2025-05-11 00:39:36 - INFO - stdout - {'loss': 0.9175, 'grad_norm': 0.7623486518859863, 'learning_rate': 1.581744264155797e-05, 'epoch': 0.97} +2025-05-11 00:39:36 - ERROR - stderr - 32%|███▏ | 1210/3741 [7:13:43<14:28:03, 20.58s/it] +2025-05-11 00:39:56 - ERROR - stderr - 32%|███▏ | 1211/3741 [7:14:02<14:13:25, 20.24s/it] +2025-05-11 00:39:56 - ERROR - stderr - +2025-05-11 00:39:56 - ERROR - stderr - +2025-05-11 00:39:56 - INFO - stdout - {'loss': 0.8938, 'grad_norm': 0.5808781385421753, 'learning_rate': 1.5810397234453816e-05, 'epoch': 0.97} +2025-05-11 00:39:56 - ERROR - stderr - 32%|███▏ | 1211/3741 [7:14:02<14:13:25, 20.24s/it] +2025-05-11 00:40:16 - ERROR - stderr - 32%|███▏ | 1212/3741 [7:14:22<14:09:09, 20.15s/it] +2025-05-11 00:40:16 - ERROR - stderr - +2025-05-11 00:40:16 - ERROR - stderr - +2025-05-11 00:40:16 - INFO - stdout - {'loss': 0.9222, 'grad_norm': 0.5807702541351318, 'learning_rate': 1.5803347470515933e-05, 'epoch': 0.97} +2025-05-11 00:40:16 - ERROR - stderr - 32%|███▏ | 1212/3741 [7:14:22<14:09:09, 20.15s/it] +2025-05-11 00:40:38 - ERROR - stderr - 32%|███▏ | 1213/3741 [7:14:45<14:39:04, 20.86s/it] +2025-05-11 00:40:38 - ERROR - stderr - +2025-05-11 00:40:38 - ERROR - stderr - +2025-05-11 00:40:38 - INFO - stdout - {'loss': 0.909, 'grad_norm': 0.5763218998908997, 'learning_rate': 1.5796293355030476e-05, 'epoch': 0.97} +2025-05-11 00:40:38 - ERROR - stderr - 32%|███▏ | 1213/3741 [7:14:45<14:39:04, 20.86s/it] +2025-05-11 00:40:58 - ERROR - stderr - 32%|███▏ | 1214/3741 [7:15:04<14:22:20, 20.48s/it] +2025-05-11 00:40:58 - ERROR - stderr - +2025-05-11 00:40:58 - ERROR - stderr - +2025-05-11 00:40:58 - INFO - stdout - {'loss': 0.8921, 'grad_norm': 0.5519773364067078, 'learning_rate': 1.578923489328686e-05, 'epoch': 0.97} +2025-05-11 00:40:58 - ERROR - stderr - 32%|███▏ | 1214/3741 [7:15:04<14:22:20, 20.48s/it] +2025-05-11 00:41:21 - ERROR - stderr - 32%|███▏ | 1215/3741 [7:15:28<14:58:07, 21.33s/it] +2025-05-11 00:41:21 - ERROR - stderr - +2025-05-11 00:41:21 - ERROR - stderr - +2025-05-11 00:41:21 - INFO - stdout - {'loss': 0.887, 'grad_norm': 0.579440176486969, 'learning_rate': 1.5782172090577762e-05, 'epoch': 0.97} +2025-05-11 00:41:21 - ERROR - stderr - 32%|███▏ | 1215/3741 [7:15:28<14:58:07, 21.33s/it] +2025-05-11 00:41:41 - ERROR - stderr - 33%|███▎ | 1216/3741 [7:15:48<14:43:38, 21.00s/it] +2025-05-11 00:41:41 - ERROR - stderr - +2025-05-11 00:41:41 - ERROR - stderr - +2025-05-11 00:41:41 - INFO - stdout - {'loss': 0.8632, 'grad_norm': 0.5852498412132263, 'learning_rate': 1.5775104952199113e-05, 'epoch': 0.98} +2025-05-11 00:41:41 - ERROR - stderr - 33%|███▎ | 1216/3741 [7:15:48<14:43:38, 21.00s/it] +2025-05-11 00:42:04 - ERROR - stderr - 33%|███▎ | 1217/3741 [7:16:11<15:06:38, 21.55s/it] +2025-05-11 00:42:04 - ERROR - stderr - +2025-05-11 00:42:04 - ERROR - stderr - +2025-05-11 00:42:04 - INFO - stdout - {'loss': 0.9183, 'grad_norm': 0.606121301651001, 'learning_rate': 1.5768033483450088e-05, 'epoch': 0.98} +2025-05-11 00:42:04 - ERROR - stderr - 33%|███▎ | 1217/3741 [7:16:11<15:06:38, 21.55s/it] +2025-05-11 00:42:24 - ERROR - stderr - 33%|███▎ | 1218/3741 [7:16:30<14:44:17, 21.03s/it] +2025-05-11 00:42:24 - ERROR - stderr - +2025-05-11 00:42:24 - ERROR - stderr - +2025-05-11 00:42:24 - INFO - stdout - {'loss': 0.9547, 'grad_norm': 0.6091791987419128, 'learning_rate': 1.5760957689633127e-05, 'epoch': 0.98} +2025-05-11 00:42:24 - ERROR - stderr - 33%|███▎ | 1218/3741 [7:16:30<14:44:17, 21.03s/it] +2025-05-11 00:42:44 - ERROR - stderr - 33%|███▎ | 1219/3741 [7:16:50<14:26:52, 20.62s/it] +2025-05-11 00:42:44 - ERROR - stderr - +2025-05-11 00:42:44 - ERROR - stderr - +2025-05-11 00:42:44 - INFO - stdout - {'loss': 0.8725, 'grad_norm': 0.6013908386230469, 'learning_rate': 1.575387757605389e-05, 'epoch': 0.98} +2025-05-11 00:42:44 - ERROR - stderr - 33%|███▎ | 1219/3741 [7:16:50<14:26:52, 20.62s/it] +2025-05-11 00:43:05 - ERROR - stderr - 33%|███▎ | 1220/3741 [7:17:11<14:30:46, 20.72s/it] +2025-05-11 00:43:05 - ERROR - stderr - +2025-05-11 00:43:05 - ERROR - stderr - +2025-05-11 00:43:05 - INFO - stdout - {'loss': 0.9157, 'grad_norm': 0.5744641423225403, 'learning_rate': 1.5746793148021292e-05, 'epoch': 0.98} +2025-05-11 00:43:05 - ERROR - stderr - 33%|███▎ | 1220/3741 [7:17:11<14:30:46, 20.72s/it] +2025-05-11 00:43:24 - ERROR - stderr - 33%|███▎ | 1221/3741 [7:17:31<14:16:40, 20.40s/it] +2025-05-11 00:43:24 - ERROR - stderr - +2025-05-11 00:43:24 - ERROR - stderr - +2025-05-11 00:43:24 - INFO - stdout - {'loss': 0.9291, 'grad_norm': 0.6412164568901062, 'learning_rate': 1.5739704410847475e-05, 'epoch': 0.98} +2025-05-11 00:43:24 - ERROR - stderr - 33%|███▎ | 1221/3741 [7:17:31<14:16:40, 20.40s/it] +2025-05-11 00:43:48 - ERROR - stderr - 33%|███▎ | 1222/3741 [7:17:54<14:52:24, 21.26s/it] +2025-05-11 00:43:48 - ERROR - stderr - +2025-05-11 00:43:48 - ERROR - stderr - +2025-05-11 00:43:48 - INFO - stdout - {'loss': 0.941, 'grad_norm': 0.5992948412895203, 'learning_rate': 1.5732611369847818e-05, 'epoch': 0.98} +2025-05-11 00:43:48 - ERROR - stderr - 33%|███▎ | 1222/3741 [7:17:54<14:52:24, 21.26s/it] +2025-05-11 00:44:07 - ERROR - stderr - 33%|███▎ | 1223/3741 [7:18:14<14:33:10, 20.81s/it] +2025-05-11 00:44:07 - ERROR - stderr - +2025-05-11 00:44:07 - ERROR - stderr - +2025-05-11 00:44:07 - INFO - stdout - {'loss': 0.8843, 'grad_norm': 0.613304078578949, 'learning_rate': 1.5725514030340926e-05, 'epoch': 0.98} +2025-05-11 00:44:07 - ERROR - stderr - 33%|███▎ | 1223/3741 [7:18:14<14:33:10, 20.81s/it] +2025-05-11 00:44:30 - ERROR - stderr - 33%|███▎ | 1224/3741 [7:18:36<14:50:53, 21.24s/it] +2025-05-11 00:44:30 - ERROR - stderr - +2025-05-11 00:44:30 - ERROR - stderr - +2025-05-11 00:44:30 - INFO - stdout - {'loss': 0.9606, 'grad_norm': 0.5996186137199402, 'learning_rate': 1.5718412397648627e-05, 'epoch': 0.98} +2025-05-11 00:44:30 - ERROR - stderr - 33%|███▎ | 1224/3741 [7:18:36<14:50:53, 21.24s/it] +2025-05-11 00:44:49 - ERROR - stderr - 33%|███▎ | 1225/3741 [7:18:56<14:28:57, 20.72s/it] +2025-05-11 00:44:49 - ERROR - stderr - +2025-05-11 00:44:49 - ERROR - stderr - +2025-05-11 00:44:49 - INFO - stdout - {'loss': 0.8808, 'grad_norm': 0.8090897798538208, 'learning_rate': 1.5711306477095962e-05, 'epoch': 0.98} +2025-05-11 00:44:49 - ERROR - stderr - 33%|███▎ | 1225/3741 [7:18:56<14:28:57, 20.72s/it] +2025-05-11 00:45:09 - ERROR - stderr - 33%|███▎ | 1226/3741 [7:19:16<14:23:15, 20.59s/it] +2025-05-11 00:45:10 - ERROR - stderr - +2025-05-11 00:45:10 - ERROR - stderr - +2025-05-11 00:45:10 - INFO - stdout - {'loss': 0.9475, 'grad_norm': 0.5831454396247864, 'learning_rate': 1.5704196274011198e-05, 'epoch': 0.98} +2025-05-11 00:45:10 - ERROR - stderr - 33%|███▎ | 1226/3741 [7:19:16<14:23:15, 20.59s/it] +2025-05-11 00:45:30 - ERROR - stderr - 33%|███▎ | 1227/3741 [7:19:36<14:20:13, 20.53s/it] +2025-05-11 00:45:30 - ERROR - stderr - +2025-05-11 00:45:30 - ERROR - stderr - +2025-05-11 00:45:30 - INFO - stdout - {'loss': 0.9014, 'grad_norm': 0.6127690672874451, 'learning_rate': 1.56970817937258e-05, 'epoch': 0.98} +2025-05-11 00:45:30 - ERROR - stderr - 33%|███▎ | 1227/3741 [7:19:36<14:20:13, 20.53s/it] +2025-05-11 00:45:49 - ERROR - stderr - 33%|███▎ | 1228/3741 [7:19:55<14:04:17, 20.16s/it] +2025-05-11 00:45:49 - ERROR - stderr - +2025-05-11 00:45:49 - ERROR - stderr - +2025-05-11 00:45:49 - INFO - stdout - {'loss': 0.9017, 'grad_norm': 0.6199975609779358, 'learning_rate': 1.5689963041574453e-05, 'epoch': 0.98} +2025-05-11 00:45:49 - ERROR - stderr - 33%|███▎ | 1228/3741 [7:19:55<14:04:17, 20.16s/it] +2025-05-11 00:46:13 - ERROR - stderr - 33%|███▎ | 1229/3741 [7:20:19<14:45:11, 21.14s/it] +2025-05-11 00:46:13 - ERROR - stderr - +2025-05-11 00:46:13 - ERROR - stderr - +2025-05-11 00:46:13 - INFO - stdout - {'loss': 0.9638, 'grad_norm': 0.618943452835083, 'learning_rate': 1.568284002289504e-05, 'epoch': 0.99} +2025-05-11 00:46:13 - ERROR - stderr - 33%|███▎ | 1229/3741 [7:20:19<14:45:11, 21.14s/it] +2025-05-11 00:46:33 - ERROR - stderr - 33%|███▎ | 1230/3741 [7:20:39<14:29:30, 20.78s/it] +2025-05-11 00:46:33 - ERROR - stderr - +2025-05-11 00:46:33 - ERROR - stderr - +2025-05-11 00:46:33 - INFO - stdout - {'loss': 0.9033, 'grad_norm': 0.5724090337753296, 'learning_rate': 1.567571274302864e-05, 'epoch': 0.99} +2025-05-11 00:46:33 - ERROR - stderr - 33%|███▎ | 1230/3741 [7:20:39<14:29:30, 20.78s/it] +2025-05-11 00:46:56 - ERROR - stderr - 33%|███▎ | 1231/3741 [7:21:02<15:03:20, 21.59s/it] +2025-05-11 00:46:56 - ERROR - stderr - +2025-05-11 00:46:56 - ERROR - stderr - +2025-05-11 00:46:56 - INFO - stdout - {'loss': 0.8814, 'grad_norm': 0.6071799397468567, 'learning_rate': 1.5668581207319536e-05, 'epoch': 0.99} +2025-05-11 00:46:56 - ERROR - stderr - 33%|███▎ | 1231/3741 [7:21:02<15:03:20, 21.59s/it] +2025-05-11 00:47:15 - ERROR - stderr - 33%|███▎ | 1232/3741 [7:21:22<14:36:15, 20.95s/it] +2025-05-11 00:47:16 - ERROR - stderr - +2025-05-11 00:47:16 - ERROR - stderr - +2025-05-11 00:47:16 - INFO - stdout - {'loss': 0.9195, 'grad_norm': 0.6167645454406738, 'learning_rate': 1.5661445421115188e-05, 'epoch': 0.99} +2025-05-11 00:47:16 - ERROR - stderr - 33%|███▎ | 1232/3741 [7:21:22<14:36:15, 20.95s/it] +2025-05-11 00:47:35 - ERROR - stderr - 33%|███▎ | 1233/3741 [7:21:41<14:19:51, 20.57s/it] +2025-05-11 00:47:35 - ERROR - stderr - +2025-05-11 00:47:35 - ERROR - stderr - +2025-05-11 00:47:35 - INFO - stdout - {'loss': 0.8856, 'grad_norm': 0.5625812411308289, 'learning_rate': 1.5654305389766257e-05, 'epoch': 0.99} +2025-05-11 00:47:35 - ERROR - stderr - 33%|███▎ | 1233/3741 [7:21:41<14:19:51, 20.57s/it] +2025-05-11 00:47:57 - ERROR - stderr - 33%|███▎ | 1234/3741 [7:22:03<14:30:18, 20.83s/it] +2025-05-11 00:47:57 - ERROR - stderr - +2025-05-11 00:47:57 - ERROR - stderr - +2025-05-11 00:47:57 - INFO - stdout - {'loss': 0.8532, 'grad_norm': 0.6261424422264099, 'learning_rate': 1.5647161118626583e-05, 'epoch': 0.99} +2025-05-11 00:47:57 - ERROR - stderr - 33%|███▎ | 1234/3741 [7:22:03<14:30:18, 20.83s/it] +2025-05-11 00:48:16 - ERROR - stderr - 33%|███▎ | 1235/3741 [7:22:22<14:11:07, 20.38s/it] +2025-05-11 00:48:16 - ERROR - stderr - +2025-05-11 00:48:16 - ERROR - stderr - +2025-05-11 00:48:16 - INFO - stdout - {'loss': 0.9229, 'grad_norm': 0.5534703135490417, 'learning_rate': 1.5640012613053176e-05, 'epoch': 0.99} +2025-05-11 00:48:16 - ERROR - stderr - 33%|███▎ | 1235/3741 [7:22:22<14:11:07, 20.38s/it] +2025-05-11 00:48:38 - ERROR - stderr - 33%|███▎ | 1236/3741 [7:22:44<14:32:25, 20.90s/it] +2025-05-11 00:48:38 - ERROR - stderr - +2025-05-11 00:48:38 - ERROR - stderr - +2025-05-11 00:48:38 - INFO - stdout - {'loss': 0.9122, 'grad_norm': 0.5943836569786072, 'learning_rate': 1.563285987840624e-05, 'epoch': 0.99} +2025-05-11 00:48:38 - ERROR - stderr - 33%|███▎ | 1236/3741 [7:22:44<14:32:25, 20.90s/it] +2025-05-11 00:48:58 - ERROR - stderr - 33%|███▎ | 1237/3741 [7:23:04<14:15:53, 20.51s/it] +2025-05-11 00:48:58 - ERROR - stderr - +2025-05-11 00:48:58 - ERROR - stderr - +2025-05-11 00:48:58 - INFO - stdout - {'loss': 0.8596, 'grad_norm': 0.5869540572166443, 'learning_rate': 1.562570292004913e-05, 'epoch': 0.99} +2025-05-11 00:48:58 - ERROR - stderr - 33%|███▎ | 1237/3741 [7:23:04<14:15:53, 20.51s/it] +2025-05-11 00:49:20 - ERROR - stderr - 33%|███▎ | 1238/3741 [7:23:26<14:39:01, 21.07s/it] +2025-05-11 00:49:20 - ERROR - stderr - +2025-05-11 00:49:20 - ERROR - stderr - +2025-05-11 00:49:20 - INFO - stdout - {'loss': 0.8861, 'grad_norm': 0.5831838846206665, 'learning_rate': 1.561854174334838e-05, 'epoch': 0.99} +2025-05-11 00:49:20 - ERROR - stderr - 33%|███▎ | 1238/3741 [7:23:26<14:39:01, 21.07s/it] +2025-05-11 00:49:40 - ERROR - stderr - 33%|███▎ | 1239/3741 [7:23:46<14:23:16, 20.70s/it] +2025-05-11 00:49:40 - ERROR - stderr - +2025-05-11 00:49:40 - ERROR - stderr - +2025-05-11 00:49:40 - INFO - stdout - {'loss': 0.9125, 'grad_norm': 0.6431090831756592, 'learning_rate': 1.5611376353673686e-05, 'epoch': 0.99} +2025-05-11 00:49:40 - ERROR - stderr - 33%|███▎ | 1239/3741 [7:23:46<14:23:16, 20.70s/it] +2025-05-11 00:49:59 - ERROR - stderr - 33%|███▎ | 1240/3741 [7:24:06<14:07:02, 20.32s/it] +2025-05-11 00:49:59 - ERROR - stderr - +2025-05-11 00:49:59 - ERROR - stderr - +2025-05-11 00:49:59 - INFO - stdout - {'loss': 0.9392, 'grad_norm': 0.5620553493499756, 'learning_rate': 1.56042067563979e-05, 'epoch': 0.99} +2025-05-11 00:49:59 - ERROR - stderr - 33%|███▎ | 1240/3741 [7:24:06<14:07:02, 20.32s/it] +2025-05-11 00:50:21 - ERROR - stderr - 33%|███▎ | 1241/3741 [7:24:27<14:22:20, 20.70s/it] +2025-05-11 00:50:21 - ERROR - stderr - +2025-05-11 00:50:21 - ERROR - stderr - +2025-05-11 00:50:21 - INFO - stdout - {'loss': 0.892, 'grad_norm': 0.5939339399337769, 'learning_rate': 1.5597032956897028e-05, 'epoch': 1.0} +2025-05-11 00:50:21 - ERROR - stderr - 33%|███▎ | 1241/3741 [7:24:27<14:22:20, 20.70s/it] +2025-05-11 00:50:40 - ERROR - stderr - 33%|███▎ | 1242/3741 [7:24:47<14:08:07, 20.36s/it] +2025-05-11 00:50:40 - ERROR - stderr - +2025-05-11 00:50:40 - ERROR - stderr - +2025-05-11 00:50:40 - INFO - stdout - {'loss': 0.9498, 'grad_norm': 0.6007006764411926, 'learning_rate': 1.558985496055023e-05, 'epoch': 1.0} +2025-05-11 00:50:40 - ERROR - stderr - 33%|███▎ | 1242/3741 [7:24:47<14:08:07, 20.36s/it] +2025-05-11 00:51:04 - ERROR - stderr - 33%|███▎ | 1243/3741 [7:25:10<14:44:00, 21.23s/it] +2025-05-11 00:51:04 - ERROR - stderr - +2025-05-11 00:51:04 - ERROR - stderr - +2025-05-11 00:51:04 - INFO - stdout - {'loss': 0.8872, 'grad_norm': 0.5932491421699524, 'learning_rate': 1.5582672772739815e-05, 'epoch': 1.0} +2025-05-11 00:51:04 - ERROR - stderr - 33%|███▎ | 1243/3741 [7:25:10<14:44:00, 21.23s/it] +2025-05-11 00:51:23 - ERROR - stderr - 33%|███▎ | 1244/3741 [7:25:30<14:23:10, 20.74s/it] +2025-05-11 00:51:23 - ERROR - stderr - +2025-05-11 00:51:23 - ERROR - stderr - +2025-05-11 00:51:23 - INFO - stdout - {'loss': 0.9013, 'grad_norm': 0.6062937378883362, 'learning_rate': 1.5575486398851232e-05, 'epoch': 1.0} +2025-05-11 00:51:23 - ERROR - stderr - 33%|███▎ | 1244/3741 [7:25:30<14:23:10, 20.74s/it] +2025-05-11 00:51:43 - ERROR - stderr - 33%|███▎ | 1245/3741 [7:25:50<14:15:20, 20.56s/it] +2025-05-11 00:51:43 - ERROR - stderr - +2025-05-11 00:51:43 - ERROR - stderr - +2025-05-11 00:51:43 - INFO - stdout - {'loss': 0.8867, 'grad_norm': 0.6182659268379211, 'learning_rate': 1.5568295844273064e-05, 'epoch': 1.0} +2025-05-11 00:51:43 - ERROR - stderr - 33%|███▎ | 1245/3741 [7:25:50<14:15:20, 20.56s/it] +2025-05-11 00:52:03 - ERROR - stderr - 33%|███▎ | 1246/3741 [7:26:09<14:01:13, 20.23s/it] +2025-05-11 00:52:03 - ERROR - stderr - +2025-05-11 00:52:03 - ERROR - stderr - +2025-05-11 00:52:03 - INFO - stdout - {'loss': 0.9485, 'grad_norm': 0.6514543294906616, 'learning_rate': 1.5561101114397043e-05, 'epoch': 1.0} +2025-05-11 00:52:03 - ERROR - stderr - 33%|███▎ | 1246/3741 [7:26:09<14:01:13, 20.23s/it] +2025-05-11 00:52:21 - ERROR - stderr - 33%|███▎ | 1247/3741 [7:26:27<13:35:43, 19.62s/it] +2025-05-11 00:52:21 - ERROR - stderr - +2025-05-11 00:52:21 - ERROR - stderr - +2025-05-11 00:52:21 - INFO - stdout - {'loss': 0.9416, 'grad_norm': 0.610464870929718, 'learning_rate': 1.555390221461801e-05, 'epoch': 1.0} +2025-05-11 00:52:21 - ERROR - stderr - 33%|███▎ | 1247/3741 [7:26:27<13:35:43, 19.62s/it] +2025-05-11 00:52:49 - ERROR - stderr - 33%|███▎ | 1248/3741 [7:26:55<15:14:06, 22.00s/it] +2025-05-11 00:52:49 - ERROR - stderr - +2025-05-11 00:52:49 - ERROR - stderr - +2025-05-11 00:52:49 - INFO - stdout - {'loss': 0.7783, 'grad_norm': 0.5633330941200256, 'learning_rate': 1.554669915033395e-05, 'epoch': 1.0} +2025-05-11 00:52:49 - ERROR - stderr - 33%|███▎ | 1248/3741 [7:26:55<15:14:06, 22.00s/it] +2025-05-11 00:53:08 - ERROR - stderr - 33%|███▎ | 1249/3741 [7:27:15<14:45:17, 21.32s/it] +2025-05-11 00:53:08 - ERROR - stderr - +2025-05-11 00:53:08 - ERROR - stderr - +2025-05-11 00:53:08 - INFO - stdout - {'loss': 0.7874, 'grad_norm': 0.5994012951850891, 'learning_rate': 1.553949192694597e-05, 'epoch': 1.0} +2025-05-11 00:53:08 - ERROR - stderr - 33%|███▎ | 1249/3741 [7:27:15<14:45:17, 21.32s/it] +2025-05-11 00:53:32 - ERROR - stderr - 33%|███▎ | 1250/3741 [7:27:38<15:08:29, 21.88s/it] +2025-05-11 00:53:32 - ERROR - stderr - +2025-05-11 00:53:32 - ERROR - stderr - +2025-05-11 00:53:32 - INFO - stdout - {'loss': 0.772, 'grad_norm': 0.6167377829551697, 'learning_rate': 1.553228054985829e-05, 'epoch': 1.0} +2025-05-11 00:53:32 - ERROR - stderr - 33%|███▎ | 1250/3741 [7:27:38<15:08:29, 21.88s/it] +2025-05-11 00:53:51 - ERROR - stderr - 33%|███▎ | 1251/3741 [7:27:58<14:41:14, 21.23s/it] +2025-05-11 00:53:51 - ERROR - stderr - +2025-05-11 00:53:51 - ERROR - stderr - +2025-05-11 00:53:51 - INFO - stdout - {'loss': 0.7683, 'grad_norm': 0.6523487567901611, 'learning_rate': 1.5525065024478245e-05, 'epoch': 1.0} +2025-05-11 00:53:51 - ERROR - stderr - 33%|███▎ | 1251/3741 [7:27:58<14:41:14, 21.23s/it] +2025-05-11 00:54:11 - ERROR - stderr - 33%|███▎ | 1252/3741 [7:28:17<14:22:20, 20.79s/it] +2025-05-11 00:54:11 - ERROR - stderr - +2025-05-11 00:54:11 - ERROR - stderr - +2025-05-11 00:54:11 - INFO - stdout - {'loss': 0.779, 'grad_norm': 0.6528876423835754, 'learning_rate': 1.5517845356216283e-05, 'epoch': 1.0} +2025-05-11 00:54:11 - ERROR - stderr - 33%|███▎ | 1252/3741 [7:28:17<14:22:20, 20.79s/it] +2025-05-11 00:54:11 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 00:54:11 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 00:54:31 - ERROR - stderr - 33%|███▎ | 1253/3741 [7:28:37<14:08:13, 20.46s/it] +2025-05-11 00:54:31 - ERROR - stderr - +2025-05-11 00:54:31 - ERROR - stderr - +2025-05-11 00:54:31 - INFO - stdout - {'loss': 0.7917, 'grad_norm': 0.6447154879570007, 'learning_rate': 1.551062155048595e-05, 'epoch': 1.0} +2025-05-11 00:54:31 - ERROR - stderr - 33%|███▎ | 1253/3741 [7:28:37<14:08:13, 20.46s/it] +2025-05-11 00:54:55 - ERROR - stderr - 34%|███▎ | 1254/3741 [7:29:01<14:50:19, 21.48s/it] +2025-05-11 00:54:55 - ERROR - stderr - +2025-05-11 00:54:55 - ERROR - stderr - +2025-05-11 00:54:55 - INFO - stdout - {'loss': 0.7961, 'grad_norm': 0.6762365698814392, 'learning_rate': 1.550339361270391e-05, 'epoch': 1.01} +2025-05-11 00:54:55 - ERROR - stderr - 34%|███▎ | 1254/3741 [7:29:01<14:50:19, 21.48s/it] +2025-05-11 00:55:15 - ERROR - stderr - 34%|███▎ | 1255/3741 [7:29:21<14:34:23, 21.10s/it] +2025-05-11 00:55:15 - ERROR - stderr - +2025-05-11 00:55:15 - ERROR - stderr - +2025-05-11 00:55:15 - INFO - stdout - {'loss': 0.764, 'grad_norm': 0.6918103098869324, 'learning_rate': 1.5496161548289918e-05, 'epoch': 1.01} +2025-05-11 00:55:15 - ERROR - stderr - 34%|███▎ | 1255/3741 [7:29:21<14:34:23, 21.10s/it] +2025-05-11 00:55:34 - ERROR - stderr - 34%|███▎ | 1256/3741 [7:29:41<14:14:15, 20.63s/it] +2025-05-11 00:55:34 - ERROR - stderr - +2025-05-11 00:55:34 - ERROR - stderr - +2025-05-11 00:55:34 - INFO - stdout - {'loss': 0.7675, 'grad_norm': 0.6805532574653625, 'learning_rate': 1.5488925362666818e-05, 'epoch': 1.01} +2025-05-11 00:55:34 - ERROR - stderr - 34%|███▎ | 1256/3741 [7:29:41<14:14:15, 20.63s/it] +2025-05-11 00:55:57 - ERROR - stderr - 34%|███▎ | 1257/3741 [7:30:04<14:43:30, 21.34s/it] +2025-05-11 00:55:57 - ERROR - stderr - +2025-05-11 00:55:57 - ERROR - stderr - +2025-05-11 00:55:57 - INFO - stdout - {'loss': 0.7496, 'grad_norm': 0.698422908782959, 'learning_rate': 1.5481685061260547e-05, 'epoch': 1.01} +2025-05-11 00:55:57 - ERROR - stderr - 34%|███▎ | 1257/3741 [7:30:04<14:43:30, 21.34s/it] +2025-05-11 00:56:17 - ERROR - stderr - 34%|███▎ | 1258/3741 [7:30:23<14:18:16, 20.74s/it] +2025-05-11 00:56:17 - ERROR - stderr - +2025-05-11 00:56:17 - ERROR - stderr - +2025-05-11 00:56:17 - INFO - stdout - {'loss': 0.8026, 'grad_norm': 0.6384891271591187, 'learning_rate': 1.5474440649500132e-05, 'epoch': 1.01} +2025-05-11 00:56:17 - ERROR - stderr - 34%|███▎ | 1258/3741 [7:30:23<14:18:16, 20.74s/it] +2025-05-11 00:56:39 - ERROR - stderr - 34%|███▎ | 1259/3741 [7:30:46<14:40:55, 21.30s/it] +2025-05-11 00:56:39 - ERROR - stderr - +2025-05-11 00:56:39 - ERROR - stderr - +2025-05-11 00:56:39 - INFO - stdout - {'loss': 0.7986, 'grad_norm': 0.6521022319793701, 'learning_rate': 1.5467192132817678e-05, 'epoch': 1.01} +2025-05-11 00:56:39 - ERROR - stderr - 34%|███▎ | 1259/3741 [7:30:46<14:40:55, 21.30s/it] +2025-05-11 00:56:59 - ERROR - stderr - 34%|███▎ | 1260/3741 [7:31:05<14:19:02, 20.77s/it] +2025-05-11 00:56:59 - ERROR - stderr - +2025-05-11 00:56:59 - ERROR - stderr - +2025-05-11 00:56:59 - INFO - stdout - {'loss': 0.7312, 'grad_norm': 0.6222298741340637, 'learning_rate': 1.5459939516648374e-05, 'epoch': 1.01} +2025-05-11 00:56:59 - ERROR - stderr - 34%|███▎ | 1260/3741 [7:31:05<14:19:02, 20.77s/it] +2025-05-11 00:57:18 - ERROR - stderr - 34%|███▎ | 1261/3741 [7:31:24<14:00:11, 20.33s/it] +2025-05-11 00:57:18 - ERROR - stderr - +2025-05-11 00:57:18 - ERROR - stderr - +2025-05-11 00:57:18 - INFO - stdout - {'loss': 0.7311, 'grad_norm': 0.6429084539413452, 'learning_rate': 1.5452682806430473e-05, 'epoch': 1.01} +2025-05-11 00:57:18 - ERROR - stderr - 34%|███▎ | 1261/3741 [7:31:24<14:00:11, 20.33s/it] +2025-05-11 00:57:40 - ERROR - stderr - 34%|███▎ | 1262/3741 [7:31:46<14:18:14, 20.77s/it] +2025-05-11 00:57:40 - ERROR - stderr - +2025-05-11 00:57:40 - ERROR - stderr - +2025-05-11 00:57:40 - INFO - stdout - {'loss': 0.8077, 'grad_norm': 0.7085431814193726, 'learning_rate': 1.544542200760531e-05, 'epoch': 1.01} +2025-05-11 00:57:40 - ERROR - stderr - 34%|███▎ | 1262/3741 [7:31:46<14:18:14, 20.77s/it] +2025-05-11 00:57:59 - ERROR - stderr - 34%|███▍ | 1263/3741 [7:32:06<14:00:23, 20.35s/it] +2025-05-11 00:57:59 - ERROR - stderr - +2025-05-11 00:57:59 - ERROR - stderr - +2025-05-11 00:57:59 - INFO - stdout - {'loss': 0.7795, 'grad_norm': 0.6494969725608826, 'learning_rate': 1.543815712561727e-05, 'epoch': 1.01} +2025-05-11 00:57:59 - ERROR - stderr - 34%|███▍ | 1263/3741 [7:32:06<14:00:23, 20.35s/it] +2025-05-11 00:58:22 - ERROR - stderr - 34%|███▍ | 1264/3741 [7:32:29<14:32:17, 21.13s/it] +2025-05-11 00:58:22 - ERROR - stderr - +2025-05-11 00:58:22 - ERROR - stderr - +2025-05-11 00:58:22 - INFO - stdout - {'loss': 0.7784, 'grad_norm': 0.650427520275116, 'learning_rate': 1.5430888165913814e-05, 'epoch': 1.01} +2025-05-11 00:58:22 - ERROR - stderr - 34%|███▍ | 1264/3741 [7:32:29<14:32:17, 21.13s/it] +2025-05-11 00:58:42 - ERROR - stderr - 34%|███▍ | 1265/3741 [7:32:49<14:18:52, 20.81s/it] +2025-05-11 00:58:42 - ERROR - stderr - +2025-05-11 00:58:42 - ERROR - stderr - +2025-05-11 00:58:42 - INFO - stdout - {'loss': 0.7681, 'grad_norm': 0.674248456954956, 'learning_rate': 1.5423615133945457e-05, 'epoch': 1.01} +2025-05-11 00:58:42 - ERROR - stderr - 34%|███▍ | 1265/3741 [7:32:49<14:18:52, 20.81s/it] +2025-05-11 00:59:04 - ERROR - stderr - 34%|███▍ | 1266/3741 [7:33:10<14:30:25, 21.10s/it] +2025-05-11 00:59:04 - ERROR - stderr - +2025-05-11 00:59:04 - ERROR - stderr - +2025-05-11 00:59:04 - INFO - stdout - {'loss': 0.7758, 'grad_norm': 0.6563466191291809, 'learning_rate': 1.5416338035165766e-05, 'epoch': 1.02} +2025-05-11 00:59:04 - ERROR - stderr - 34%|███▍ | 1266/3741 [7:33:10<14:30:25, 21.10s/it] +2025-05-11 00:59:24 - ERROR - stderr - 34%|███▍ | 1267/3741 [7:33:30<14:09:33, 20.60s/it] +2025-05-11 00:59:24 - ERROR - stderr - +2025-05-11 00:59:24 - ERROR - stderr - +2025-05-11 00:59:24 - INFO - stdout - {'loss': 0.7597, 'grad_norm': 0.6918492317199707, 'learning_rate': 1.5409056875031355e-05, 'epoch': 1.02} +2025-05-11 00:59:24 - ERROR - stderr - 34%|███▍ | 1267/3741 [7:33:30<14:09:33, 20.60s/it] +2025-05-11 00:59:43 - ERROR - stderr - 34%|███▍ | 1268/3741 [7:33:49<13:54:34, 20.25s/it] +2025-05-11 00:59:43 - ERROR - stderr - +2025-05-11 00:59:43 - ERROR - stderr - +2025-05-11 00:59:43 - INFO - stdout - {'loss': 0.7596, 'grad_norm': 0.6418508291244507, 'learning_rate': 1.5401771659001885e-05, 'epoch': 1.02} +2025-05-11 00:59:43 - ERROR - stderr - 34%|███▍ | 1268/3741 [7:33:49<13:54:34, 20.25s/it] +2025-05-11 01:00:06 - ERROR - stderr - 34%|███▍ | 1269/3741 [7:34:12<14:26:57, 21.04s/it] +2025-05-11 01:00:06 - ERROR - stderr - +2025-05-11 01:00:06 - ERROR - stderr - +2025-05-11 01:00:06 - INFO - stdout - {'loss': 0.7344, 'grad_norm': 0.6073652505874634, 'learning_rate': 1.5394482392540066e-05, 'epoch': 1.02} +2025-05-11 01:00:06 - ERROR - stderr - 34%|███▍ | 1269/3741 [7:34:12<14:26:57, 21.04s/it] +2025-05-11 01:00:26 - ERROR - stderr - 34%|███▍ | 1270/3741 [7:34:32<14:10:02, 20.64s/it] +2025-05-11 01:00:26 - ERROR - stderr - +2025-05-11 01:00:26 - ERROR - stderr - +2025-05-11 01:00:26 - INFO - stdout - {'loss': 0.7876, 'grad_norm': 0.7088474631309509, 'learning_rate': 1.5387189081111628e-05, 'epoch': 1.02} +2025-05-11 01:00:26 - ERROR - stderr - 34%|███▍ | 1270/3741 [7:34:32<14:10:02, 20.64s/it] +2025-05-11 01:00:49 - ERROR - stderr - 34%|███▍ | 1271/3741 [7:34:55<14:44:32, 21.49s/it] +2025-05-11 01:00:49 - ERROR - stderr - +2025-05-11 01:00:49 - ERROR - stderr - +2025-05-11 01:00:49 - INFO - stdout - {'loss': 0.7867, 'grad_norm': 0.6955075263977051, 'learning_rate': 1.5379891730185352e-05, 'epoch': 1.02} +2025-05-11 01:00:49 - ERROR - stderr - 34%|███▍ | 1271/3741 [7:34:55<14:44:32, 21.49s/it] +2025-05-11 01:01:08 - ERROR - stderr - 34%|███▍ | 1272/3741 [7:35:15<14:17:24, 20.84s/it] +2025-05-11 01:01:08 - ERROR - stderr - +2025-05-11 01:01:08 - ERROR - stderr - +2025-05-11 01:01:08 - INFO - stdout - {'loss': 0.8059, 'grad_norm': 0.6917376518249512, 'learning_rate': 1.537259034523304e-05, 'epoch': 1.02} +2025-05-11 01:01:08 - ERROR - stderr - 34%|███▍ | 1272/3741 [7:35:15<14:17:24, 20.84s/it] +2025-05-11 01:01:28 - ERROR - stderr - 34%|███▍ | 1273/3741 [7:35:34<14:00:17, 20.43s/it] +2025-05-11 01:01:28 - ERROR - stderr - +2025-05-11 01:01:28 - ERROR - stderr - +2025-05-11 01:01:28 - INFO - stdout - {'loss': 0.7737, 'grad_norm': 0.6557261943817139, 'learning_rate': 1.5365284931729513e-05, 'epoch': 1.02} +2025-05-11 01:01:28 - ERROR - stderr - 34%|███▍ | 1273/3741 [7:35:34<14:00:17, 20.43s/it] +2025-05-11 01:01:48 - ERROR - stderr - 34%|███▍ | 1274/3741 [7:35:54<13:52:05, 20.24s/it] +2025-05-11 01:01:48 - ERROR - stderr - +2025-05-11 01:01:48 - ERROR - stderr - +2025-05-11 01:01:48 - INFO - stdout - {'loss': 0.7509, 'grad_norm': 0.6700888872146606, 'learning_rate': 1.5357975495152628e-05, 'epoch': 1.02} +2025-05-11 01:01:48 - ERROR - stderr - 34%|███▍ | 1274/3741 [7:35:54<13:52:05, 20.24s/it] +2025-05-11 01:02:07 - ERROR - stderr - 34%|███▍ | 1275/3741 [7:36:13<13:40:51, 19.97s/it] +2025-05-11 01:02:07 - ERROR - stderr - +2025-05-11 01:02:07 - ERROR - stderr - +2025-05-11 01:02:07 - INFO - stdout - {'loss': 0.8075, 'grad_norm': 0.6783604025840759, 'learning_rate': 1.5350662040983236e-05, 'epoch': 1.02} +2025-05-11 01:02:07 - ERROR - stderr - 34%|███▍ | 1275/3741 [7:36:13<13:40:51, 19.97s/it] +2025-05-11 01:02:31 - ERROR - stderr - 34%|███▍ | 1276/3741 [7:36:37<14:30:59, 21.20s/it] +2025-05-11 01:02:31 - ERROR - stderr - +2025-05-11 01:02:31 - ERROR - stderr - +2025-05-11 01:02:31 - INFO - stdout - {'loss': 0.7346, 'grad_norm': 0.6425763368606567, 'learning_rate': 1.5343344574705234e-05, 'epoch': 1.02} +2025-05-11 01:02:31 - ERROR - stderr - 34%|███▍ | 1276/3741 [7:36:37<14:30:59, 21.20s/it] +2025-05-11 01:02:51 - ERROR - stderr - 34%|███▍ | 1277/3741 [7:36:57<14:10:56, 20.72s/it] +2025-05-11 01:02:51 - ERROR - stderr - +2025-05-11 01:02:51 - ERROR - stderr - +2025-05-11 01:02:51 - INFO - stdout - {'loss': 0.785, 'grad_norm': 0.6592726707458496, 'learning_rate': 1.5336023101805486e-05, 'epoch': 1.02} +2025-05-11 01:02:51 - ERROR - stderr - 34%|███▍ | 1277/3741 [7:36:57<14:10:56, 20.72s/it] +2025-05-11 01:03:14 - ERROR - stderr - 34%|███▍ | 1278/3741 [7:37:20<14:44:07, 21.54s/it] +2025-05-11 01:03:14 - ERROR - stderr - +2025-05-11 01:03:14 - ERROR - stderr - +2025-05-11 01:03:14 - INFO - stdout - {'loss': 0.7834, 'grad_norm': 0.6982232332229614, 'learning_rate': 1.5328697627773898e-05, 'epoch': 1.02} +2025-05-11 01:03:14 - ERROR - stderr - 34%|███▍ | 1278/3741 [7:37:20<14:44:07, 21.54s/it] +2025-05-11 01:03:34 - ERROR - stderr - 34%|███▍ | 1279/3741 [7:37:40<14:23:16, 21.04s/it] +2025-05-11 01:03:34 - ERROR - stderr - +2025-05-11 01:03:34 - ERROR - stderr - +2025-05-11 01:03:34 - INFO - stdout - {'loss': 0.7505, 'grad_norm': 0.6378937363624573, 'learning_rate': 1.5321368158103346e-05, 'epoch': 1.03} +2025-05-11 01:03:34 - ERROR - stderr - 34%|███▍ | 1279/3741 [7:37:40<14:23:16, 21.04s/it] +2025-05-11 01:03:54 - ERROR - stderr - 34%|███▍ | 1280/3741 [7:38:00<14:05:56, 20.62s/it] +2025-05-11 01:03:54 - ERROR - stderr - +2025-05-11 01:03:54 - ERROR - stderr - +2025-05-11 01:03:54 - INFO - stdout - {'loss': 0.7627, 'grad_norm': 0.6628844141960144, 'learning_rate': 1.531403469828973e-05, 'epoch': 1.03} +2025-05-11 01:03:54 - ERROR - stderr - 34%|███▍ | 1280/3741 [7:38:00<14:05:56, 20.62s/it] +2025-05-11 01:04:13 - ERROR - stderr - 34%|███▍ | 1281/3741 [7:38:19<13:52:45, 20.31s/it] +2025-05-11 01:04:13 - ERROR - stderr - +2025-05-11 01:04:13 - ERROR - stderr - +2025-05-11 01:04:13 - INFO - stdout - {'loss': 0.7615, 'grad_norm': 0.6596646904945374, 'learning_rate': 1.5306697253831914e-05, 'epoch': 1.03} +2025-05-11 01:04:13 - ERROR - stderr - 34%|███▍ | 1281/3741 [7:38:19<13:52:45, 20.31s/it] +2025-05-11 01:04:33 - ERROR - stderr - 34%|███▍ | 1282/3741 [7:38:39<13:42:18, 20.06s/it] +2025-05-11 01:04:33 - ERROR - stderr - +2025-05-11 01:04:33 - ERROR - stderr - +2025-05-11 01:04:33 - INFO - stdout - {'loss': 0.7921, 'grad_norm': 0.6652581095695496, 'learning_rate': 1.5299355830231776e-05, 'epoch': 1.03} +2025-05-11 01:04:33 - ERROR - stderr - 34%|███▍ | 1282/3741 [7:38:39<13:42:18, 20.06s/it] +2025-05-11 01:04:56 - ERROR - stderr - 34%|███▍ | 1283/3741 [7:39:02<14:20:21, 21.00s/it] +2025-05-11 01:04:56 - ERROR - stderr - +2025-05-11 01:04:56 - ERROR - stderr - +2025-05-11 01:04:56 - INFO - stdout - {'loss': 0.7812, 'grad_norm': 0.6460443735122681, 'learning_rate': 1.5292010432994162e-05, 'epoch': 1.03} +2025-05-11 01:04:56 - ERROR - stderr - 34%|███▍ | 1283/3741 [7:39:02<14:20:21, 21.00s/it] +2025-05-11 01:05:16 - ERROR - stderr - 34%|███▍ | 1284/3741 [7:39:22<14:04:02, 20.61s/it] +2025-05-11 01:05:16 - ERROR - stderr - +2025-05-11 01:05:16 - ERROR - stderr - +2025-05-11 01:05:16 - INFO - stdout - {'loss': 0.7718, 'grad_norm': 0.6455625295639038, 'learning_rate': 1.5284661067626897e-05, 'epoch': 1.03} +2025-05-11 01:05:16 - ERROR - stderr - 34%|███▍ | 1284/3741 [7:39:22<14:04:02, 20.61s/it] +2025-05-11 01:05:40 - ERROR - stderr - 34%|███▍ | 1285/3741 [7:39:46<14:47:05, 21.67s/it] +2025-05-11 01:05:40 - ERROR - stderr - +2025-05-11 01:05:40 - ERROR - stderr - +2025-05-11 01:05:40 - INFO - stdout - {'loss': 0.7546, 'grad_norm': 0.653901994228363, 'learning_rate': 1.5277307739640787e-05, 'epoch': 1.03} +2025-05-11 01:05:40 - ERROR - stderr - 34%|███▍ | 1285/3741 [7:39:46<14:47:05, 21.67s/it] +2025-05-11 01:05:59 - ERROR - stderr - 34%|███▍ | 1286/3741 [7:40:06<14:21:33, 21.06s/it] +2025-05-11 01:05:59 - ERROR - stderr - +2025-05-11 01:05:59 - ERROR - stderr - +2025-05-11 01:05:59 - INFO - stdout - {'loss': 0.7652, 'grad_norm': 0.6660287976264954, 'learning_rate': 1.526995045454961e-05, 'epoch': 1.03} +2025-05-11 01:05:59 - ERROR - stderr - 34%|███▍ | 1286/3741 [7:40:06<14:21:33, 21.06s/it] +2025-05-11 01:06:19 - ERROR - stderr - 34%|███▍ | 1287/3741 [7:40:25<14:03:56, 20.63s/it] +2025-05-11 01:06:19 - ERROR - stderr - +2025-05-11 01:06:19 - ERROR - stderr - +2025-05-11 01:06:19 - INFO - stdout - {'loss': 0.7771, 'grad_norm': 0.6647891998291016, 'learning_rate': 1.5262589217870106e-05, 'epoch': 1.03} +2025-05-11 01:06:19 - ERROR - stderr - 34%|███▍ | 1287/3741 [7:40:25<14:03:56, 20.63s/it] +2025-05-11 01:06:40 - ERROR - stderr - 34%|███▍ | 1288/3741 [7:40:46<14:05:32, 20.68s/it] +2025-05-11 01:06:40 - ERROR - stderr - +2025-05-11 01:06:40 - ERROR - stderr - +2025-05-11 01:06:40 - INFO - stdout - {'loss': 0.7632, 'grad_norm': 0.6715826988220215, 'learning_rate': 1.5255224035121986e-05, 'epoch': 1.03} +2025-05-11 01:06:40 - ERROR - stderr - 34%|███▍ | 1288/3741 [7:40:46<14:05:32, 20.68s/it] +2025-05-11 01:06:59 - ERROR - stderr - 34%|███▍ | 1289/3741 [7:41:06<13:53:37, 20.40s/it] +2025-05-11 01:07:00 - ERROR - stderr - +2025-05-11 01:07:00 - ERROR - stderr - +2025-05-11 01:07:00 - INFO - stdout - {'loss': 0.8042, 'grad_norm': 0.6524284482002258, 'learning_rate': 1.524785491182791e-05, 'epoch': 1.03} +2025-05-11 01:07:00 - ERROR - stderr - 34%|███▍ | 1289/3741 [7:41:06<13:53:37, 20.40s/it] +2025-05-11 01:07:23 - ERROR - stderr - 34%|███▍ | 1290/3741 [7:41:29<14:29:39, 21.29s/it] +2025-05-11 01:07:23 - ERROR - stderr - +2025-05-11 01:07:23 - ERROR - stderr - +2025-05-11 01:07:23 - INFO - stdout - {'loss': 0.8175, 'grad_norm': 0.7093574404716492, 'learning_rate': 1.5240481853513495e-05, 'epoch': 1.03} +2025-05-11 01:07:23 - ERROR - stderr - 34%|███▍ | 1290/3741 [7:41:29<14:29:39, 21.29s/it] +2025-05-11 01:07:43 - ERROR - stderr - 35%|███▍ | 1291/3741 [7:41:49<14:09:54, 20.81s/it] +2025-05-11 01:07:43 - ERROR - stderr - +2025-05-11 01:07:43 - ERROR - stderr - +2025-05-11 01:07:43 - INFO - stdout - {'loss': 0.8186, 'grad_norm': 0.6630702018737793, 'learning_rate': 1.523310486570732e-05, 'epoch': 1.04} +2025-05-11 01:07:43 - ERROR - stderr - 35%|███▍ | 1291/3741 [7:41:49<14:09:54, 20.81s/it] +2025-05-11 01:08:06 - ERROR - stderr - 35%|███▍ | 1292/3741 [7:42:13<14:47:36, 21.75s/it] +2025-05-11 01:08:07 - ERROR - stderr - +2025-05-11 01:08:07 - ERROR - stderr - +2025-05-11 01:08:07 - INFO - stdout - {'loss': 0.7712, 'grad_norm': 0.669937014579773, 'learning_rate': 1.5225723953940896e-05, 'epoch': 1.04} +2025-05-11 01:08:07 - ERROR - stderr - 35%|███▍ | 1292/3741 [7:42:13<14:47:36, 21.75s/it] +2025-05-11 01:08:26 - ERROR - stderr - 35%|███▍ | 1293/3741 [7:42:32<14:18:53, 21.05s/it] +2025-05-11 01:08:26 - ERROR - stderr - +2025-05-11 01:08:26 - ERROR - stderr - +2025-05-11 01:08:26 - INFO - stdout - {'loss': 0.7704, 'grad_norm': 0.6852511167526245, 'learning_rate': 1.5218339123748682e-05, 'epoch': 1.04} +2025-05-11 01:08:26 - ERROR - stderr - 35%|███▍ | 1293/3741 [7:42:32<14:18:53, 21.05s/it] +2025-05-11 01:08:46 - ERROR - stderr - 35%|███▍ | 1294/3741 [7:42:52<14:05:04, 20.72s/it] +2025-05-11 01:08:46 - ERROR - stderr - +2025-05-11 01:08:46 - ERROR - stderr - +2025-05-11 01:08:46 - INFO - stdout - {'loss': 0.7617, 'grad_norm': 0.6196748614311218, 'learning_rate': 1.5210950380668074e-05, 'epoch': 1.04} +2025-05-11 01:08:46 - ERROR - stderr - 35%|███▍ | 1294/3741 [7:42:52<14:05:04, 20.72s/it] +2025-05-11 01:09:07 - ERROR - stderr - 35%|███▍ | 1295/3741 [7:43:13<14:09:43, 20.84s/it] +2025-05-11 01:09:07 - ERROR - stderr - +2025-05-11 01:09:07 - ERROR - stderr - +2025-05-11 01:09:07 - INFO - stdout - {'loss': 0.7316, 'grad_norm': 0.6314553618431091, 'learning_rate': 1.5203557730239408e-05, 'epoch': 1.04} +2025-05-11 01:09:07 - ERROR - stderr - 35%|███▍ | 1295/3741 [7:43:13<14:09:43, 20.84s/it] +2025-05-11 01:09:27 - ERROR - stderr - 35%|███▍ | 1296/3741 [7:43:33<13:58:13, 20.57s/it] +2025-05-11 01:09:27 - ERROR - stderr - +2025-05-11 01:09:27 - ERROR - stderr - +2025-05-11 01:09:27 - INFO - stdout - {'loss': 0.7706, 'grad_norm': 0.6329060196876526, 'learning_rate': 1.5196161178005941e-05, 'epoch': 1.04} +2025-05-11 01:09:27 - ERROR - stderr - 35%|███▍ | 1296/3741 [7:43:33<13:58:13, 20.57s/it] +2025-05-11 01:09:51 - ERROR - stderr - 35%|███▍ | 1297/3741 [7:43:57<14:39:46, 21.60s/it] +2025-05-11 01:09:51 - ERROR - stderr - +2025-05-11 01:09:51 - ERROR - stderr - +2025-05-11 01:09:51 - INFO - stdout - {'loss': 0.7561, 'grad_norm': 0.642294704914093, 'learning_rate': 1.5188760729513865e-05, 'epoch': 1.04} +2025-05-11 01:09:51 - ERROR - stderr - 35%|███▍ | 1297/3741 [7:43:57<14:39:46, 21.60s/it] +2025-05-11 01:10:10 - ERROR - stderr - 35%|███▍ | 1298/3741 [7:44:17<14:13:58, 20.97s/it] +2025-05-11 01:10:10 - ERROR - stderr - +2025-05-11 01:10:10 - ERROR - stderr - +2025-05-11 01:10:10 - INFO - stdout - {'loss': 0.8194, 'grad_norm': 0.6721711158752441, 'learning_rate': 1.5181356390312279e-05, 'epoch': 1.04} +2025-05-11 01:10:10 - ERROR - stderr - 35%|███▍ | 1298/3741 [7:44:17<14:13:58, 20.97s/it] +2025-05-11 01:10:34 - ERROR - stderr - 35%|███▍ | 1299/3741 [7:44:40<14:42:32, 21.68s/it] +2025-05-11 01:10:34 - ERROR - stderr - +2025-05-11 01:10:34 - ERROR - stderr - +2025-05-11 01:10:34 - INFO - stdout - {'loss': 0.7759, 'grad_norm': 0.6798752546310425, 'learning_rate': 1.5173948165953216e-05, 'epoch': 1.04} +2025-05-11 01:10:34 - ERROR - stderr - 35%|███▍ | 1299/3741 [7:44:40<14:42:32, 21.68s/it] +2025-05-11 01:10:53 - ERROR - stderr - 35%|███▍ | 1300/3741 [7:45:00<14:15:29, 21.03s/it] +2025-05-11 01:10:53 - ERROR - stderr - +2025-05-11 01:10:53 - ERROR - stderr - +2025-05-11 01:10:53 - INFO - stdout - {'loss': 0.7913, 'grad_norm': 0.6321367025375366, 'learning_rate': 1.5166536061991615e-05, 'epoch': 1.04} +2025-05-11 01:10:53 - ERROR - stderr - 35%|███▍ | 1300/3741 [7:45:00<14:15:29, 21.03s/it] +2025-05-11 01:11:13 - ERROR - stderr - 35%|███▍ | 1301/3741 [7:45:19<13:54:58, 20.53s/it] +2025-05-11 01:11:13 - ERROR - stderr - +2025-05-11 01:11:13 - ERROR - stderr - +2025-05-11 01:11:13 - INFO - stdout - {'loss': 0.751, 'grad_norm': 0.6367747783660889, 'learning_rate': 1.5159120083985319e-05, 'epoch': 1.04} +2025-05-11 01:11:13 - ERROR - stderr - 35%|███▍ | 1301/3741 [7:45:19<13:54:58, 20.53s/it] +2025-05-11 01:11:34 - ERROR - stderr - 35%|███▍ | 1302/3741 [7:45:40<14:02:16, 20.72s/it] +2025-05-11 01:11:34 - ERROR - stderr - +2025-05-11 01:11:34 - ERROR - stderr - +2025-05-11 01:11:34 - INFO - stdout - {'loss': 0.7406, 'grad_norm': 0.6426526308059692, 'learning_rate': 1.5151700237495087e-05, 'epoch': 1.04} +2025-05-11 01:11:34 - ERROR - stderr - 35%|███▍ | 1302/3741 [7:45:40<14:02:16, 20.72s/it] +2025-05-11 01:11:54 - ERROR - stderr - 35%|███▍ | 1303/3741 [7:46:00<13:52:31, 20.49s/it] +2025-05-11 01:11:54 - ERROR - stderr - +2025-05-11 01:11:54 - ERROR - stderr - +2025-05-11 01:11:54 - INFO - stdout - {'loss': 0.7382, 'grad_norm': 0.6288602352142334, 'learning_rate': 1.5144276528084566e-05, 'epoch': 1.04} +2025-05-11 01:11:54 - ERROR - stderr - 35%|███▍ | 1303/3741 [7:46:00<13:52:31, 20.49s/it] +2025-05-11 01:12:18 - ERROR - stderr - 35%|███▍ | 1304/3741 [7:46:24<14:32:37, 21.48s/it] +2025-05-11 01:12:18 - ERROR - stderr - +2025-05-11 01:12:18 - ERROR - stderr - +2025-05-11 01:12:18 - INFO - stdout - {'loss': 0.7271, 'grad_norm': 0.6340166330337524, 'learning_rate': 1.513684896132031e-05, 'epoch': 1.05} +2025-05-11 01:12:18 - ERROR - stderr - 35%|███▍ | 1304/3741 [7:46:24<14:32:37, 21.48s/it] +2025-05-11 01:12:37 - ERROR - stderr - 35%|███▍ | 1305/3741 [7:46:43<14:08:02, 20.89s/it] +2025-05-11 01:12:37 - ERROR - stderr - +2025-05-11 01:12:37 - ERROR - stderr - +2025-05-11 01:12:37 - INFO - stdout - {'loss': 0.7534, 'grad_norm': 0.6427846550941467, 'learning_rate': 1.5129417542771761e-05, 'epoch': 1.05} +2025-05-11 01:12:37 - ERROR - stderr - 35%|███▍ | 1305/3741 [7:46:43<14:08:02, 20.89s/it] +2025-05-11 01:13:00 - ERROR - stderr - 35%|███▍ | 1306/3741 [7:47:06<14:31:00, 21.46s/it] +2025-05-11 01:13:00 - ERROR - stderr - +2025-05-11 01:13:00 - ERROR - stderr - +2025-05-11 01:13:00 - INFO - stdout - {'loss': 0.73, 'grad_norm': 0.6341578960418701, 'learning_rate': 1.512198227801125e-05, 'epoch': 1.05} +2025-05-11 01:13:00 - ERROR - stderr - 35%|███▍ | 1306/3741 [7:47:06<14:31:00, 21.46s/it] +2025-05-11 01:13:19 - ERROR - stderr - 35%|███▍ | 1307/3741 [7:47:26<14:06:37, 20.87s/it] +2025-05-11 01:13:19 - ERROR - stderr - +2025-05-11 01:13:19 - ERROR - stderr - +2025-05-11 01:13:19 - INFO - stdout - {'loss': 0.7734, 'grad_norm': 0.6635767817497253, 'learning_rate': 1.5114543172613995e-05, 'epoch': 1.05} +2025-05-11 01:13:19 - ERROR - stderr - 35%|███▍ | 1307/3741 [7:47:26<14:06:37, 20.87s/it] +2025-05-11 01:13:39 - ERROR - stderr - 35%|███▍ | 1308/3741 [7:47:45<13:47:27, 20.41s/it] +2025-05-11 01:13:39 - ERROR - stderr - +2025-05-11 01:13:39 - ERROR - stderr - +2025-05-11 01:13:39 - INFO - stdout - {'loss': 0.7465, 'grad_norm': 0.6806950569152832, 'learning_rate': 1.5107100232158085e-05, 'epoch': 1.05} +2025-05-11 01:13:39 - ERROR - stderr - 35%|███▍ | 1308/3741 [7:47:45<13:47:27, 20.41s/it] +2025-05-11 01:14:02 - ERROR - stderr - 35%|███▍ | 1309/3741 [7:48:08<14:18:11, 21.17s/it] +2025-05-11 01:14:02 - ERROR - stderr - +2025-05-11 01:14:02 - ERROR - stderr - +2025-05-11 01:14:02 - INFO - stdout - {'loss': 0.7822, 'grad_norm': 0.639504075050354, 'learning_rate': 1.5099653462224492e-05, 'epoch': 1.05} +2025-05-11 01:14:02 - ERROR - stderr - 35%|███▍ | 1309/3741 [7:48:08<14:18:11, 21.17s/it] +2025-05-11 01:14:21 - ERROR - stderr - 35%|███▌ | 1310/3741 [7:48:27<13:55:38, 20.62s/it] +2025-05-11 01:14:21 - ERROR - stderr - +2025-05-11 01:14:21 - ERROR - stderr - +2025-05-11 01:14:21 - INFO - stdout - {'loss': 0.7742, 'grad_norm': 0.6781004667282104, 'learning_rate': 1.5092202868397056e-05, 'epoch': 1.05} +2025-05-11 01:14:21 - ERROR - stderr - 35%|███▌ | 1310/3741 [7:48:27<13:55:38, 20.62s/it] +2025-05-11 01:14:45 - ERROR - stderr - 35%|███▌ | 1311/3741 [7:48:51<14:38:38, 21.69s/it] +2025-05-11 01:14:45 - ERROR - stderr - +2025-05-11 01:14:45 - ERROR - stderr - +2025-05-11 01:14:45 - INFO - stdout - {'loss': 0.7638, 'grad_norm': 0.6971407532691956, 'learning_rate': 1.5084748456262487e-05, 'epoch': 1.05} +2025-05-11 01:14:45 - ERROR - stderr - 35%|███▌ | 1311/3741 [7:48:52<14:38:38, 21.69s/it] +2025-05-11 01:15:05 - ERROR - stderr - 35%|███▌ | 1312/3741 [7:49:11<14:12:53, 21.07s/it] +2025-05-11 01:15:05 - ERROR - stderr - +2025-05-11 01:15:05 - ERROR - stderr - +2025-05-11 01:15:05 - INFO - stdout - {'loss': 0.8214, 'grad_norm': 0.6818044781684875, 'learning_rate': 1.5077290231410367e-05, 'epoch': 1.05} +2025-05-11 01:15:05 - ERROR - stderr - 35%|███▌ | 1312/3741 [7:49:11<14:12:53, 21.07s/it] +2025-05-11 01:15:26 - ERROR - stderr - 35%|███▌ | 1313/3741 [7:49:33<14:20:24, 21.26s/it] +2025-05-11 01:15:27 - ERROR - stderr - +2025-05-11 01:15:27 - ERROR - stderr - +2025-05-11 01:15:27 - INFO - stdout - {'loss': 0.7426, 'grad_norm': 0.6158934831619263, 'learning_rate': 1.506982819943311e-05, 'epoch': 1.05} +2025-05-11 01:15:27 - ERROR - stderr - 35%|███▌ | 1313/3741 [7:49:33<14:20:24, 21.26s/it] +2025-05-11 01:15:46 - ERROR - stderr - 35%|███▌ | 1314/3741 [7:49:52<14:00:10, 20.77s/it] +2025-05-11 01:15:46 - ERROR - stderr - +2025-05-11 01:15:46 - ERROR - stderr - +2025-05-11 01:15:46 - INFO - stdout - {'loss': 0.7396, 'grad_norm': 0.6084417700767517, 'learning_rate': 1.5062362365926012e-05, 'epoch': 1.05} +2025-05-11 01:15:46 - ERROR - stderr - 35%|███▌ | 1314/3741 [7:49:52<14:00:10, 20.77s/it] +2025-05-11 01:16:06 - ERROR - stderr - 35%|███▌ | 1315/3741 [7:50:12<13:44:28, 20.39s/it] +2025-05-11 01:16:06 - ERROR - stderr - +2025-05-11 01:16:06 - ERROR - stderr - +2025-05-11 01:16:06 - INFO - stdout - {'loss': 0.7497, 'grad_norm': 0.6691953539848328, 'learning_rate': 1.5054892736487206e-05, 'epoch': 1.05} +2025-05-11 01:16:06 - ERROR - stderr - 35%|███▌ | 1315/3741 [7:50:12<13:44:28, 20.39s/it] +2025-05-11 01:16:29 - ERROR - stderr - 35%|███▌ | 1316/3741 [7:50:35<14:21:54, 21.33s/it] +2025-05-11 01:16:29 - ERROR - stderr - +2025-05-11 01:16:29 - ERROR - stderr - +2025-05-11 01:16:29 - INFO - stdout - {'loss': 0.773, 'grad_norm': 0.6629313826560974, 'learning_rate': 1.504741931671768e-05, 'epoch': 1.06} +2025-05-11 01:16:29 - ERROR - stderr - 35%|███▌ | 1316/3741 [7:50:35<14:21:54, 21.33s/it] +2025-05-11 01:16:49 - ERROR - stderr - 35%|███▌ | 1317/3741 [7:50:55<14:00:40, 20.81s/it] +2025-05-11 01:16:49 - ERROR - stderr - +2025-05-11 01:16:49 - ERROR - stderr - +2025-05-11 01:16:49 - INFO - stdout - {'loss': 0.7542, 'grad_norm': 0.641639232635498, 'learning_rate': 1.503994211222125e-05, 'epoch': 1.06} +2025-05-11 01:16:49 - ERROR - stderr - 35%|███▌ | 1317/3741 [7:50:55<14:00:40, 20.81s/it] +2025-05-11 01:17:12 - ERROR - stderr - 35%|███▌ | 1318/3741 [7:51:18<14:30:09, 21.55s/it] +2025-05-11 01:17:12 - ERROR - stderr - +2025-05-11 01:17:12 - ERROR - stderr - +2025-05-11 01:17:12 - INFO - stdout - {'loss': 0.7645, 'grad_norm': 0.6214974522590637, 'learning_rate': 1.5032461128604583e-05, 'epoch': 1.06} +2025-05-11 01:17:12 - ERROR - stderr - 35%|███▌ | 1318/3741 [7:51:18<14:30:09, 21.55s/it] +2025-05-11 01:17:31 - ERROR - stderr - 35%|███▌ | 1319/3741 [7:51:38<14:04:39, 20.92s/it] +2025-05-11 01:17:31 - ERROR - stderr - +2025-05-11 01:17:31 - ERROR - stderr - +2025-05-11 01:17:31 - INFO - stdout - {'loss': 0.7688, 'grad_norm': 0.6951003670692444, 'learning_rate': 1.5024976371477175e-05, 'epoch': 1.06} +2025-05-11 01:17:31 - ERROR - stderr - 35%|███▌ | 1319/3741 [7:51:38<14:04:39, 20.92s/it] +2025-05-11 01:17:52 - ERROR - stderr - 35%|███▌ | 1320/3741 [7:51:58<13:58:33, 20.78s/it] +2025-05-11 01:17:52 - ERROR - stderr - +2025-05-11 01:17:52 - ERROR - stderr - +2025-05-11 01:17:52 - INFO - stdout - {'loss': 0.7435, 'grad_norm': 0.641646683216095, 'learning_rate': 1.5017487846451353e-05, 'epoch': 1.06} +2025-05-11 01:17:52 - ERROR - stderr - 35%|███▌ | 1320/3741 [7:51:58<13:58:33, 20.78s/it] +2025-05-11 01:18:12 - ERROR - stderr - 35%|███▌ | 1321/3741 [7:52:18<13:44:53, 20.45s/it] +2025-05-11 01:18:12 - ERROR - stderr - +2025-05-11 01:18:12 - ERROR - stderr - +2025-05-11 01:18:12 - INFO - stdout - {'loss': 0.7606, 'grad_norm': 0.6781443953514099, 'learning_rate': 1.5009995559142268e-05, 'epoch': 1.06} +2025-05-11 01:18:12 - ERROR - stderr - 35%|███▌ | 1321/3741 [7:52:18<13:44:53, 20.45s/it] +2025-05-11 01:18:31 - ERROR - stderr - 35%|███▌ | 1322/3741 [7:52:38<13:36:01, 20.24s/it] +2025-05-11 01:18:31 - ERROR - stderr - +2025-05-11 01:18:31 - ERROR - stderr - +2025-05-11 01:18:31 - INFO - stdout - {'loss': 0.7608, 'grad_norm': 0.6722328066825867, 'learning_rate': 1.5002499515167891e-05, 'epoch': 1.06} +2025-05-11 01:18:31 - ERROR - stderr - 35%|███▌ | 1322/3741 [7:52:38<13:36:01, 20.24s/it] +2025-05-11 01:18:55 - ERROR - stderr - 35%|███▌ | 1323/3741 [7:53:01<14:11:29, 21.13s/it] +2025-05-11 01:18:55 - ERROR - stderr - +2025-05-11 01:18:55 - ERROR - stderr - +2025-05-11 01:18:55 - INFO - stdout - {'loss': 0.7563, 'grad_norm': 0.6786977052688599, 'learning_rate': 1.4994999720149008e-05, 'epoch': 1.06} +2025-05-11 01:18:55 - ERROR - stderr - 35%|███▌ | 1323/3741 [7:53:01<14:11:29, 21.13s/it] +2025-05-11 01:19:14 - ERROR - stderr - 35%|███▌ | 1324/3741 [7:53:20<13:50:06, 20.61s/it] +2025-05-11 01:19:14 - ERROR - stderr - +2025-05-11 01:19:14 - ERROR - stderr - +2025-05-11 01:19:14 - INFO - stdout - {'loss': 0.7366, 'grad_norm': 0.6650587320327759, 'learning_rate': 1.4987496179709226e-05, 'epoch': 1.06} +2025-05-11 01:19:14 - ERROR - stderr - 35%|███▌ | 1324/3741 [7:53:20<13:50:06, 20.61s/it] +2025-05-11 01:19:38 - ERROR - stderr - 35%|███▌ | 1325/3741 [7:53:44<14:29:25, 21.59s/it] +2025-05-11 01:19:38 - ERROR - stderr - +2025-05-11 01:19:38 - ERROR - stderr - +2025-05-11 01:19:38 - INFO - stdout - {'loss': 0.7738, 'grad_norm': 0.6645624041557312, 'learning_rate': 1.4979988899474955e-05, 'epoch': 1.06} +2025-05-11 01:19:38 - ERROR - stderr - 35%|███▌ | 1325/3741 [7:53:44<14:29:25, 21.59s/it] +2025-05-11 01:19:57 - ERROR - stderr - 35%|███▌ | 1326/3741 [7:54:04<14:03:11, 20.95s/it] +2025-05-11 01:19:57 - ERROR - stderr - +2025-05-11 01:19:57 - ERROR - stderr - +2025-05-11 01:19:57 - INFO - stdout - {'loss': 0.7404, 'grad_norm': 0.622387170791626, 'learning_rate': 1.4972477885075404e-05, 'epoch': 1.06} +2025-05-11 01:19:57 - ERROR - stderr - 35%|███▌ | 1326/3741 [7:54:04<14:03:11, 20.95s/it] +2025-05-11 01:20:17 - ERROR - stderr - 35%|███▌ | 1327/3741 [7:54:23<13:44:36, 20.50s/it] +2025-05-11 01:20:17 - ERROR - stderr - +2025-05-11 01:20:17 - ERROR - stderr - +2025-05-11 01:20:17 - INFO - stdout - {'loss': 0.7977, 'grad_norm': 0.6579604148864746, 'learning_rate': 1.4964963142142597e-05, 'epoch': 1.06} +2025-05-11 01:20:17 - ERROR - stderr - 35%|███▌ | 1327/3741 [7:54:23<13:44:36, 20.50s/it] +2025-05-11 01:20:38 - ERROR - stderr - 35%|███▌ | 1328/3741 [7:54:45<13:56:42, 20.80s/it] +2025-05-11 01:20:38 - ERROR - stderr - +2025-05-11 01:20:38 - ERROR - stderr - +2025-05-11 01:20:38 - INFO - stdout - {'loss': 0.7356, 'grad_norm': 0.6148852109909058, 'learning_rate': 1.4957444676311333e-05, 'epoch': 1.06} +2025-05-11 01:20:38 - ERROR - stderr - 35%|███▌ | 1328/3741 [7:54:45<13:56:42, 20.80s/it] +2025-05-11 01:20:58 - ERROR - stderr - 36%|███▌ | 1329/3741 [7:55:04<13:41:53, 20.44s/it] +2025-05-11 01:20:58 - ERROR - stderr - +2025-05-11 01:20:58 - ERROR - stderr - +2025-05-11 01:20:58 - INFO - stdout - {'loss': 0.792, 'grad_norm': 0.7013448476791382, 'learning_rate': 1.494992249321922e-05, 'epoch': 1.07} +2025-05-11 01:20:58 - ERROR - stderr - 36%|███▌ | 1329/3741 [7:55:04<13:41:53, 20.44s/it] +2025-05-11 01:21:22 - ERROR - stderr - 36%|███▌ | 1330/3741 [7:55:29<14:30:08, 21.65s/it] +2025-05-11 01:21:22 - ERROR - stderr - +2025-05-11 01:21:22 - ERROR - stderr - +2025-05-11 01:21:22 - INFO - stdout - {'loss': 0.7947, 'grad_norm': 0.6262637376785278, 'learning_rate': 1.4942396598506643e-05, 'epoch': 1.07} +2025-05-11 01:21:22 - ERROR - stderr - 36%|███▌ | 1330/3741 [7:55:29<14:30:08, 21.65s/it] +2025-05-11 01:21:42 - ERROR - stderr - 36%|███▌ | 1331/3741 [7:55:48<14:03:17, 20.99s/it] +2025-05-11 01:21:42 - ERROR - stderr - +2025-05-11 01:21:42 - ERROR - stderr - +2025-05-11 01:21:42 - INFO - stdout - {'loss': 0.756, 'grad_norm': 0.6252999901771545, 'learning_rate': 1.4934866997816779e-05, 'epoch': 1.07} +2025-05-11 01:21:42 - ERROR - stderr - 36%|███▌ | 1331/3741 [7:55:48<14:03:17, 20.99s/it] +2025-05-11 01:22:03 - ERROR - stderr - 36%|███▌ | 1332/3741 [7:56:09<14:05:17, 21.05s/it] +2025-05-11 01:22:03 - ERROR - stderr - +2025-05-11 01:22:03 - ERROR - stderr - +2025-05-11 01:22:03 - INFO - stdout - {'loss': 0.7121, 'grad_norm': 0.6794742345809937, 'learning_rate': 1.4927333696795581e-05, 'epoch': 1.07} +2025-05-11 01:22:03 - ERROR - stderr - 36%|███▌ | 1332/3741 [7:56:09<14:05:17, 21.05s/it] +2025-05-11 01:22:23 - ERROR - stderr - 36%|███▌ | 1333/3741 [7:56:30<13:57:05, 20.86s/it] +2025-05-11 01:22:23 - ERROR - stderr - +2025-05-11 01:22:23 - ERROR - stderr - +2025-05-11 01:22:23 - INFO - stdout - {'loss': 0.7567, 'grad_norm': 0.6507592797279358, 'learning_rate': 1.4919796701091767e-05, 'epoch': 1.07} +2025-05-11 01:22:23 - ERROR - stderr - 36%|███▌ | 1333/3741 [7:56:30<13:57:05, 20.86s/it] +2025-05-11 01:22:43 - ERROR - stderr - 36%|███▌ | 1334/3741 [7:56:49<13:40:33, 20.45s/it] +2025-05-11 01:22:43 - ERROR - stderr - +2025-05-11 01:22:43 - ERROR - stderr - +2025-05-11 01:22:43 - INFO - stdout - {'loss': 0.7232, 'grad_norm': 0.6619201898574829, 'learning_rate': 1.4912256016356837e-05, 'epoch': 1.07} +2025-05-11 01:22:43 - ERROR - stderr - 36%|███▌ | 1334/3741 [7:56:49<13:40:33, 20.45s/it] +2025-05-11 01:23:05 - ERROR - stderr - 36%|███▌ | 1335/3741 [7:57:12<14:03:59, 21.05s/it] +2025-05-11 01:23:05 - ERROR - stderr - +2025-05-11 01:23:05 - ERROR - stderr - +2025-05-11 01:23:05 - INFO - stdout - {'loss': 0.7477, 'grad_norm': 0.628643274307251, 'learning_rate': 1.4904711648245053e-05, 'epoch': 1.07} +2025-05-11 01:23:05 - ERROR - stderr - 36%|███▌ | 1335/3741 [7:57:12<14:03:59, 21.05s/it] +2025-05-11 01:23:25 - ERROR - stderr - 36%|███▌ | 1336/3741 [7:57:31<13:42:00, 20.51s/it] +2025-05-11 01:23:25 - ERROR - stderr - +2025-05-11 01:23:25 - ERROR - stderr - +2025-05-11 01:23:25 - INFO - stdout - {'loss': 0.8047, 'grad_norm': 0.6922639608383179, 'learning_rate': 1.4897163602413438e-05, 'epoch': 1.07} +2025-05-11 01:23:25 - ERROR - stderr - 36%|███▌ | 1336/3741 [7:57:31<13:42:00, 20.51s/it] +2025-05-11 01:23:47 - ERROR - stderr - 36%|███▌ | 1337/3741 [7:57:54<14:08:38, 21.18s/it] +2025-05-11 01:23:47 - ERROR - stderr - +2025-05-11 01:23:47 - ERROR - stderr - +2025-05-11 01:23:47 - INFO - stdout - {'loss': 0.7624, 'grad_norm': 0.6198031306266785, 'learning_rate': 1.4889611884521777e-05, 'epoch': 1.07} +2025-05-11 01:23:47 - ERROR - stderr - 36%|███▌ | 1337/3741 [7:57:54<14:08:38, 21.18s/it] +2025-05-11 01:23:48 - INFO - stdout - WARNING: tokenization mismatch: 3227 vs. 3245. (ignored) +2025-05-11 01:24:07 - ERROR - stderr - 36%|███▌ | 1338/3741 [7:58:14<13:54:07, 20.83s/it] +2025-05-11 01:24:07 - ERROR - stderr - +2025-05-11 01:24:07 - ERROR - stderr - +2025-05-11 01:24:07 - INFO - stdout - {'loss': 0.8046, 'grad_norm': 0.6809404492378235, 'learning_rate': 1.4882056500232604e-05, 'epoch': 1.07} +2025-05-11 01:24:07 - ERROR - stderr - 36%|███▌ | 1338/3741 [7:58:14<13:54:07, 20.83s/it] +2025-05-11 01:24:27 - ERROR - stderr - 36%|███▌ | 1339/3741 [7:58:34<13:42:50, 20.55s/it] +2025-05-11 01:24:27 - ERROR - stderr - +2025-05-11 01:24:27 - ERROR - stderr - +2025-05-11 01:24:27 - INFO - stdout - {'loss': 0.7846, 'grad_norm': 0.6459743976593018, 'learning_rate': 1.4874497455211203e-05, 'epoch': 1.07} +2025-05-11 01:24:27 - ERROR - stderr - 36%|███▌ | 1339/3741 [7:58:34<13:42:50, 20.55s/it] +2025-05-11 01:24:47 - ERROR - stderr - 36%|███▌ | 1340/3741 [7:58:54<13:36:39, 20.41s/it] +2025-05-11 01:24:47 - ERROR - stderr - +2025-05-11 01:24:47 - ERROR - stderr - +2025-05-11 01:24:47 - INFO - stdout - {'loss': 0.7692, 'grad_norm': 0.6424413323402405, 'learning_rate': 1.48669347551256e-05, 'epoch': 1.07} +2025-05-11 01:24:47 - ERROR - stderr - 36%|███▌ | 1340/3741 [7:58:54<13:36:39, 20.41s/it] +2025-05-11 01:25:07 - ERROR - stderr - 36%|███▌ | 1341/3741 [7:59:13<13:27:34, 20.19s/it] +2025-05-11 01:25:07 - ERROR - stderr - +2025-05-11 01:25:07 - ERROR - stderr - +2025-05-11 01:25:07 - INFO - stdout - {'loss': 0.774, 'grad_norm': 0.7114162445068359, 'learning_rate': 1.4859368405646568e-05, 'epoch': 1.08} +2025-05-11 01:25:07 - ERROR - stderr - 36%|███▌ | 1341/3741 [7:59:13<13:27:34, 20.19s/it] +2025-05-11 01:25:30 - ERROR - stderr - 36%|███▌ | 1342/3741 [7:59:37<14:04:59, 21.13s/it] +2025-05-11 01:25:30 - ERROR - stderr - +2025-05-11 01:25:30 - ERROR - stderr - +2025-05-11 01:25:30 - INFO - stdout - {'loss': 0.7763, 'grad_norm': 0.658915638923645, 'learning_rate': 1.485179841244762e-05, 'epoch': 1.08} +2025-05-11 01:25:30 - ERROR - stderr - 36%|███▌ | 1342/3741 [7:59:37<14:04:59, 21.13s/it] +2025-05-11 01:25:50 - ERROR - stderr - 36%|███▌ | 1343/3741 [7:59:56<13:44:18, 20.62s/it] +2025-05-11 01:25:50 - ERROR - stderr - +2025-05-11 01:25:50 - ERROR - stderr - +2025-05-11 01:25:50 - INFO - stdout - {'loss': 0.7907, 'grad_norm': 0.6556907296180725, 'learning_rate': 1.484422478120498e-05, 'epoch': 1.08} +2025-05-11 01:25:50 - ERROR - stderr - 36%|███▌ | 1343/3741 [7:59:56<13:44:18, 20.62s/it] +2025-05-11 01:26:13 - ERROR - stderr - 36%|███▌ | 1344/3741 [8:00:19<14:15:31, 21.42s/it] +2025-05-11 01:26:13 - ERROR - stderr - +2025-05-11 01:26:13 - ERROR - stderr - +2025-05-11 01:26:13 - INFO - stdout - {'loss': 0.7479, 'grad_norm': 0.6900550723075867, 'learning_rate': 1.4836647517597627e-05, 'epoch': 1.08} +2025-05-11 01:26:13 - ERROR - stderr - 36%|███▌ | 1344/3741 [8:00:19<14:15:31, 21.42s/it] +2025-05-11 01:26:33 - ERROR - stderr - 36%|███▌ | 1345/3741 [8:00:39<13:54:39, 20.90s/it] +2025-05-11 01:26:33 - ERROR - stderr - +2025-05-11 01:26:33 - ERROR - stderr - +2025-05-11 01:26:33 - INFO - stdout - {'loss': 0.7893, 'grad_norm': 0.7201621532440186, 'learning_rate': 1.4829066627307246e-05, 'epoch': 1.08} +2025-05-11 01:26:33 - ERROR - stderr - 36%|███▌ | 1345/3741 [8:00:39<13:54:39, 20.90s/it] +2025-05-11 01:26:53 - ERROR - stderr - 36%|███▌ | 1346/3741 [8:00:59<13:43:07, 20.62s/it] +2025-05-11 01:26:53 - ERROR - stderr - +2025-05-11 01:26:53 - ERROR - stderr - +2025-05-11 01:26:53 - INFO - stdout - {'loss': 0.7821, 'grad_norm': 0.6671075820922852, 'learning_rate': 1.4821482116018251e-05, 'epoch': 1.08} +2025-05-11 01:26:53 - ERROR - stderr - 36%|███▌ | 1346/3741 [8:00:59<13:43:07, 20.62s/it] +2025-05-11 01:27:13 - ERROR - stderr - 36%|███▌ | 1347/3741 [8:01:19<13:38:42, 20.52s/it] +2025-05-11 01:27:13 - ERROR - stderr - +2025-05-11 01:27:13 - ERROR - stderr - +2025-05-11 01:27:13 - INFO - stdout - {'loss': 0.7846, 'grad_norm': 0.7838239669799805, 'learning_rate': 1.4813893989417762e-05, 'epoch': 1.08} +2025-05-11 01:27:13 - ERROR - stderr - 36%|███▌ | 1347/3741 [8:01:19<13:38:42, 20.52s/it] +2025-05-11 01:27:33 - ERROR - stderr - 36%|███▌ | 1348/3741 [8:01:39<13:27:46, 20.25s/it] +2025-05-11 01:27:33 - ERROR - stderr - +2025-05-11 01:27:33 - ERROR - stderr - +2025-05-11 01:27:33 - INFO - stdout - {'loss': 0.7694, 'grad_norm': 0.6680654287338257, 'learning_rate': 1.4806302253195617e-05, 'epoch': 1.08} +2025-05-11 01:27:33 - ERROR - stderr - 36%|███▌ | 1348/3741 [8:01:39<13:27:46, 20.25s/it] +2025-05-11 01:27:56 - ERROR - stderr - 36%|███▌ | 1349/3741 [8:02:03<14:09:31, 21.31s/it] +2025-05-11 01:27:56 - ERROR - stderr - +2025-05-11 01:27:56 - ERROR - stderr - +2025-05-11 01:27:56 - INFO - stdout - {'loss': 0.7297, 'grad_norm': 0.6512035131454468, 'learning_rate': 1.4798706913044357e-05, 'epoch': 1.08} +2025-05-11 01:27:56 - ERROR - stderr - 36%|███▌ | 1349/3741 [8:02:03<14:09:31, 21.31s/it] +2025-05-11 01:28:16 - ERROR - stderr - 36%|███▌ | 1350/3741 [8:02:22<13:45:52, 20.72s/it] +2025-05-11 01:28:16 - ERROR - stderr - +2025-05-11 01:28:16 - ERROR - stderr - +2025-05-11 01:28:16 - INFO - stdout - {'loss': 0.7998, 'grad_norm': 0.6682960391044617, 'learning_rate': 1.4791107974659229e-05, 'epoch': 1.08} +2025-05-11 01:28:16 - ERROR - stderr - 36%|███▌ | 1350/3741 [8:02:22<13:45:52, 20.72s/it] +2025-05-11 01:28:39 - ERROR - stderr - 36%|███▌ | 1351/3741 [8:02:45<14:17:57, 21.54s/it] +2025-05-11 01:28:39 - ERROR - stderr - +2025-05-11 01:28:39 - ERROR - stderr - +2025-05-11 01:28:39 - INFO - stdout - {'loss': 0.7683, 'grad_norm': 0.7098135948181152, 'learning_rate': 1.4783505443738173e-05, 'epoch': 1.08} +2025-05-11 01:28:39 - ERROR - stderr - 36%|███▌ | 1351/3741 [8:02:46<14:17:57, 21.54s/it] +2025-05-11 01:28:59 - ERROR - stderr - 36%|███▌ | 1352/3741 [8:03:05<13:54:39, 20.96s/it] +2025-05-11 01:28:59 - ERROR - stderr - +2025-05-11 01:28:59 - ERROR - stderr - +2025-05-11 01:28:59 - INFO - stdout - {'loss': 0.7553, 'grad_norm': 0.6800927519798279, 'learning_rate': 1.4775899325981828e-05, 'epoch': 1.08} +2025-05-11 01:28:59 - ERROR - stderr - 36%|███▌ | 1352/3741 [8:03:05<13:54:39, 20.96s/it] +2025-05-11 01:29:18 - ERROR - stderr - 36%|███▌ | 1353/3741 [8:03:25<13:37:53, 20.55s/it] +2025-05-11 01:29:18 - ERROR - stderr - +2025-05-11 01:29:18 - ERROR - stderr - +2025-05-11 01:29:18 - INFO - stdout - {'loss': 0.772, 'grad_norm': 0.6061440110206604, 'learning_rate': 1.476828962709352e-05, 'epoch': 1.09} +2025-05-11 01:29:18 - ERROR - stderr - 36%|███▌ | 1353/3741 [8:03:25<13:37:53, 20.55s/it] +2025-05-11 01:29:39 - ERROR - stderr - 36%|███▌ | 1354/3741 [8:03:45<13:39:06, 20.59s/it] +2025-05-11 01:29:39 - ERROR - stderr - +2025-05-11 01:29:39 - ERROR - stderr - +2025-05-11 01:29:39 - INFO - stdout - {'loss': 0.8075, 'grad_norm': 0.6747270226478577, 'learning_rate': 1.4760676352779258e-05, 'epoch': 1.09} +2025-05-11 01:29:39 - ERROR - stderr - 36%|███▌ | 1354/3741 [8:03:45<13:39:06, 20.59s/it] +2025-05-11 01:29:59 - ERROR - stderr - 36%|███▌ | 1355/3741 [8:04:05<13:28:39, 20.34s/it] +2025-05-11 01:29:59 - ERROR - stderr - +2025-05-11 01:29:59 - ERROR - stderr - +2025-05-11 01:29:59 - INFO - stdout - {'loss': 0.8008, 'grad_norm': 0.6570102572441101, 'learning_rate': 1.4753059508747738e-05, 'epoch': 1.09} +2025-05-11 01:29:59 - ERROR - stderr - 36%|███▌ | 1355/3741 [8:04:05<13:28:39, 20.34s/it] +2025-05-11 01:30:22 - ERROR - stderr - 36%|███▌ | 1356/3741 [8:04:29<14:06:42, 21.30s/it] +2025-05-11 01:30:22 - ERROR - stderr - +2025-05-11 01:30:22 - ERROR - stderr - +2025-05-11 01:30:22 - INFO - stdout - {'loss': 0.7605, 'grad_norm': 0.6908283233642578, 'learning_rate': 1.4745439100710326e-05, 'epoch': 1.09} +2025-05-11 01:30:22 - ERROR - stderr - 36%|███▌ | 1356/3741 [8:04:29<14:06:42, 21.30s/it] +2025-05-11 01:30:42 - ERROR - stderr - 36%|███▋ | 1357/3741 [8:04:48<13:45:47, 20.78s/it] +2025-05-11 01:30:42 - ERROR - stderr - +2025-05-11 01:30:42 - ERROR - stderr - +2025-05-11 01:30:42 - INFO - stdout - {'loss': 0.746, 'grad_norm': 0.6615950465202332, 'learning_rate': 1.4737815134381066e-05, 'epoch': 1.09} +2025-05-11 01:30:42 - ERROR - stderr - 36%|███▋ | 1357/3741 [8:04:48<13:45:47, 20.78s/it] +2025-05-11 01:31:06 - ERROR - stderr - 36%|███▋ | 1358/3741 [8:05:12<14:26:13, 21.81s/it] +2025-05-11 01:31:06 - ERROR - stderr - +2025-05-11 01:31:06 - ERROR - stderr - +2025-05-11 01:31:06 - INFO - stdout - {'loss': 0.7629, 'grad_norm': 0.6627095937728882, 'learning_rate': 1.4730187615476663e-05, 'epoch': 1.09} +2025-05-11 01:31:06 - ERROR - stderr - 36%|███▋ | 1358/3741 [8:05:12<14:26:13, 21.81s/it] +2025-05-11 01:31:26 - ERROR - stderr - 36%|███▋ | 1359/3741 [8:05:32<13:57:54, 21.11s/it] +2025-05-11 01:31:26 - ERROR - stderr - +2025-05-11 01:31:26 - ERROR - stderr - +2025-05-11 01:31:26 - INFO - stdout - {'loss': 0.7637, 'grad_norm': 0.7005937099456787, 'learning_rate': 1.4722556549716495e-05, 'epoch': 1.09} +2025-05-11 01:31:26 - ERROR - stderr - 36%|███▋ | 1359/3741 [8:05:32<13:57:54, 21.11s/it] +2025-05-11 01:31:45 - ERROR - stderr - 36%|███▋ | 1360/3741 [8:05:52<13:41:13, 20.69s/it] +2025-05-11 01:31:45 - ERROR - stderr - +2025-05-11 01:31:45 - ERROR - stderr - +2025-05-11 01:31:45 - INFO - stdout - {'loss': 0.7745, 'grad_norm': 0.7017346620559692, 'learning_rate': 1.4714921942822593e-05, 'epoch': 1.09} +2025-05-11 01:31:45 - ERROR - stderr - 36%|███▋ | 1360/3741 [8:05:52<13:41:13, 20.69s/it] +2025-05-11 01:32:05 - ERROR - stderr - 36%|███▋ | 1361/3741 [8:06:11<13:30:41, 20.44s/it] +2025-05-11 01:32:05 - ERROR - stderr - +2025-05-11 01:32:05 - ERROR - stderr - +2025-05-11 01:32:05 - INFO - stdout - {'loss': 0.7665, 'grad_norm': 0.6601778268814087, 'learning_rate': 1.4707283800519647e-05, 'epoch': 1.09} +2025-05-11 01:32:05 - ERROR - stderr - 36%|███▋ | 1361/3741 [8:06:12<13:30:41, 20.44s/it] +2025-05-11 01:32:25 - ERROR - stderr - 36%|███▋ | 1362/3741 [8:06:31<13:19:32, 20.17s/it] +2025-05-11 01:32:25 - ERROR - stderr - +2025-05-11 01:32:25 - ERROR - stderr - +2025-05-11 01:32:25 - INFO - stdout - {'loss': 0.8088, 'grad_norm': 0.6817474961280823, 'learning_rate': 1.4699642128534994e-05, 'epoch': 1.09} +2025-05-11 01:32:25 - ERROR - stderr - 36%|███▋ | 1362/3741 [8:06:31<13:19:32, 20.17s/it] +2025-05-11 01:32:48 - ERROR - stderr - 36%|███▋ | 1363/3741 [8:06:54<13:54:20, 21.05s/it] +2025-05-11 01:32:48 - ERROR - stderr - +2025-05-11 01:32:48 - ERROR - stderr - +2025-05-11 01:32:48 - INFO - stdout - {'loss': 0.7555, 'grad_norm': 0.6907531023025513, 'learning_rate': 1.4691996932598621e-05, 'epoch': 1.09} +2025-05-11 01:32:48 - ERROR - stderr - 36%|███▋ | 1363/3741 [8:06:54<13:54:20, 21.05s/it] +2025-05-11 01:33:07 - ERROR - stderr - 36%|███▋ | 1364/3741 [8:07:14<13:35:05, 20.57s/it] +2025-05-11 01:33:07 - ERROR - stderr - +2025-05-11 01:33:07 - ERROR - stderr - +2025-05-11 01:33:07 - INFO - stdout - {'loss': 0.7749, 'grad_norm': 0.7029712796211243, 'learning_rate': 1.4684348218443159e-05, 'epoch': 1.09} +2025-05-11 01:33:07 - ERROR - stderr - 36%|███▋ | 1364/3741 [8:07:14<13:35:05, 20.57s/it] +2025-05-11 01:33:31 - ERROR - stderr - 36%|███▋ | 1365/3741 [8:07:38<14:15:08, 21.59s/it] +2025-05-11 01:33:31 - ERROR - stderr - +2025-05-11 01:33:31 - ERROR - stderr - +2025-05-11 01:33:31 - INFO - stdout - {'loss': 0.7931, 'grad_norm': 0.7028645873069763, 'learning_rate': 1.4676695991803869e-05, 'epoch': 1.09} +2025-05-11 01:33:31 - ERROR - stderr - 36%|███▋ | 1365/3741 [8:07:38<14:15:08, 21.59s/it] +2025-05-11 01:33:51 - ERROR - stderr - 37%|███▋ | 1366/3741 [8:07:57<13:51:30, 21.01s/it] +2025-05-11 01:33:51 - ERROR - stderr - +2025-05-11 01:33:51 - ERROR - stderr - +2025-05-11 01:33:51 - INFO - stdout - {'loss': 0.7675, 'grad_norm': 0.6735509634017944, 'learning_rate': 1.4669040258418652e-05, 'epoch': 1.1} +2025-05-11 01:33:51 - ERROR - stderr - 37%|███▋ | 1366/3741 [8:07:57<13:51:30, 21.01s/it] +2025-05-11 01:34:14 - ERROR - stderr - 37%|███▋ | 1367/3741 [8:08:21<14:19:20, 21.72s/it] +2025-05-11 01:34:14 - ERROR - stderr - +2025-05-11 01:34:14 - ERROR - stderr - +2025-05-11 01:34:14 - INFO - stdout - {'loss': 0.7434, 'grad_norm': 0.6408675909042358, 'learning_rate': 1.4661381024028042e-05, 'epoch': 1.1} +2025-05-11 01:34:14 - ERROR - stderr - 37%|███▋ | 1367/3741 [8:08:21<14:19:20, 21.72s/it] +2025-05-11 01:34:34 - ERROR - stderr - 37%|███▋ | 1368/3741 [8:08:40<13:51:36, 21.03s/it] +2025-05-11 01:34:34 - ERROR - stderr - +2025-05-11 01:34:34 - ERROR - stderr - +2025-05-11 01:34:34 - INFO - stdout - {'loss': 0.782, 'grad_norm': 0.6668729186058044, 'learning_rate': 1.4653718294375192e-05, 'epoch': 1.1} +2025-05-11 01:34:34 - ERROR - stderr - 37%|███▋ | 1368/3741 [8:08:40<13:51:36, 21.03s/it] +2025-05-11 01:34:53 - ERROR - stderr - 37%|███▋ | 1369/3741 [8:08:59<13:32:02, 20.54s/it] +2025-05-11 01:34:53 - ERROR - stderr - +2025-05-11 01:34:53 - ERROR - stderr - +2025-05-11 01:34:53 - INFO - stdout - {'loss': 0.7711, 'grad_norm': 0.7412964701652527, 'learning_rate': 1.4646052075205874e-05, 'epoch': 1.1} +2025-05-11 01:34:53 - ERROR - stderr - 37%|███▋ | 1369/3741 [8:08:59<13:32:02, 20.54s/it] +2025-05-11 01:35:14 - ERROR - stderr - 37%|███▋ | 1370/3741 [8:09:20<13:33:37, 20.59s/it] +2025-05-11 01:35:14 - ERROR - stderr - +2025-05-11 01:35:14 - ERROR - stderr - +2025-05-11 01:35:14 - INFO - stdout - {'loss': 0.7949, 'grad_norm': 0.6989220976829529, 'learning_rate': 1.4638382372268484e-05, 'epoch': 1.1} +2025-05-11 01:35:14 - ERROR - stderr - 37%|███▋ | 1370/3741 [8:09:20<13:33:37, 20.59s/it] +2025-05-11 01:35:33 - ERROR - stderr - 37%|███▋ | 1371/3741 [8:09:39<13:17:12, 20.18s/it] +2025-05-11 01:35:33 - ERROR - stderr - +2025-05-11 01:35:33 - ERROR - stderr - +2025-05-11 01:35:33 - INFO - stdout - {'loss': 0.7403, 'grad_norm': 0.6390843987464905, 'learning_rate': 1.4630709191314026e-05, 'epoch': 1.1} +2025-05-11 01:35:33 - ERROR - stderr - 37%|███▋ | 1371/3741 [8:09:39<13:17:12, 20.18s/it] +2025-05-11 01:35:56 - ERROR - stderr - 37%|███▋ | 1372/3741 [8:10:03<13:54:52, 21.14s/it] +2025-05-11 01:35:56 - ERROR - stderr - +2025-05-11 01:35:56 - ERROR - stderr - +2025-05-11 01:35:56 - INFO - stdout - {'loss': 0.7627, 'grad_norm': 0.6512402892112732, 'learning_rate': 1.462303253809611e-05, 'epoch': 1.1} +2025-05-11 01:35:56 - ERROR - stderr - 37%|███▋ | 1372/3741 [8:10:03<13:54:52, 21.14s/it] +2025-05-11 01:36:16 - ERROR - stderr - 37%|███▋ | 1373/3741 [8:10:22<13:34:09, 20.63s/it] +2025-05-11 01:36:16 - ERROR - stderr - +2025-05-11 01:36:16 - ERROR - stderr - +2025-05-11 01:36:16 - INFO - stdout - {'loss': 0.7596, 'grad_norm': 0.6433535218238831, 'learning_rate': 1.4615352418370958e-05, 'epoch': 1.1} +2025-05-11 01:36:16 - ERROR - stderr - 37%|███▋ | 1373/3741 [8:10:22<13:34:09, 20.63s/it] +2025-05-11 01:36:40 - ERROR - stderr - 37%|███▋ | 1374/3741 [8:10:47<14:17:49, 21.74s/it] +2025-05-11 01:36:40 - ERROR - stderr - +2025-05-11 01:36:40 - ERROR - stderr - +2025-05-11 01:36:40 - INFO - stdout - {'loss': 0.7647, 'grad_norm': 0.6682513356208801, 'learning_rate': 1.460766883789738e-05, 'epoch': 1.1} +2025-05-11 01:36:40 - ERROR - stderr - 37%|███▋ | 1374/3741 [8:10:47<14:17:49, 21.74s/it] +2025-05-11 01:37:00 - ERROR - stderr - 37%|███▋ | 1375/3741 [8:11:06<13:52:26, 21.11s/it] +2025-05-11 01:37:00 - ERROR - stderr - +2025-05-11 01:37:00 - ERROR - stderr - +2025-05-11 01:37:00 - INFO - stdout - {'loss': 0.7692, 'grad_norm': 0.6825112104415894, 'learning_rate': 1.4599981802436785e-05, 'epoch': 1.1} +2025-05-11 01:37:00 - ERROR - stderr - 37%|███▋ | 1375/3741 [8:11:06<13:52:26, 21.11s/it] +2025-05-11 01:37:20 - ERROR - stderr - 37%|███▋ | 1376/3741 [8:11:26<13:37:04, 20.73s/it] +2025-05-11 01:37:20 - ERROR - stderr - +2025-05-11 01:37:20 - ERROR - stderr - +2025-05-11 01:37:20 - INFO - stdout - {'loss': 0.7661, 'grad_norm': 0.6553147435188293, 'learning_rate': 1.4592291317753178e-05, 'epoch': 1.1} +2025-05-11 01:37:20 - ERROR - stderr - 37%|███▋ | 1376/3741 [8:11:26<13:37:04, 20.73s/it] +2025-05-11 01:37:40 - ERROR - stderr - 37%|███▋ | 1377/3741 [8:11:46<13:29:57, 20.56s/it] +2025-05-11 01:37:40 - ERROR - stderr - +2025-05-11 01:37:40 - ERROR - stderr - +2025-05-11 01:37:40 - INFO - stdout - {'loss': 0.7896, 'grad_norm': 0.688605010509491, 'learning_rate': 1.4584597389613144e-05, 'epoch': 1.1} +2025-05-11 01:37:40 - ERROR - stderr - 37%|███▋ | 1377/3741 [8:11:46<13:29:57, 20.56s/it] +2025-05-11 01:37:59 - ERROR - stderr - 37%|███▋ | 1378/3741 [8:12:06<13:15:56, 20.21s/it] +2025-05-11 01:37:59 - ERROR - stderr - +2025-05-11 01:37:59 - ERROR - stderr - +2025-05-11 01:37:59 - INFO - stdout - {'loss': 0.776, 'grad_norm': 0.6833084225654602, 'learning_rate': 1.4576900023785853e-05, 'epoch': 1.11} +2025-05-11 01:37:59 - ERROR - stderr - 37%|███▋ | 1378/3741 [8:12:06<13:15:56, 20.21s/it] +2025-05-11 01:38:23 - ERROR - stderr - 37%|███▋ | 1379/3741 [8:12:29<13:59:11, 21.32s/it] +2025-05-11 01:38:23 - ERROR - stderr - +2025-05-11 01:38:23 - ERROR - stderr - +2025-05-11 01:38:23 - INFO - stdout - {'loss': 0.7468, 'grad_norm': 0.6186316013336182, 'learning_rate': 1.4569199226043051e-05, 'epoch': 1.11} +2025-05-11 01:38:23 - ERROR - stderr - 37%|███▋ | 1379/3741 [8:12:29<13:59:11, 21.32s/it] +2025-05-11 01:38:43 - ERROR - stderr - 37%|███▋ | 1380/3741 [8:12:49<13:39:01, 20.81s/it] +2025-05-11 01:38:43 - ERROR - stderr - +2025-05-11 01:38:43 - ERROR - stderr - +2025-05-11 01:38:43 - INFO - stdout - {'loss': 0.7954, 'grad_norm': 0.6914650201797485, 'learning_rate': 1.4561495002159066e-05, 'epoch': 1.11} +2025-05-11 01:38:43 - ERROR - stderr - 37%|███▋ | 1380/3741 [8:12:49<13:39:01, 20.81s/it] +2025-05-11 01:39:08 - ERROR - stderr - 37%|███▋ | 1381/3741 [8:13:15<14:35:21, 22.25s/it] +2025-05-11 01:39:08 - ERROR - stderr - +2025-05-11 01:39:08 - ERROR - stderr - +2025-05-11 01:39:08 - INFO - stdout - {'loss': 0.7775, 'grad_norm': 0.6579850912094116, 'learning_rate': 1.4553787357910774e-05, 'epoch': 1.11} +2025-05-11 01:39:08 - ERROR - stderr - 37%|███▋ | 1381/3741 [8:13:15<14:35:21, 22.25s/it] +2025-05-11 01:39:28 - ERROR - stderr - 37%|███▋ | 1382/3741 [8:13:34<14:03:19, 21.45s/it] +2025-05-11 01:39:28 - ERROR - stderr - +2025-05-11 01:39:28 - ERROR - stderr - +2025-05-11 01:39:28 - INFO - stdout - {'loss': 0.7601, 'grad_norm': 0.6452553868293762, 'learning_rate': 1.4546076299077639e-05, 'epoch': 1.11} +2025-05-11 01:39:28 - ERROR - stderr - 37%|███▋ | 1382/3741 [8:13:34<14:03:19, 21.45s/it] +2025-05-11 01:39:52 - ERROR - stderr - 37%|███▋ | 1383/3741 [8:13:58<14:33:01, 22.21s/it] +2025-05-11 01:39:52 - ERROR - stderr - +2025-05-11 01:39:52 - ERROR - stderr - +2025-05-11 01:39:52 - INFO - stdout - {'loss': 0.7614, 'grad_norm': 0.654435396194458, 'learning_rate': 1.4538361831441672e-05, 'epoch': 1.11} +2025-05-11 01:39:52 - ERROR - stderr - 37%|███▋ | 1383/3741 [8:13:58<14:33:01, 22.21s/it] +2025-05-11 01:40:12 - ERROR - stderr - 37%|███▋ | 1384/3741 [8:14:18<14:01:23, 21.42s/it] +2025-05-11 01:40:12 - ERROR - stderr - +2025-05-11 01:40:12 - ERROR - stderr - +2025-05-11 01:40:12 - INFO - stdout - {'loss': 0.7705, 'grad_norm': 0.6667703986167908, 'learning_rate': 1.4530643960787445e-05, 'epoch': 1.11} +2025-05-11 01:40:12 - ERROR - stderr - 37%|███▋ | 1384/3741 [8:14:18<14:01:23, 21.42s/it] +2025-05-11 01:40:32 - ERROR - stderr - 37%|███▋ | 1385/3741 [8:14:38<13:46:10, 21.04s/it] +2025-05-11 01:40:32 - ERROR - stderr - +2025-05-11 01:40:32 - ERROR - stderr - +2025-05-11 01:40:32 - INFO - stdout - {'loss': 0.8051, 'grad_norm': 0.6765471696853638, 'learning_rate': 1.452292269290208e-05, 'epoch': 1.11} +2025-05-11 01:40:32 - ERROR - stderr - 37%|███▋ | 1385/3741 [8:14:38<13:46:10, 21.04s/it] +2025-05-11 01:40:51 - ERROR - stderr - 37%|███▋ | 1386/3741 [8:14:57<13:26:02, 20.54s/it] +2025-05-11 01:40:51 - ERROR - stderr - +2025-05-11 01:40:51 - ERROR - stderr - +2025-05-11 01:40:51 - INFO - stdout - {'loss': 0.7119, 'grad_norm': 0.633200466632843, 'learning_rate': 1.4515198033575243e-05, 'epoch': 1.11} +2025-05-11 01:40:51 - ERROR - stderr - 37%|███▋ | 1386/3741 [8:14:57<13:26:02, 20.54s/it] +2025-05-11 01:41:11 - ERROR - stderr - 37%|███▋ | 1387/3741 [8:15:17<13:15:01, 20.26s/it] +2025-05-11 01:41:11 - ERROR - stderr - +2025-05-11 01:41:11 - ERROR - stderr - +2025-05-11 01:41:11 - INFO - stdout - {'loss': 0.758, 'grad_norm': 0.6916564702987671, 'learning_rate': 1.4507469988599153e-05, 'epoch': 1.11} +2025-05-11 01:41:11 - ERROR - stderr - 37%|███▋ | 1387/3741 [8:15:17<13:15:01, 20.26s/it] +2025-05-11 01:41:32 - ERROR - stderr - 37%|███▋ | 1388/3741 [8:15:39<13:31:03, 20.68s/it] +2025-05-11 01:41:32 - ERROR - stderr - +2025-05-11 01:41:32 - ERROR - stderr - +2025-05-11 01:41:32 - INFO - stdout - {'loss': 0.7795, 'grad_norm': 0.6819466352462769, 'learning_rate': 1.4499738563768557e-05, 'epoch': 1.11} +2025-05-11 01:41:32 - ERROR - stderr - 37%|███▋ | 1388/3741 [8:15:39<13:31:03, 20.68s/it] +2025-05-11 01:41:52 - ERROR - stderr - 37%|███▋ | 1389/3741 [8:15:58<13:18:01, 20.36s/it] +2025-05-11 01:41:52 - ERROR - stderr - +2025-05-11 01:41:52 - ERROR - stderr - +2025-05-11 01:41:52 - INFO - stdout - {'loss': 0.8001, 'grad_norm': 0.6802613735198975, 'learning_rate': 1.4492003764880744e-05, 'epoch': 1.11} +2025-05-11 01:41:52 - ERROR - stderr - 37%|███▋ | 1389/3741 [8:15:58<13:18:01, 20.36s/it] +2025-05-11 01:42:15 - ERROR - stderr - 37%|███▋ | 1390/3741 [8:16:22<13:52:42, 21.25s/it] +2025-05-11 01:42:15 - ERROR - stderr - +2025-05-11 01:42:15 - ERROR - stderr - +2025-05-11 01:42:15 - INFO - stdout - {'loss': 0.7703, 'grad_norm': 0.6491445302963257, 'learning_rate': 1.4484265597735525e-05, 'epoch': 1.11} +2025-05-11 01:42:15 - ERROR - stderr - 37%|███▋ | 1390/3741 [8:16:22<13:52:42, 21.25s/it] +2025-05-11 01:42:35 - ERROR - stderr - 37%|███▋ | 1391/3741 [8:16:41<13:33:50, 20.78s/it] +2025-05-11 01:42:35 - ERROR - stderr - +2025-05-11 01:42:35 - ERROR - stderr - +2025-05-11 01:42:35 - INFO - stdout - {'loss': 0.7764, 'grad_norm': 0.634710431098938, 'learning_rate': 1.4476524068135246e-05, 'epoch': 1.12} +2025-05-11 01:42:35 - ERROR - stderr - 37%|███▋ | 1391/3741 [8:16:41<13:33:50, 20.78s/it] +2025-05-11 01:42:58 - ERROR - stderr - 37%|███▋ | 1392/3741 [8:17:04<13:57:19, 21.39s/it] +2025-05-11 01:42:58 - ERROR - stderr - +2025-05-11 01:42:58 - ERROR - stderr - +2025-05-11 01:42:58 - INFO - stdout - {'loss': 0.7844, 'grad_norm': 0.7030678391456604, 'learning_rate': 1.4468779181884762e-05, 'epoch': 1.12} +2025-05-11 01:42:58 - ERROR - stderr - 37%|███▋ | 1392/3741 [8:17:04<13:57:19, 21.39s/it] +2025-05-11 01:43:17 - ERROR - stderr - 37%|███▋ | 1393/3741 [8:17:24<13:36:26, 20.86s/it] +2025-05-11 01:43:17 - ERROR - stderr - +2025-05-11 01:43:17 - ERROR - stderr - +2025-05-11 01:43:17 - INFO - stdout - {'loss': 0.7452, 'grad_norm': 0.6353664398193359, 'learning_rate': 1.4461030944791464e-05, 'epoch': 1.12} +2025-05-11 01:43:17 - ERROR - stderr - 37%|███▋ | 1393/3741 [8:17:24<13:36:26, 20.86s/it] +2025-05-11 01:43:40 - ERROR - stderr - 37%|███▋ | 1394/3741 [8:17:47<13:59:28, 21.46s/it] +2025-05-11 01:43:40 - ERROR - stderr - +2025-05-11 01:43:40 - ERROR - stderr - +2025-05-11 01:43:40 - INFO - stdout - {'loss': 0.7598, 'grad_norm': 0.696847677230835, 'learning_rate': 1.4453279362665234e-05, 'epoch': 1.12} +2025-05-11 01:43:40 - ERROR - stderr - 37%|███▋ | 1394/3741 [8:17:47<13:59:28, 21.46s/it] +2025-05-11 01:44:00 - ERROR - stderr - 37%|███▋ | 1395/3741 [8:18:06<13:38:22, 20.93s/it] +2025-05-11 01:44:00 - ERROR - stderr - +2025-05-11 01:44:00 - ERROR - stderr - +2025-05-11 01:44:00 - INFO - stdout - {'loss': 0.7681, 'grad_norm': 0.6439919471740723, 'learning_rate': 1.4445524441318477e-05, 'epoch': 1.12} +2025-05-11 01:44:00 - ERROR - stderr - 37%|███▋ | 1395/3741 [8:18:06<13:38:22, 20.93s/it] +2025-05-11 01:44:20 - ERROR - stderr - 37%|███▋ | 1396/3741 [8:18:26<13:25:29, 20.61s/it] +2025-05-11 01:44:20 - ERROR - stderr - +2025-05-11 01:44:20 - ERROR - stderr - +2025-05-11 01:44:20 - INFO - stdout - {'loss': 0.7165, 'grad_norm': 0.6072260737419128, 'learning_rate': 1.4437766186566094e-05, 'epoch': 1.12} +2025-05-11 01:44:20 - ERROR - stderr - 37%|███▋ | 1396/3741 [8:18:26<13:25:29, 20.61s/it] +2025-05-11 01:44:40 - ERROR - stderr - 37%|███▋ | 1397/3741 [8:18:46<13:21:29, 20.52s/it] +2025-05-11 01:44:40 - ERROR - stderr - +2025-05-11 01:44:40 - ERROR - stderr - +2025-05-11 01:44:40 - INFO - stdout - {'loss': 0.757, 'grad_norm': 0.6615963578224182, 'learning_rate': 1.4430004604225493e-05, 'epoch': 1.12} +2025-05-11 01:44:40 - ERROR - stderr - 37%|███▋ | 1397/3741 [8:18:46<13:21:29, 20.52s/it] +2025-05-11 01:45:00 - ERROR - stderr - 37%|███▋ | 1398/3741 [8:19:06<13:11:30, 20.27s/it] +2025-05-11 01:45:00 - ERROR - stderr - +2025-05-11 01:45:00 - ERROR - stderr - +2025-05-11 01:45:00 - INFO - stdout - {'loss': 0.7481, 'grad_norm': 0.6312723159790039, 'learning_rate': 1.4422239700116572e-05, 'epoch': 1.12} +2025-05-11 01:45:00 - ERROR - stderr - 37%|███▋ | 1398/3741 [8:19:06<13:11:30, 20.27s/it] +2025-05-11 01:45:23 - ERROR - stderr - 37%|███▋ | 1399/3741 [8:19:30<13:48:44, 21.23s/it] +2025-05-11 01:45:23 - ERROR - stderr - +2025-05-11 01:45:23 - ERROR - stderr - +2025-05-11 01:45:23 - INFO - stdout - {'loss': 0.766, 'grad_norm': 0.6664157509803772, 'learning_rate': 1.4414471480061716e-05, 'epoch': 1.12} +2025-05-11 01:45:23 - ERROR - stderr - 37%|███▋ | 1399/3741 [8:19:30<13:48:44, 21.23s/it] +2025-05-11 01:45:43 - ERROR - stderr - 37%|███▋ | 1400/3741 [8:19:49<13:26:43, 20.68s/it] +2025-05-11 01:45:43 - ERROR - stderr - +2025-05-11 01:45:43 - ERROR - stderr - +2025-05-11 01:45:43 - INFO - stdout - {'loss': 0.8061, 'grad_norm': 0.6936686038970947, 'learning_rate': 1.4406699949885803e-05, 'epoch': 1.12} +2025-05-11 01:45:43 - ERROR - stderr - 37%|███▋ | 1400/3741 [8:19:49<13:26:43, 20.68s/it] +2025-05-11 01:46:06 - ERROR - stderr - 37%|███▋ | 1401/3741 [8:20:12<13:55:28, 21.42s/it] +2025-05-11 01:46:06 - ERROR - stderr - +2025-05-11 01:46:06 - ERROR - stderr - +2025-05-11 01:46:06 - INFO - stdout - {'loss': 0.7682, 'grad_norm': 0.6664496660232544, 'learning_rate': 1.4398925115416196e-05, 'epoch': 1.12} +2025-05-11 01:46:06 - ERROR - stderr - 37%|███▋ | 1401/3741 [8:20:12<13:55:28, 21.42s/it] +2025-05-11 01:46:25 - ERROR - stderr - 37%|███▋ | 1402/3741 [8:20:32<13:33:13, 20.86s/it] +2025-05-11 01:46:25 - ERROR - stderr - +2025-05-11 01:46:25 - ERROR - stderr - +2025-05-11 01:46:25 - INFO - stdout - {'loss': 0.7158, 'grad_norm': 0.6195146441459656, 'learning_rate': 1.4391146982482724e-05, 'epoch': 1.12} +2025-05-11 01:46:25 - ERROR - stderr - 37%|███▋ | 1402/3741 [8:20:32<13:33:13, 20.86s/it] +2025-05-11 01:46:49 - ERROR - stderr - 38%|███▊ | 1403/3741 [8:20:56<14:09:09, 21.79s/it] +2025-05-11 01:46:49 - ERROR - stderr - +2025-05-11 01:46:49 - ERROR - stderr - +2025-05-11 01:46:49 - INFO - stdout - {'loss': 0.7568, 'grad_norm': 0.627631425857544, 'learning_rate': 1.4383365556917701e-05, 'epoch': 1.13} +2025-05-11 01:46:49 - ERROR - stderr - 38%|███▊ | 1403/3741 [8:20:56<14:09:09, 21.79s/it] +2025-05-11 01:47:09 - ERROR - stderr - 38%|███▊ | 1404/3741 [8:21:15<13:40:06, 21.06s/it] +2025-05-11 01:47:09 - ERROR - stderr - +2025-05-11 01:47:09 - ERROR - stderr - +2025-05-11 01:47:09 - INFO - stdout - {'loss': 0.7522, 'grad_norm': 0.6510641574859619, 'learning_rate': 1.4375580844555898e-05, 'epoch': 1.13} +2025-05-11 01:47:09 - ERROR - stderr - 38%|███▊ | 1404/3741 [8:21:15<13:40:06, 21.06s/it] +2025-05-11 01:47:28 - ERROR - stderr - 38%|███▊ | 1405/3741 [8:21:35<13:25:22, 20.69s/it] +2025-05-11 01:47:28 - ERROR - stderr - +2025-05-11 01:47:28 - ERROR - stderr - +2025-05-11 01:47:28 - INFO - stdout - {'loss': 0.7652, 'grad_norm': 0.6601302027702332, 'learning_rate': 1.4367792851234566e-05, 'epoch': 1.13} +2025-05-11 01:47:28 - ERROR - stderr - 38%|███▊ | 1405/3741 [8:21:35<13:25:22, 20.69s/it] +2025-05-11 01:47:48 - ERROR - stderr - 38%|███▊ | 1406/3741 [8:21:54<13:11:07, 20.33s/it] +2025-05-11 01:47:48 - ERROR - stderr - +2025-05-11 01:47:48 - ERROR - stderr - +2025-05-11 01:47:48 - INFO - stdout - {'loss': 0.7619, 'grad_norm': 0.629599928855896, 'learning_rate': 1.4360001582793404e-05, 'epoch': 1.13} +2025-05-11 01:47:48 - ERROR - stderr - 38%|███▊ | 1406/3741 [8:21:54<13:11:07, 20.33s/it] +2025-05-11 01:48:08 - ERROR - stderr - 38%|███▊ | 1407/3741 [8:22:14<13:02:00, 20.10s/it] +2025-05-11 01:48:08 - ERROR - stderr - +2025-05-11 01:48:08 - ERROR - stderr - +2025-05-11 01:48:08 - INFO - stdout - {'loss': 0.7956, 'grad_norm': 0.7037693858146667, 'learning_rate': 1.4352207045074567e-05, 'epoch': 1.13} +2025-05-11 01:48:08 - ERROR - stderr - 38%|███▊ | 1407/3741 [8:22:14<13:02:00, 20.10s/it] +2025-05-11 01:48:30 - ERROR - stderr - 38%|███▊ | 1408/3741 [8:22:37<13:31:37, 20.87s/it] +2025-05-11 01:48:30 - ERROR - stderr - +2025-05-11 01:48:30 - ERROR - stderr - +2025-05-11 01:48:30 - INFO - stdout - {'loss': 0.7827, 'grad_norm': 0.6922396421432495, 'learning_rate': 1.4344409243922667e-05, 'epoch': 1.13} +2025-05-11 01:48:30 - ERROR - stderr - 38%|███▊ | 1408/3741 [8:22:37<13:31:37, 20.87s/it] +2025-05-11 01:48:50 - ERROR - stderr - 38%|███▊ | 1409/3741 [8:22:56<13:13:27, 20.41s/it] +2025-05-11 01:48:50 - ERROR - stderr - +2025-05-11 01:48:50 - ERROR - stderr - +2025-05-11 01:48:50 - INFO - stdout - {'loss': 0.7751, 'grad_norm': 0.6473610997200012, 'learning_rate': 1.4336608185184765e-05, 'epoch': 1.13} +2025-05-11 01:48:50 - ERROR - stderr - 38%|███▊ | 1409/3741 [8:22:56<13:13:27, 20.41s/it] +2025-05-11 01:49:14 - ERROR - stderr - 38%|███▊ | 1410/3741 [8:23:20<13:59:35, 21.61s/it] +2025-05-11 01:49:14 - ERROR - stderr - +2025-05-11 01:49:14 - ERROR - stderr - +2025-05-11 01:49:14 - INFO - stdout - {'loss': 0.7786, 'grad_norm': 0.7747618556022644, 'learning_rate': 1.4328803874710358e-05, 'epoch': 1.13} +2025-05-11 01:49:14 - ERROR - stderr - 38%|███▊ | 1410/3741 [8:23:20<13:59:35, 21.61s/it] +2025-05-11 01:49:34 - ERROR - stderr - 38%|███▊ | 1411/3741 [8:23:40<13:36:47, 21.03s/it] +2025-05-11 01:49:34 - ERROR - stderr - +2025-05-11 01:49:34 - ERROR - stderr - +2025-05-11 01:49:34 - INFO - stdout - {'loss': 0.7315, 'grad_norm': 0.6290801763534546, 'learning_rate': 1.4320996318351378e-05, 'epoch': 1.13} +2025-05-11 01:49:34 - ERROR - stderr - 38%|███▊ | 1411/3741 [8:23:40<13:36:47, 21.03s/it] +2025-05-11 01:49:58 - ERROR - stderr - 38%|███▊ | 1412/3741 [8:24:05<14:18:03, 22.11s/it] +2025-05-11 01:49:58 - ERROR - stderr - +2025-05-11 01:49:58 - ERROR - stderr - +2025-05-11 01:49:58 - INFO - stdout - {'loss': 0.796, 'grad_norm': 0.6735879778862, 'learning_rate': 1.4313185521962205e-05, 'epoch': 1.13} +2025-05-11 01:49:58 - ERROR - stderr - 38%|███▊ | 1412/3741 [8:24:05<14:18:03, 22.11s/it] +2025-05-11 01:50:18 - ERROR - stderr - 38%|███▊ | 1413/3741 [8:24:24<13:47:19, 21.32s/it] +2025-05-11 01:50:18 - ERROR - stderr - +2025-05-11 01:50:18 - ERROR - stderr - +2025-05-11 01:50:18 - INFO - stdout - {'loss': 0.7771, 'grad_norm': 0.6589605212211609, 'learning_rate': 1.4305371491399638e-05, 'epoch': 1.13} +2025-05-11 01:50:18 - ERROR - stderr - 38%|███▊ | 1413/3741 [8:24:24<13:47:19, 21.32s/it] +2025-05-11 01:50:37 - ERROR - stderr - 38%|███▊ | 1414/3741 [8:24:44<13:26:40, 20.80s/it] +2025-05-11 01:50:37 - ERROR - stderr - +2025-05-11 01:50:37 - ERROR - stderr - +2025-05-11 01:50:37 - INFO - stdout - {'loss': 0.7968, 'grad_norm': 0.6696829199790955, 'learning_rate': 1.4297554232522898e-05, 'epoch': 1.13} +2025-05-11 01:50:37 - ERROR - stderr - 38%|███▊ | 1414/3741 [8:24:44<13:26:40, 20.80s/it] +2025-05-11 01:50:57 - ERROR - stderr - 38%|███▊ | 1415/3741 [8:25:03<13:09:58, 20.38s/it] +2025-05-11 01:50:57 - ERROR - stderr - +2025-05-11 01:50:57 - ERROR - stderr - +2025-05-11 01:50:57 - INFO - stdout - {'loss': 0.7734, 'grad_norm': 0.6309067010879517, 'learning_rate': 1.4289733751193643e-05, 'epoch': 1.13} +2025-05-11 01:50:57 - ERROR - stderr - 38%|███▊ | 1415/3741 [8:25:03<13:09:58, 20.38s/it] +2025-05-11 01:51:17 - ERROR - stderr - 38%|███▊ | 1416/3741 [8:25:23<13:03:58, 20.23s/it] +2025-05-11 01:51:17 - ERROR - stderr - +2025-05-11 01:51:17 - ERROR - stderr - +2025-05-11 01:51:17 - INFO - stdout - {'loss': 0.7691, 'grad_norm': 0.6822018623352051, 'learning_rate': 1.4281910053275923e-05, 'epoch': 1.14} +2025-05-11 01:51:17 - ERROR - stderr - 38%|███▊ | 1416/3741 [8:25:23<13:03:58, 20.23s/it] +2025-05-11 01:51:39 - ERROR - stderr - 38%|███▊ | 1417/3741 [8:25:45<13:24:37, 20.77s/it] +2025-05-11 01:51:39 - ERROR - stderr - +2025-05-11 01:51:39 - ERROR - stderr - +2025-05-11 01:51:39 - INFO - stdout - {'loss': 0.7564, 'grad_norm': 0.6693670153617859, 'learning_rate': 1.427408314463622e-05, 'epoch': 1.14} +2025-05-11 01:51:39 - ERROR - stderr - 38%|███▊ | 1417/3741 [8:25:45<13:24:37, 20.77s/it] +2025-05-11 01:51:58 - ERROR - stderr - 38%|███▊ | 1418/3741 [8:26:05<13:10:40, 20.42s/it] +2025-05-11 01:51:58 - ERROR - stderr - +2025-05-11 01:51:58 - ERROR - stderr - +2025-05-11 01:51:58 - INFO - stdout - {'loss': 0.7953, 'grad_norm': 0.6806270480155945, 'learning_rate': 1.4266253031143418e-05, 'epoch': 1.14} +2025-05-11 01:51:58 - ERROR - stderr - 38%|███▊ | 1418/3741 [8:26:05<13:10:40, 20.42s/it] +2025-05-11 01:52:22 - ERROR - stderr - 38%|███▊ | 1419/3741 [8:26:28<13:49:59, 21.45s/it] +2025-05-11 01:52:22 - ERROR - stderr - +2025-05-11 01:52:22 - ERROR - stderr - +2025-05-11 01:52:22 - INFO - stdout - {'loss': 0.7782, 'grad_norm': 0.7900277376174927, 'learning_rate': 1.4258419718668801e-05, 'epoch': 1.14} +2025-05-11 01:52:22 - ERROR - stderr - 38%|███▊ | 1419/3741 [8:26:28<13:49:59, 21.45s/it] +2025-05-11 01:52:42 - ERROR - stderr - 38%|███▊ | 1420/3741 [8:26:48<13:28:18, 20.90s/it] +2025-05-11 01:52:42 - ERROR - stderr - +2025-05-11 01:52:42 - ERROR - stderr - +2025-05-11 01:52:42 - INFO - stdout - {'loss': 0.7406, 'grad_norm': 0.6651455760002136, 'learning_rate': 1.4250583213086051e-05, 'epoch': 1.14} +2025-05-11 01:52:42 - ERROR - stderr - 38%|███▊ | 1420/3741 [8:26:48<13:28:18, 20.90s/it] +2025-05-11 01:53:04 - ERROR - stderr - 38%|███▊ | 1421/3741 [8:27:10<13:45:54, 21.36s/it] +2025-05-11 01:53:04 - ERROR - stderr - +2025-05-11 01:53:04 - ERROR - stderr - +2025-05-11 01:53:04 - INFO - stdout - {'loss': 0.7845, 'grad_norm': 0.6853930950164795, 'learning_rate': 1.4242743520271249e-05, 'epoch': 1.14} +2025-05-11 01:53:04 - ERROR - stderr - 38%|███▊ | 1421/3741 [8:27:10<13:45:54, 21.36s/it] +2025-05-11 01:53:24 - ERROR - stderr - 38%|███▊ | 1422/3741 [8:27:30<13:25:58, 20.85s/it] +2025-05-11 01:53:24 - ERROR - stderr - +2025-05-11 01:53:24 - ERROR - stderr - +2025-05-11 01:53:24 - INFO - stdout - {'loss': 0.7476, 'grad_norm': 0.6740282773971558, 'learning_rate': 1.4234900646102864e-05, 'epoch': 1.14} +2025-05-11 01:53:24 - ERROR - stderr - 38%|███▊ | 1422/3741 [8:27:30<13:25:58, 20.85s/it] +2025-05-11 01:53:43 - ERROR - stderr - 38%|███▊ | 1423/3741 [8:27:50<13:11:39, 20.49s/it] +2025-05-11 01:53:43 - ERROR - stderr - +2025-05-11 01:53:43 - ERROR - stderr - +2025-05-11 01:53:43 - INFO - stdout - {'loss': 0.7855, 'grad_norm': 0.6734980344772339, 'learning_rate': 1.4227054596461754e-05, 'epoch': 1.14} +2025-05-11 01:53:43 - ERROR - stderr - 38%|███▊ | 1423/3741 [8:27:50<13:11:39, 20.49s/it] +2025-05-11 01:54:04 - ERROR - stderr - 38%|███▊ | 1424/3741 [8:28:10<13:12:29, 20.52s/it] +2025-05-11 01:54:04 - ERROR - stderr - +2025-05-11 01:54:04 - ERROR - stderr - +2025-05-11 01:54:04 - INFO - stdout - {'loss': 0.7757, 'grad_norm': 0.6694862842559814, 'learning_rate': 1.4219205377231147e-05, 'epoch': 1.14} +2025-05-11 01:54:04 - ERROR - stderr - 38%|███▊ | 1424/3741 [8:28:10<13:12:29, 20.52s/it] +2025-05-11 01:54:24 - ERROR - stderr - 38%|███▊ | 1425/3741 [8:28:30<13:06:20, 20.37s/it] +2025-05-11 01:54:24 - ERROR - stderr - +2025-05-11 01:54:24 - ERROR - stderr - +2025-05-11 01:54:24 - INFO - stdout - {'loss': 0.7891, 'grad_norm': 0.68555748462677, 'learning_rate': 1.4211352994296655e-05, 'epoch': 1.14} +2025-05-11 01:54:24 - ERROR - stderr - 38%|███▊ | 1425/3741 [8:28:30<13:06:20, 20.37s/it] +2025-05-11 01:54:48 - ERROR - stderr - 38%|███▊ | 1426/3741 [8:28:54<13:48:29, 21.47s/it] +2025-05-11 01:54:48 - ERROR - stderr - +2025-05-11 01:54:48 - ERROR - stderr - +2025-05-11 01:54:48 - INFO - stdout - {'loss': 0.766, 'grad_norm': 0.6966123580932617, 'learning_rate': 1.4203497453546267e-05, 'epoch': 1.14} +2025-05-11 01:54:48 - ERROR - stderr - 38%|███▊ | 1426/3741 [8:28:54<13:48:29, 21.47s/it] +2025-05-11 01:55:08 - ERROR - stderr - 38%|███▊ | 1427/3741 [8:29:14<13:31:07, 21.03s/it] +2025-05-11 01:55:08 - ERROR - stderr - +2025-05-11 01:55:08 - ERROR - stderr - +2025-05-11 01:55:08 - INFO - stdout - {'loss': 0.7537, 'grad_norm': 0.6659271121025085, 'learning_rate': 1.4195638760870334e-05, 'epoch': 1.14} +2025-05-11 01:55:08 - ERROR - stderr - 38%|███▊ | 1427/3741 [8:29:14<13:31:07, 21.03s/it] +2025-05-11 01:55:32 - ERROR - stderr - 38%|███▊ | 1428/3741 [8:29:38<13:59:28, 21.78s/it] +2025-05-11 01:55:32 - ERROR - stderr - +2025-05-11 01:55:32 - ERROR - stderr - +2025-05-11 01:55:32 - INFO - stdout - {'loss': 0.7547, 'grad_norm': 0.6558569073677063, 'learning_rate': 1.418777692216157e-05, 'epoch': 1.15} +2025-05-11 01:55:32 - ERROR - stderr - 38%|███▊ | 1428/3741 [8:29:38<13:59:28, 21.78s/it] +2025-05-11 01:55:51 - ERROR - stderr - 38%|███▊ | 1429/3741 [8:29:58<13:35:06, 21.15s/it] +2025-05-11 01:55:51 - ERROR - stderr - +2025-05-11 01:55:51 - ERROR - stderr - +2025-05-11 01:55:51 - INFO - stdout - {'loss': 0.7408, 'grad_norm': 0.6753950119018555, 'learning_rate': 1.417991194331505e-05, 'epoch': 1.15} +2025-05-11 01:55:51 - ERROR - stderr - 38%|███▊ | 1429/3741 [8:29:58<13:35:06, 21.15s/it] +2025-05-11 01:56:14 - ERROR - stderr - 38%|███▊ | 1430/3741 [8:30:20<13:46:57, 21.47s/it] +2025-05-11 01:56:14 - ERROR - stderr - +2025-05-11 01:56:14 - ERROR - stderr - +2025-05-11 01:56:14 - INFO - stdout - {'loss': 0.7769, 'grad_norm': 0.720521092414856, 'learning_rate': 1.4172043830228202e-05, 'epoch': 1.15} +2025-05-11 01:56:14 - ERROR - stderr - 38%|███▊ | 1430/3741 [8:30:20<13:46:57, 21.47s/it] +2025-05-11 01:56:33 - ERROR - stderr - 38%|███▊ | 1431/3741 [8:30:39<13:23:58, 20.88s/it] +2025-05-11 01:56:33 - ERROR - stderr - +2025-05-11 01:56:33 - ERROR - stderr - +2025-05-11 01:56:33 - INFO - stdout - {'loss': 0.7925, 'grad_norm': 0.685820996761322, 'learning_rate': 1.4164172588800809e-05, 'epoch': 1.15} +2025-05-11 01:56:33 - ERROR - stderr - 38%|███▊ | 1431/3741 [8:30:39<13:23:58, 20.88s/it] +2025-05-11 01:56:52 - ERROR - stderr - 38%|███▊ | 1432/3741 [8:30:59<13:06:10, 20.43s/it] +2025-05-11 01:56:52 - ERROR - stderr - +2025-05-11 01:56:52 - ERROR - stderr - +2025-05-11 01:56:52 - INFO - stdout - {'loss': 0.7714, 'grad_norm': 0.6265881061553955, 'learning_rate': 1.415629822493499e-05, 'epoch': 1.15} +2025-05-11 01:56:52 - ERROR - stderr - 38%|███▊ | 1432/3741 [8:30:59<13:06:10, 20.43s/it] +2025-05-11 01:57:15 - ERROR - stderr - 38%|███▊ | 1433/3741 [8:31:21<13:28:31, 21.02s/it] +2025-05-11 01:57:15 - ERROR - stderr - +2025-05-11 01:57:15 - ERROR - stderr - +2025-05-11 01:57:15 - INFO - stdout - {'loss': 0.7476, 'grad_norm': 0.6419846415519714, 'learning_rate': 1.4148420744535214e-05, 'epoch': 1.15} +2025-05-11 01:57:15 - ERROR - stderr - 38%|███▊ | 1433/3741 [8:31:21<13:28:31, 21.02s/it] +2025-05-11 01:57:34 - ERROR - stderr - 38%|███▊ | 1434/3741 [8:31:41<13:09:39, 20.54s/it] +2025-05-11 01:57:34 - ERROR - stderr - +2025-05-11 01:57:34 - ERROR - stderr - +2025-05-11 01:57:34 - INFO - stdout - {'loss': 0.7862, 'grad_norm': 0.6394557952880859, 'learning_rate': 1.4140540153508285e-05, 'epoch': 1.15} +2025-05-11 01:57:34 - ERROR - stderr - 38%|███▊ | 1434/3741 [8:31:41<13:09:39, 20.54s/it] +2025-05-11 01:57:59 - ERROR - stderr - 38%|███▊ | 1435/3741 [8:32:05<13:54:50, 21.72s/it] +2025-05-11 01:57:59 - ERROR - stderr - +2025-05-11 01:57:59 - ERROR - stderr - +2025-05-11 01:57:59 - INFO - stdout - {'loss': 0.8058, 'grad_norm': 0.6818154454231262, 'learning_rate': 1.4132656457763338e-05, 'epoch': 1.15} +2025-05-11 01:57:59 - ERROR - stderr - 38%|███▊ | 1435/3741 [8:32:05<13:54:50, 21.72s/it] +2025-05-11 01:58:18 - ERROR - stderr - 38%|███▊ | 1436/3741 [8:32:25<13:28:58, 21.06s/it] +2025-05-11 01:58:18 - ERROR - stderr - +2025-05-11 01:58:18 - ERROR - stderr - +2025-05-11 01:58:18 - INFO - stdout - {'loss': 0.75, 'grad_norm': 0.6973996758460999, 'learning_rate': 1.4124769663211837e-05, 'epoch': 1.15} +2025-05-11 01:58:18 - ERROR - stderr - 38%|███▊ | 1436/3741 [8:32:25<13:28:58, 21.06s/it] +2025-05-11 01:58:42 - ERROR - stderr - 38%|███▊ | 1437/3741 [8:32:48<13:57:00, 21.80s/it] +2025-05-11 01:58:42 - ERROR - stderr - +2025-05-11 01:58:42 - ERROR - stderr - +2025-05-11 01:58:42 - INFO - stdout - {'loss': 0.7878, 'grad_norm': 0.6343415379524231, 'learning_rate': 1.4116879775767567e-05, 'epoch': 1.15} +2025-05-11 01:58:42 - ERROR - stderr - 38%|███▊ | 1437/3741 [8:32:48<13:57:00, 21.80s/it] +2025-05-11 01:59:01 - ERROR - stderr - 38%|███▊ | 1438/3741 [8:33:08<13:30:45, 21.12s/it] +2025-05-11 01:59:01 - ERROR - stderr - +2025-05-11 01:59:01 - ERROR - stderr - +2025-05-11 01:59:01 - INFO - stdout - {'loss': 0.7894, 'grad_norm': 0.6887206435203552, 'learning_rate': 1.4108986801346633e-05, 'epoch': 1.15} +2025-05-11 01:59:01 - ERROR - stderr - 38%|███▊ | 1438/3741 [8:33:08<13:30:45, 21.12s/it] +2025-05-11 01:59:21 - ERROR - stderr - 38%|███▊ | 1439/3741 [8:33:27<13:15:47, 20.74s/it] +2025-05-11 01:59:21 - ERROR - stderr - +2025-05-11 01:59:21 - ERROR - stderr - +2025-05-11 01:59:21 - INFO - stdout - {'loss': 0.7608, 'grad_norm': 0.6938029527664185, 'learning_rate': 1.4101090745867464e-05, 'epoch': 1.15} +2025-05-11 01:59:21 - ERROR - stderr - 38%|███▊ | 1439/3741 [8:33:27<13:15:47, 20.74s/it] +2025-05-11 01:59:41 - ERROR - stderr - 38%|███▊ | 1440/3741 [8:33:48<13:07:42, 20.54s/it] +2025-05-11 01:59:41 - ERROR - stderr - +2025-05-11 01:59:41 - ERROR - stderr - +2025-05-11 01:59:41 - INFO - stdout - {'loss': 0.7765, 'grad_norm': 0.6808055639266968, 'learning_rate': 1.4093191615250785e-05, 'epoch': 1.15} +2025-05-11 01:59:41 - ERROR - stderr - 38%|███▊ | 1440/3741 [8:33:48<13:07:42, 20.54s/it] +2025-05-11 02:00:01 - ERROR - stderr - 39%|███▊ | 1441/3741 [8:34:08<13:01:59, 20.40s/it] +2025-05-11 02:00:01 - ERROR - stderr - +2025-05-11 02:00:01 - ERROR - stderr - +2025-05-11 02:00:01 - INFO - stdout - {'loss': 0.7583, 'grad_norm': 0.6570947766304016, 'learning_rate': 1.4085289415419632e-05, 'epoch': 1.16} +2025-05-11 02:00:01 - ERROR - stderr - 39%|███▊ | 1441/3741 [8:34:08<13:01:59, 20.40s/it] +2025-05-11 02:00:24 - ERROR - stderr - 39%|███▊ | 1442/3741 [8:34:31<13:33:03, 21.22s/it] +2025-05-11 02:00:24 - ERROR - stderr - +2025-05-11 02:00:24 - ERROR - stderr - +2025-05-11 02:00:24 - INFO - stdout - {'loss': 0.7418, 'grad_norm': 0.6893501877784729, 'learning_rate': 1.4077384152299348e-05, 'epoch': 1.16} +2025-05-11 02:00:24 - ERROR - stderr - 39%|███▊ | 1442/3741 [8:34:31<13:33:03, 21.22s/it] +2025-05-11 02:00:45 - ERROR - stderr - 39%|███▊ | 1443/3741 [8:34:51<13:24:00, 20.99s/it] +2025-05-11 02:00:45 - ERROR - stderr - +2025-05-11 02:00:45 - ERROR - stderr - +2025-05-11 02:00:45 - INFO - stdout - {'loss': 0.7751, 'grad_norm': 0.689246654510498, 'learning_rate': 1.4069475831817564e-05, 'epoch': 1.16} +2025-05-11 02:00:45 - ERROR - stderr - 39%|███▊ | 1443/3741 [8:34:51<13:24:00, 20.99s/it] +2025-05-11 02:01:08 - ERROR - stderr - 39%|███▊ | 1444/3741 [8:35:15<13:50:21, 21.69s/it] +2025-05-11 02:01:08 - ERROR - stderr - +2025-05-11 02:01:08 - ERROR - stderr - +2025-05-11 02:01:08 - INFO - stdout - {'loss': 0.7582, 'grad_norm': 0.6660308241844177, 'learning_rate': 1.4061564459904214e-05, 'epoch': 1.16} +2025-05-11 02:01:08 - ERROR - stderr - 39%|███▊ | 1444/3741 [8:35:15<13:50:21, 21.69s/it] +2025-05-11 02:01:28 - ERROR - stderr - 39%|███▊ | 1445/3741 [8:35:34<13:24:07, 21.01s/it] +2025-05-11 02:01:28 - ERROR - stderr - +2025-05-11 02:01:28 - ERROR - stderr - +2025-05-11 02:01:28 - INFO - stdout - {'loss': 0.7172, 'grad_norm': 0.6339597702026367, 'learning_rate': 1.4053650042491507e-05, 'epoch': 1.16} +2025-05-11 02:01:28 - ERROR - stderr - 39%|███▊ | 1445/3741 [8:35:34<13:24:07, 21.01s/it] +2025-05-11 02:01:47 - ERROR - stderr - 39%|███▊ | 1446/3741 [8:35:54<13:09:20, 20.64s/it] +2025-05-11 02:01:47 - ERROR - stderr - +2025-05-11 02:01:47 - ERROR - stderr - +2025-05-11 02:01:47 - INFO - stdout - {'loss': 0.7813, 'grad_norm': 0.6734780073165894, 'learning_rate': 1.4045732585513945e-05, 'epoch': 1.16} +2025-05-11 02:01:47 - ERROR - stderr - 39%|███▊ | 1446/3741 [8:35:54<13:09:20, 20.64s/it] +2025-05-11 02:02:08 - ERROR - stderr - 39%|███▊ | 1447/3741 [8:36:14<13:08:22, 20.62s/it] +2025-05-11 02:02:08 - ERROR - stderr - +2025-05-11 02:02:08 - ERROR - stderr - +2025-05-11 02:02:08 - INFO - stdout - {'loss': 0.755, 'grad_norm': 0.6459300518035889, 'learning_rate': 1.403781209490831e-05, 'epoch': 1.16} +2025-05-11 02:02:08 - ERROR - stderr - 39%|███▊ | 1447/3741 [8:36:14<13:08:22, 20.62s/it] +2025-05-11 02:02:27 - ERROR - stderr - 39%|███▊ | 1448/3741 [8:36:34<12:53:07, 20.23s/it] +2025-05-11 02:02:27 - ERROR - stderr - +2025-05-11 02:02:27 - ERROR - stderr - +2025-05-11 02:02:27 - INFO - stdout - {'loss': 0.7667, 'grad_norm': 0.6735323667526245, 'learning_rate': 1.4029888576613654e-05, 'epoch': 1.16} +2025-05-11 02:02:27 - ERROR - stderr - 39%|███▊ | 1448/3741 [8:36:34<12:53:07, 20.23s/it] +2025-05-11 02:02:52 - ERROR - stderr - 39%|███▊ | 1449/3741 [8:36:58<13:39:34, 21.45s/it] +2025-05-11 02:02:52 - ERROR - stderr - +2025-05-11 02:02:52 - ERROR - stderr - +2025-05-11 02:02:52 - INFO - stdout - {'loss': 0.7973, 'grad_norm': 0.7017342448234558, 'learning_rate': 1.4021962036571301e-05, 'epoch': 1.16} +2025-05-11 02:02:52 - ERROR - stderr - 39%|███▊ | 1449/3741 [8:36:58<13:39:34, 21.45s/it] +2025-05-11 02:03:11 - ERROR - stderr - 39%|███▉ | 1450/3741 [8:37:17<13:14:51, 20.82s/it] +2025-05-11 02:03:11 - ERROR - stderr - +2025-05-11 02:03:11 - ERROR - stderr - +2025-05-11 02:03:11 - INFO - stdout - {'loss': 0.7825, 'grad_norm': 0.6533816456794739, 'learning_rate': 1.4014032480724838e-05, 'epoch': 1.16} +2025-05-11 02:03:11 - ERROR - stderr - 39%|███▉ | 1450/3741 [8:37:17<13:14:51, 20.82s/it] +2025-05-11 02:03:35 - ERROR - stderr - 39%|███▉ | 1451/3741 [8:37:41<13:47:47, 21.69s/it] +2025-05-11 02:03:35 - ERROR - stderr - +2025-05-11 02:03:35 - ERROR - stderr - +2025-05-11 02:03:35 - INFO - stdout - {'loss': 0.7378, 'grad_norm': 0.674637496471405, 'learning_rate': 1.400609991502012e-05, 'epoch': 1.16} +2025-05-11 02:03:35 - ERROR - stderr - 39%|███▉ | 1451/3741 [8:37:41<13:47:47, 21.69s/it] +2025-05-11 02:03:54 - ERROR - stderr - 39%|███▉ | 1452/3741 [8:38:00<13:22:22, 21.03s/it] +2025-05-11 02:03:54 - ERROR - stderr - +2025-05-11 02:03:54 - ERROR - stderr - +2025-05-11 02:03:54 - INFO - stdout - {'loss': 0.7617, 'grad_norm': 0.6480211019515991, 'learning_rate': 1.3998164345405253e-05, 'epoch': 1.16} +2025-05-11 02:03:54 - ERROR - stderr - 39%|███▉ | 1452/3741 [8:38:00<13:22:22, 21.03s/it] +2025-05-11 02:04:14 - ERROR - stderr - 39%|███▉ | 1453/3741 [8:38:20<13:06:41, 20.63s/it] +2025-05-11 02:04:14 - ERROR - stderr - +2025-05-11 02:04:14 - ERROR - stderr - +2025-05-11 02:04:14 - INFO - stdout - {'loss': 0.788, 'grad_norm': 0.6655580997467041, 'learning_rate': 1.3990225777830595e-05, 'epoch': 1.17} +2025-05-11 02:04:14 - ERROR - stderr - 39%|███▉ | 1453/3741 [8:38:20<13:06:41, 20.63s/it] +2025-05-11 02:04:36 - ERROR - stderr - 39%|███▉ | 1454/3741 [8:38:42<13:18:39, 20.95s/it] +2025-05-11 02:04:36 - ERROR - stderr - +2025-05-11 02:04:36 - ERROR - stderr - +2025-05-11 02:04:36 - INFO - stdout - {'loss': 0.7567, 'grad_norm': 0.6576639413833618, 'learning_rate': 1.3982284218248758e-05, 'epoch': 1.17} +2025-05-11 02:04:36 - ERROR - stderr - 39%|███▉ | 1454/3741 [8:38:42<13:18:39, 20.95s/it] +2025-05-11 02:04:55 - ERROR - stderr - 39%|███▉ | 1455/3741 [8:39:01<13:02:49, 20.55s/it] +2025-05-11 02:04:55 - ERROR - stderr - +2025-05-11 02:04:55 - ERROR - stderr - +2025-05-11 02:04:55 - INFO - stdout - {'loss': 0.7859, 'grad_norm': 0.6448426246643066, 'learning_rate': 1.3974339672614594e-05, 'epoch': 1.17} +2025-05-11 02:04:55 - ERROR - stderr - 39%|███▉ | 1455/3741 [8:39:01<13:02:49, 20.55s/it] +2025-05-11 02:05:19 - ERROR - stderr - 39%|███▉ | 1456/3741 [8:39:26<13:42:50, 21.61s/it] +2025-05-11 02:05:19 - ERROR - stderr - +2025-05-11 02:05:19 - ERROR - stderr - +2025-05-11 02:05:19 - INFO - stdout - {'loss': 0.7589, 'grad_norm': 0.6625378131866455, 'learning_rate': 1.396639214688519e-05, 'epoch': 1.17} +2025-05-11 02:05:19 - ERROR - stderr - 39%|███▉ | 1456/3741 [8:39:26<13:42:50, 21.61s/it] +2025-05-11 02:05:39 - ERROR - stderr - 39%|███▉ | 1457/3741 [8:39:45<13:19:56, 21.01s/it] +2025-05-11 02:05:39 - ERROR - stderr - +2025-05-11 02:05:39 - ERROR - stderr - +2025-05-11 02:05:39 - INFO - stdout - {'loss': 0.7464, 'grad_norm': 0.679095983505249, 'learning_rate': 1.3958441647019877e-05, 'epoch': 1.17} +2025-05-11 02:05:39 - ERROR - stderr - 39%|███▉ | 1457/3741 [8:39:45<13:19:56, 21.01s/it] +2025-05-11 02:06:02 - ERROR - stderr - 39%|███▉ | 1458/3741 [8:40:09<13:49:11, 21.79s/it] +2025-05-11 02:06:02 - ERROR - stderr - +2025-05-11 02:06:02 - ERROR - stderr - +2025-05-11 02:06:02 - INFO - stdout - {'loss': 0.7687, 'grad_norm': 0.6716442704200745, 'learning_rate': 1.3950488178980203e-05, 'epoch': 1.17} +2025-05-11 02:06:02 - ERROR - stderr - 39%|███▉ | 1458/3741 [8:40:09<13:49:11, 21.79s/it] +2025-05-11 02:06:23 - ERROR - stderr - 39%|███▉ | 1459/3741 [8:40:29<13:28:41, 21.26s/it] +2025-05-11 02:06:23 - ERROR - stderr - +2025-05-11 02:06:23 - ERROR - stderr - +2025-05-11 02:06:23 - INFO - stdout - {'loss': 0.7424, 'grad_norm': 0.6463879346847534, 'learning_rate': 1.394253174872996e-05, 'epoch': 1.17} +2025-05-11 02:06:23 - ERROR - stderr - 39%|███▉ | 1459/3741 [8:40:29<13:28:41, 21.26s/it] +2025-05-11 02:06:42 - ERROR - stderr - 39%|███▉ | 1460/3741 [8:40:48<13:07:48, 20.72s/it] +2025-05-11 02:06:42 - ERROR - stderr - +2025-05-11 02:06:42 - ERROR - stderr - +2025-05-11 02:06:42 - INFO - stdout - {'loss': 0.7628, 'grad_norm': 0.6612167358398438, 'learning_rate': 1.393457236223514e-05, 'epoch': 1.17} +2025-05-11 02:06:42 - ERROR - stderr - 39%|███▉ | 1460/3741 [8:40:48<13:07:48, 20.72s/it] +2025-05-11 02:07:03 - ERROR - stderr - 39%|███▉ | 1461/3741 [8:41:09<13:06:33, 20.70s/it] +2025-05-11 02:07:03 - ERROR - stderr - +2025-05-11 02:07:03 - ERROR - stderr - +2025-05-11 02:07:03 - INFO - stdout - {'loss': 0.7804, 'grad_norm': 0.7058016657829285, 'learning_rate': 1.3926610025463967e-05, 'epoch': 1.17} +2025-05-11 02:07:03 - ERROR - stderr - 39%|███▉ | 1461/3741 [8:41:09<13:06:33, 20.70s/it] +2025-05-11 02:07:22 - ERROR - stderr - 39%|███▉ | 1462/3741 [8:41:28<12:52:17, 20.33s/it] +2025-05-11 02:07:22 - ERROR - stderr - +2025-05-11 02:07:22 - ERROR - stderr - +2025-05-11 02:07:22 - INFO - stdout - {'loss': 0.7949, 'grad_norm': 0.697573721408844, 'learning_rate': 1.3918644744386868e-05, 'epoch': 1.17} +2025-05-11 02:07:22 - ERROR - stderr - 39%|███▉ | 1462/3741 [8:41:28<12:52:17, 20.33s/it] +2025-05-11 02:07:46 - ERROR - stderr - 39%|███▉ | 1463/3741 [8:41:52<13:32:20, 21.40s/it] +2025-05-11 02:07:46 - ERROR - stderr - +2025-05-11 02:07:46 - ERROR - stderr - +2025-05-11 02:07:46 - INFO - stdout - {'loss': 0.7611, 'grad_norm': 0.6801334023475647, 'learning_rate': 1.3910676524976489e-05, 'epoch': 1.17} +2025-05-11 02:07:46 - ERROR - stderr - 39%|███▉ | 1463/3741 [8:41:52<13:32:20, 21.40s/it] +2025-05-11 02:08:06 - ERROR - stderr - 39%|███▉ | 1464/3741 [8:42:12<13:11:50, 20.87s/it] +2025-05-11 02:08:06 - ERROR - stderr - +2025-05-11 02:08:06 - ERROR - stderr - +2025-05-11 02:08:06 - INFO - stdout - {'loss': 0.7559, 'grad_norm': 0.6564053893089294, 'learning_rate': 1.3902705373207669e-05, 'epoch': 1.17} +2025-05-11 02:08:06 - ERROR - stderr - 39%|███▉ | 1464/3741 [8:42:12<13:11:50, 20.87s/it] +2025-05-11 02:08:30 - ERROR - stderr - 39%|███▉ | 1465/3741 [8:42:36<13:50:10, 21.89s/it] +2025-05-11 02:08:30 - ERROR - stderr - +2025-05-11 02:08:30 - ERROR - stderr - +2025-05-11 02:08:30 - INFO - stdout - {'loss': 0.7549, 'grad_norm': 0.6379392147064209, 'learning_rate': 1.3894731295057446e-05, 'epoch': 1.17} +2025-05-11 02:08:30 - ERROR - stderr - 39%|███▉ | 1465/3741 [8:42:36<13:50:10, 21.89s/it] +2025-05-11 02:08:49 - ERROR - stderr - 39%|███▉ | 1466/3741 [8:42:56<13:22:19, 21.16s/it] +2025-05-11 02:08:49 - ERROR - stderr - +2025-05-11 02:08:49 - ERROR - stderr - +2025-05-11 02:08:49 - INFO - stdout - {'loss': 0.7533, 'grad_norm': 0.6612043380737305, 'learning_rate': 1.388675429650506e-05, 'epoch': 1.18} +2025-05-11 02:08:49 - ERROR - stderr - 39%|███▉ | 1466/3741 [8:42:56<13:22:19, 21.16s/it] +2025-05-11 02:09:09 - ERROR - stderr - 39%|███▉ | 1467/3741 [8:43:15<13:02:12, 20.64s/it] +2025-05-11 02:09:09 - ERROR - stderr - +2025-05-11 02:09:09 - ERROR - stderr - +2025-05-11 02:09:09 - INFO - stdout - {'loss': 0.7963, 'grad_norm': 0.6841898560523987, 'learning_rate': 1.3878774383531935e-05, 'epoch': 1.18} +2025-05-11 02:09:09 - ERROR - stderr - 39%|███▉ | 1467/3741 [8:43:15<13:02:12, 20.64s/it] +2025-05-11 02:09:30 - ERROR - stderr - 39%|███▉ | 1468/3741 [8:43:37<13:13:07, 20.94s/it] +2025-05-11 02:09:30 - ERROR - stderr - +2025-05-11 02:09:30 - ERROR - stderr - +2025-05-11 02:09:30 - INFO - stdout - {'loss': 0.7894, 'grad_norm': 0.6732792854309082, 'learning_rate': 1.3870791562121679e-05, 'epoch': 1.18} +2025-05-11 02:09:30 - ERROR - stderr - 39%|███▉ | 1468/3741 [8:43:37<13:13:07, 20.94s/it] +2025-05-11 02:09:50 - ERROR - stderr - 39%|███▉ | 1469/3741 [8:43:56<12:53:46, 20.43s/it] +2025-05-11 02:09:50 - ERROR - stderr - +2025-05-11 02:09:50 - ERROR - stderr - +2025-05-11 02:09:50 - INFO - stdout - {'loss': 0.7317, 'grad_norm': 0.6291318535804749, 'learning_rate': 1.3862805838260087e-05, 'epoch': 1.18} +2025-05-11 02:09:50 - ERROR - stderr - 39%|███▉ | 1469/3741 [8:43:56<12:53:46, 20.43s/it] +2025-05-11 02:10:14 - ERROR - stderr - 39%|███▉ | 1470/3741 [8:44:20<13:34:08, 21.51s/it] +2025-05-11 02:10:14 - ERROR - stderr - +2025-05-11 02:10:14 - ERROR - stderr - +2025-05-11 02:10:14 - INFO - stdout - {'loss': 0.7777, 'grad_norm': 0.7076729536056519, 'learning_rate': 1.3854817217935126e-05, 'epoch': 1.18} +2025-05-11 02:10:14 - ERROR - stderr - 39%|███▉ | 1470/3741 [8:44:20<13:34:08, 21.51s/it] +2025-05-11 02:10:33 - ERROR - stderr - 39%|███▉ | 1471/3741 [8:44:39<13:09:13, 20.86s/it] +2025-05-11 02:10:33 - ERROR - stderr - +2025-05-11 02:10:33 - ERROR - stderr - +2025-05-11 02:10:33 - INFO - stdout - {'loss': 0.7755, 'grad_norm': 0.6925358176231384, 'learning_rate': 1.384682570713693e-05, 'epoch': 1.18} +2025-05-11 02:10:33 - ERROR - stderr - 39%|███▉ | 1471/3741 [8:44:39<13:09:13, 20.86s/it] +2025-05-11 02:10:57 - ERROR - stderr - 39%|███▉ | 1472/3741 [8:45:04<13:47:43, 21.89s/it] +2025-05-11 02:10:57 - ERROR - stderr - +2025-05-11 02:10:57 - ERROR - stderr - +2025-05-11 02:10:57 - INFO - stdout - {'loss': 0.7508, 'grad_norm': 0.6519325971603394, 'learning_rate': 1.3838831311857812e-05, 'epoch': 1.18} +2025-05-11 02:10:57 - ERROR - stderr - 39%|███▉ | 1472/3741 [8:45:04<13:47:43, 21.89s/it] +2025-05-11 02:11:17 - ERROR - stderr - 39%|███▉ | 1473/3741 [8:45:23<13:24:16, 21.28s/it] +2025-05-11 02:11:17 - ERROR - stderr - +2025-05-11 02:11:17 - ERROR - stderr - +2025-05-11 02:11:17 - INFO - stdout - {'loss': 0.7513, 'grad_norm': 0.6759724020957947, 'learning_rate': 1.383083403809224e-05, 'epoch': 1.18} +2025-05-11 02:11:17 - ERROR - stderr - 39%|███▉ | 1473/3741 [8:45:23<13:24:16, 21.28s/it] +2025-05-11 02:11:37 - ERROR - stderr - 39%|███▉ | 1474/3741 [8:45:44<13:10:10, 20.91s/it] +2025-05-11 02:11:37 - ERROR - stderr - +2025-05-11 02:11:37 - ERROR - stderr - +2025-05-11 02:11:37 - INFO - stdout - {'loss': 0.7574, 'grad_norm': 0.6461741328239441, 'learning_rate': 1.3822833891836846e-05, 'epoch': 1.18} +2025-05-11 02:11:37 - ERROR - stderr - 39%|███▉ | 1474/3741 [8:45:44<13:10:10, 20.91s/it] +2025-05-11 02:11:59 - ERROR - stderr - 39%|███▉ | 1475/3741 [8:46:05<13:15:00, 21.05s/it] +2025-05-11 02:11:59 - ERROR - stderr - +2025-05-11 02:11:59 - ERROR - stderr - +2025-05-11 02:11:59 - INFO - stdout - {'loss': 0.7941, 'grad_norm': 0.6571494936943054, 'learning_rate': 1.3814830879090409e-05, 'epoch': 1.18} +2025-05-11 02:11:59 - ERROR - stderr - 39%|███▉ | 1475/3741 [8:46:05<13:15:00, 21.05s/it] +2025-05-11 02:12:18 - ERROR - stderr - 39%|███▉ | 1476/3741 [8:46:24<12:56:42, 20.58s/it] +2025-05-11 02:12:18 - ERROR - stderr - +2025-05-11 02:12:18 - ERROR - stderr - +2025-05-11 02:12:18 - INFO - stdout - {'loss': 0.7657, 'grad_norm': 0.7067133784294128, 'learning_rate': 1.3806825005853855e-05, 'epoch': 1.18} +2025-05-11 02:12:18 - ERROR - stderr - 39%|███▉ | 1476/3741 [8:46:24<12:56:42, 20.58s/it] +2025-05-11 02:12:41 - ERROR - stderr - 39%|███▉ | 1477/3741 [8:46:47<13:22:04, 21.26s/it] +2025-05-11 02:12:41 - ERROR - stderr - +2025-05-11 02:12:41 - ERROR - stderr - +2025-05-11 02:12:41 - INFO - stdout - {'loss': 0.7547, 'grad_norm': 0.6409063935279846, 'learning_rate': 1.3798816278130268e-05, 'epoch': 1.18} +2025-05-11 02:12:41 - ERROR - stderr - 39%|███▉ | 1477/3741 [8:46:47<13:22:04, 21.26s/it] +2025-05-11 02:13:00 - ERROR - stderr - 40%|███▉ | 1478/3741 [8:47:07<13:00:16, 20.69s/it] +2025-05-11 02:13:00 - ERROR - stderr - +2025-05-11 02:13:00 - ERROR - stderr - +2025-05-11 02:13:00 - INFO - stdout - {'loss': 0.7466, 'grad_norm': 0.6490313410758972, 'learning_rate': 1.3790804701924861e-05, 'epoch': 1.19} +2025-05-11 02:13:00 - ERROR - stderr - 40%|███▉ | 1478/3741 [8:47:07<13:00:16, 20.69s/it] +2025-05-11 02:13:24 - ERROR - stderr - 40%|███▉ | 1479/3741 [8:47:30<13:31:44, 21.53s/it] +2025-05-11 02:13:24 - ERROR - stderr - +2025-05-11 02:13:24 - ERROR - stderr - +2025-05-11 02:13:24 - INFO - stdout - {'loss': 0.7466, 'grad_norm': 0.6448349952697754, 'learning_rate': 1.378279028324499e-05, 'epoch': 1.19} +2025-05-11 02:13:24 - ERROR - stderr - 40%|███▉ | 1479/3741 [8:47:30<13:31:44, 21.53s/it] +2025-05-11 02:13:43 - ERROR - stderr - 40%|███▉ | 1480/3741 [8:47:49<13:07:03, 20.89s/it] +2025-05-11 02:13:43 - ERROR - stderr - +2025-05-11 02:13:43 - ERROR - stderr - +2025-05-11 02:13:43 - INFO - stdout - {'loss': 0.7569, 'grad_norm': 0.6649291515350342, 'learning_rate': 1.3774773028100135e-05, 'epoch': 1.19} +2025-05-11 02:13:43 - ERROR - stderr - 40%|███▉ | 1480/3741 [8:47:49<13:07:03, 20.89s/it] +2025-05-11 02:14:03 - ERROR - stderr - 40%|███▉ | 1481/3741 [8:48:09<12:51:14, 20.48s/it] +2025-05-11 02:14:03 - ERROR - stderr - +2025-05-11 02:14:03 - ERROR - stderr - +2025-05-11 02:14:03 - INFO - stdout - {'loss': 0.7479, 'grad_norm': 0.6493022441864014, 'learning_rate': 1.3766752942501911e-05, 'epoch': 1.19} +2025-05-11 02:14:03 - ERROR - stderr - 40%|███▉ | 1481/3741 [8:48:09<12:51:14, 20.48s/it] +2025-05-11 02:14:23 - ERROR - stderr - 40%|███▉ | 1482/3741 [8:48:29<12:50:00, 20.45s/it] +2025-05-11 02:14:23 - ERROR - stderr - +2025-05-11 02:14:23 - ERROR - stderr - +2025-05-11 02:14:23 - INFO - stdout - {'loss': 0.7851, 'grad_norm': 0.6873480081558228, 'learning_rate': 1.375873003246405e-05, 'epoch': 1.19} +2025-05-11 02:14:23 - ERROR - stderr - 40%|███▉ | 1482/3741 [8:48:29<12:50:00, 20.45s/it] +2025-05-11 02:14:43 - ERROR - stderr - 40%|███▉ | 1483/3741 [8:48:49<12:38:31, 20.16s/it] +2025-05-11 02:14:43 - ERROR - stderr - +2025-05-11 02:14:43 - ERROR - stderr - +2025-05-11 02:14:43 - INFO - stdout - {'loss': 0.7799, 'grad_norm': 0.6878486275672913, 'learning_rate': 1.3750704304002398e-05, 'epoch': 1.19} +2025-05-11 02:14:43 - ERROR - stderr - 40%|███▉ | 1483/3741 [8:48:49<12:38:31, 20.16s/it] +2025-05-11 02:15:07 - ERROR - stderr - 40%|███▉ | 1484/3741 [8:49:13<13:26:35, 21.44s/it] +2025-05-11 02:15:07 - ERROR - stderr - +2025-05-11 02:15:07 - ERROR - stderr - +2025-05-11 02:15:07 - INFO - stdout - {'loss': 0.7607, 'grad_norm': 0.6725685596466064, 'learning_rate': 1.3742675763134926e-05, 'epoch': 1.19} +2025-05-11 02:15:07 - ERROR - stderr - 40%|███▉ | 1484/3741 [8:49:13<13:26:35, 21.44s/it] +2025-05-11 02:15:26 - ERROR - stderr - 40%|███▉ | 1485/3741 [8:49:33<13:04:36, 20.87s/it] +2025-05-11 02:15:27 - ERROR - stderr - +2025-05-11 02:15:27 - ERROR - stderr - +2025-05-11 02:15:27 - INFO - stdout - {'loss': 0.7748, 'grad_norm': 0.715043842792511, 'learning_rate': 1.3734644415881708e-05, 'epoch': 1.19} +2025-05-11 02:15:27 - ERROR - stderr - 40%|███▉ | 1485/3741 [8:49:33<13:04:36, 20.87s/it] +2025-05-11 02:15:51 - ERROR - stderr - 40%|███▉ | 1486/3741 [8:49:58<13:47:58, 22.03s/it] +2025-05-11 02:15:51 - ERROR - stderr - +2025-05-11 02:15:51 - ERROR - stderr - +2025-05-11 02:15:51 - INFO - stdout - {'loss': 0.7979, 'grad_norm': 0.7091044783592224, 'learning_rate': 1.3726610268264917e-05, 'epoch': 1.19} +2025-05-11 02:15:51 - ERROR - stderr - 40%|███▉ | 1486/3741 [8:49:58<13:47:58, 22.03s/it] +2025-05-11 02:16:11 - ERROR - stderr - 40%|███▉ | 1487/3741 [8:50:17<13:17:40, 21.23s/it] +2025-05-11 02:16:11 - ERROR - stderr - +2025-05-11 02:16:11 - ERROR - stderr - +2025-05-11 02:16:11 - INFO - stdout - {'loss': 0.7526, 'grad_norm': 0.7154737710952759, 'learning_rate': 1.3718573326308834e-05, 'epoch': 1.19} +2025-05-11 02:16:11 - ERROR - stderr - 40%|███▉ | 1487/3741 [8:50:17<13:17:40, 21.23s/it] +2025-05-11 02:16:30 - ERROR - stderr - 40%|███▉ | 1488/3741 [8:50:36<12:57:38, 20.71s/it] +2025-05-11 02:16:30 - ERROR - stderr - +2025-05-11 02:16:30 - ERROR - stderr - +2025-05-11 02:16:30 - INFO - stdout - {'loss': 0.7468, 'grad_norm': 0.6581177711486816, 'learning_rate': 1.3710533596039828e-05, 'epoch': 1.19} +2025-05-11 02:16:30 - ERROR - stderr - 40%|███▉ | 1488/3741 [8:50:36<12:57:38, 20.71s/it] +2025-05-11 02:16:50 - ERROR - stderr - 40%|███▉ | 1489/3741 [8:50:57<12:52:10, 20.57s/it] +2025-05-11 02:16:50 - ERROR - stderr - +2025-05-11 02:16:50 - ERROR - stderr - +2025-05-11 02:16:50 - INFO - stdout - {'loss': 0.7795, 'grad_norm': 0.7028568387031555, 'learning_rate': 1.3702491083486366e-05, 'epoch': 1.19} +2025-05-11 02:16:50 - ERROR - stderr - 40%|███▉ | 1489/3741 [8:50:57<12:52:10, 20.57s/it] +2025-05-11 02:17:10 - ERROR - stderr - 40%|███▉ | 1490/3741 [8:51:16<12:40:51, 20.28s/it] +2025-05-11 02:17:10 - ERROR - stderr - +2025-05-11 02:17:10 - ERROR - stderr - +2025-05-11 02:17:10 - INFO - stdout - {'loss': 0.8018, 'grad_norm': 0.6829168200492859, 'learning_rate': 1.3694445794678996e-05, 'epoch': 1.19} +2025-05-11 02:17:10 - ERROR - stderr - 40%|███▉ | 1490/3741 [8:51:16<12:40:51, 20.28s/it] +2025-05-11 02:17:33 - ERROR - stderr - 40%|███▉ | 1491/3741 [8:51:39<13:13:16, 21.15s/it] +2025-05-11 02:17:33 - ERROR - stderr - +2025-05-11 02:17:33 - ERROR - stderr - +2025-05-11 02:17:33 - INFO - stdout - {'loss': 0.7477, 'grad_norm': 0.6602156162261963, 'learning_rate': 1.3686397735650353e-05, 'epoch': 1.2} +2025-05-11 02:17:33 - ERROR - stderr - 40%|███▉ | 1491/3741 [8:51:39<13:13:16, 21.15s/it] +2025-05-11 02:17:53 - ERROR - stderr - 40%|███▉ | 1492/3741 [8:51:59<12:52:54, 20.62s/it] +2025-05-11 02:17:53 - ERROR - stderr - +2025-05-11 02:17:53 - ERROR - stderr - +2025-05-11 02:17:53 - INFO - stdout - {'loss': 0.7608, 'grad_norm': 0.6569467782974243, 'learning_rate': 1.3678346912435141e-05, 'epoch': 1.2} +2025-05-11 02:17:53 - ERROR - stderr - 40%|███▉ | 1492/3741 [8:51:59<12:52:54, 20.62s/it] +2025-05-11 02:18:16 - ERROR - stderr - 40%|███▉ | 1493/3741 [8:52:22<13:20:14, 21.36s/it] +2025-05-11 02:18:16 - ERROR - stderr - +2025-05-11 02:18:16 - ERROR - stderr - +2025-05-11 02:18:16 - INFO - stdout - {'loss': 0.7646, 'grad_norm': 0.6565800905227661, 'learning_rate': 1.3670293331070142e-05, 'epoch': 1.2} +2025-05-11 02:18:16 - ERROR - stderr - 40%|███▉ | 1493/3741 [8:52:22<13:20:14, 21.36s/it] +2025-05-11 02:18:35 - ERROR - stderr - 40%|███▉ | 1494/3741 [8:52:41<12:56:00, 20.72s/it] +2025-05-11 02:18:35 - ERROR - stderr - +2025-05-11 02:18:35 - ERROR - stderr - +2025-05-11 02:18:35 - INFO - stdout - {'loss': 0.8143, 'grad_norm': 0.6818427443504333, 'learning_rate': 1.3662236997594209e-05, 'epoch': 1.2} +2025-05-11 02:18:35 - ERROR - stderr - 40%|███▉ | 1494/3741 [8:52:41<12:56:00, 20.72s/it] +2025-05-11 02:18:54 - ERROR - stderr - 40%|███▉ | 1495/3741 [8:53:01<12:43:23, 20.39s/it] +2025-05-11 02:18:54 - ERROR - stderr - +2025-05-11 02:18:54 - ERROR - stderr - +2025-05-11 02:18:54 - INFO - stdout - {'loss': 0.7981, 'grad_norm': 0.7073442935943604, 'learning_rate': 1.3654177918048253e-05, 'epoch': 1.2} +2025-05-11 02:18:54 - ERROR - stderr - 40%|███▉ | 1495/3741 [8:53:01<12:43:23, 20.39s/it] +2025-05-11 02:19:16 - ERROR - stderr - 40%|███▉ | 1496/3741 [8:53:22<12:51:45, 20.63s/it] +2025-05-11 02:19:16 - ERROR - stderr - +2025-05-11 02:19:16 - ERROR - stderr - +2025-05-11 02:19:16 - INFO - stdout - {'loss': 0.7556, 'grad_norm': 0.6537404656410217, 'learning_rate': 1.3646116098475246e-05, 'epoch': 1.2} +2025-05-11 02:19:16 - ERROR - stderr - 40%|███▉ | 1496/3741 [8:53:22<12:51:45, 20.63s/it] +2025-05-11 02:19:35 - ERROR - stderr - 40%|████ | 1497/3741 [8:53:42<12:39:32, 20.31s/it] +2025-05-11 02:19:35 - ERROR - stderr - +2025-05-11 02:19:35 - ERROR - stderr - +2025-05-11 02:19:35 - INFO - stdout - {'loss': 0.7843, 'grad_norm': 0.6920551061630249, 'learning_rate': 1.3638051544920217e-05, 'epoch': 1.2} +2025-05-11 02:19:35 - ERROR - stderr - 40%|████ | 1497/3741 [8:53:42<12:39:32, 20.31s/it] +2025-05-11 02:19:58 - ERROR - stderr - 40%|████ | 1498/3741 [8:54:05<13:11:46, 21.18s/it] +2025-05-11 02:19:58 - ERROR - stderr - +2025-05-11 02:19:58 - ERROR - stderr - +2025-05-11 02:19:58 - INFO - stdout - {'loss': 0.7822, 'grad_norm': 0.9153415560722351, 'learning_rate': 1.3629984263430238e-05, 'epoch': 1.2} +2025-05-11 02:19:58 - ERROR - stderr - 40%|████ | 1498/3741 [8:54:05<13:11:46, 21.18s/it] +2025-05-11 02:20:18 - ERROR - stderr - 40%|████ | 1499/3741 [8:54:24<12:54:17, 20.72s/it] +2025-05-11 02:20:18 - ERROR - stderr - +2025-05-11 02:20:18 - ERROR - stderr - +2025-05-11 02:20:18 - INFO - stdout - {'loss': 0.75, 'grad_norm': 0.6674759387969971, 'learning_rate': 1.3621914260054437e-05, 'epoch': 1.2} +2025-05-11 02:20:18 - ERROR - stderr - 40%|████ | 1499/3741 [8:54:24<12:54:17, 20.72s/it] +2025-05-11 02:20:41 - ERROR - stderr - 40%|████ | 1500/3741 [8:54:48<13:23:49, 21.52s/it] +2025-05-11 02:20:41 - ERROR - stderr - +2025-05-11 02:20:41 - ERROR - stderr - +2025-05-11 02:20:41 - INFO - stdout - {'loss': 0.7385, 'grad_norm': 0.6499200463294983, 'learning_rate': 1.3613841540843978e-05, 'epoch': 1.2} +2025-05-11 02:20:41 - ERROR - stderr - 40%|████ | 1500/3741 [8:54:48<13:23:49, 21.52s/it] +2025-05-11 02:21:01 - ERROR - stderr - 40%|████ | 1501/3741 [8:55:07<13:01:29, 20.93s/it] +2025-05-11 02:21:01 - ERROR - stderr - +2025-05-11 02:21:01 - ERROR - stderr - +2025-05-11 02:21:01 - INFO - stdout - {'loss': 0.7836, 'grad_norm': 0.724064290523529, 'learning_rate': 1.3605766111852052e-05, 'epoch': 1.2} +2025-05-11 02:21:01 - ERROR - stderr - 40%|████ | 1501/3741 [8:55:07<13:01:29, 20.93s/it] +2025-05-11 02:21:20 - ERROR - stderr - 40%|████ | 1502/3741 [8:55:27<12:45:01, 20.50s/it] +2025-05-11 02:21:21 - ERROR - stderr - +2025-05-11 02:21:21 - ERROR - stderr - +2025-05-11 02:21:21 - INFO - stdout - {'loss': 0.7304, 'grad_norm': 0.6528597474098206, 'learning_rate': 1.3597687979133898e-05, 'epoch': 1.2} +2025-05-11 02:21:21 - ERROR - stderr - 40%|████ | 1502/3741 [8:55:27<12:45:01, 20.50s/it] +2025-05-11 02:21:42 - ERROR - stderr - 40%|████ | 1503/3741 [8:55:48<12:57:15, 20.84s/it] +2025-05-11 02:21:42 - ERROR - stderr - +2025-05-11 02:21:42 - ERROR - stderr - +2025-05-11 02:21:42 - INFO - stdout - {'loss': 0.7487, 'grad_norm': 0.6762754917144775, 'learning_rate': 1.3589607148746775e-05, 'epoch': 1.21} +2025-05-11 02:21:42 - ERROR - stderr - 40%|████ | 1503/3741 [8:55:48<12:57:15, 20.84s/it] +2025-05-11 02:22:01 - ERROR - stderr - 40%|████ | 1504/3741 [8:56:08<12:40:09, 20.39s/it] +2025-05-11 02:22:01 - ERROR - stderr - +2025-05-11 02:22:01 - ERROR - stderr - +2025-05-11 02:22:01 - INFO - stdout - {'loss': 0.762, 'grad_norm': 0.693828284740448, 'learning_rate': 1.3581523626749966e-05, 'epoch': 1.21} +2025-05-11 02:22:01 - ERROR - stderr - 40%|████ | 1504/3741 [8:56:08<12:40:09, 20.39s/it] +2025-05-11 02:22:26 - ERROR - stderr - 40%|████ | 1505/3741 [8:56:32<13:22:21, 21.53s/it] +2025-05-11 02:22:26 - ERROR - stderr - +2025-05-11 02:22:26 - ERROR - stderr - +2025-05-11 02:22:26 - INFO - stdout - {'loss': 0.8055, 'grad_norm': 0.6931832432746887, 'learning_rate': 1.3573437419204765e-05, 'epoch': 1.21} +2025-05-11 02:22:26 - ERROR - stderr - 40%|████ | 1505/3741 [8:56:32<13:22:21, 21.53s/it] +2025-05-11 02:22:45 - ERROR - stderr - 40%|████ | 1506/3741 [8:56:52<13:00:22, 20.95s/it] +2025-05-11 02:22:45 - ERROR - stderr - +2025-05-11 02:22:45 - ERROR - stderr - +2025-05-11 02:22:45 - INFO - stdout - {'loss': 0.7511, 'grad_norm': 0.691015362739563, 'learning_rate': 1.3565348532174487e-05, 'epoch': 1.21} +2025-05-11 02:22:45 - ERROR - stderr - 40%|████ | 1506/3741 [8:56:52<13:00:22, 20.95s/it] +2025-05-11 02:23:09 - ERROR - stderr - 40%|████ | 1507/3741 [8:57:15<13:31:36, 21.80s/it] +2025-05-11 02:23:09 - ERROR - stderr - +2025-05-11 02:23:09 - ERROR - stderr - +2025-05-11 02:23:09 - INFO - stdout - {'loss': 0.7281, 'grad_norm': 0.6388340592384338, 'learning_rate': 1.355725697172446e-05, 'epoch': 1.21} +2025-05-11 02:23:09 - ERROR - stderr - 40%|████ | 1507/3741 [8:57:15<13:31:36, 21.80s/it] +2025-05-11 02:23:29 - ERROR - stderr - 40%|████ | 1508/3741 [8:57:35<13:08:16, 21.18s/it] +2025-05-11 02:23:29 - ERROR - stderr - +2025-05-11 02:23:29 - ERROR - stderr - +2025-05-11 02:23:29 - INFO - stdout - {'loss': 0.7788, 'grad_norm': 0.6492217779159546, 'learning_rate': 1.354916274392201e-05, 'epoch': 1.21} +2025-05-11 02:23:29 - ERROR - stderr - 40%|████ | 1508/3741 [8:57:35<13:08:16, 21.18s/it] +2025-05-11 02:23:49 - ERROR - stderr - 40%|████ | 1509/3741 [8:57:56<13:01:07, 21.00s/it] +2025-05-11 02:23:49 - ERROR - stderr - +2025-05-11 02:23:49 - ERROR - stderr - +2025-05-11 02:23:49 - INFO - stdout - {'loss': 0.7426, 'grad_norm': 0.6737982034683228, 'learning_rate': 1.3541065854836464e-05, 'epoch': 1.21} +2025-05-11 02:23:49 - ERROR - stderr - 40%|████ | 1509/3741 [8:57:56<13:01:07, 21.00s/it] +2025-05-11 02:24:09 - ERROR - stderr - 40%|████ | 1510/3741 [8:58:15<12:47:08, 20.63s/it] +2025-05-11 02:24:09 - ERROR - stderr - +2025-05-11 02:24:09 - ERROR - stderr - +2025-05-11 02:24:09 - INFO - stdout - {'loss': 0.7495, 'grad_norm': 0.6400448083877563, 'learning_rate': 1.3532966310539142e-05, 'epoch': 1.21} +2025-05-11 02:24:09 - ERROR - stderr - 40%|████ | 1510/3741 [8:58:15<12:47:08, 20.63s/it] +2025-05-11 02:24:29 - ERROR - stderr - 40%|████ | 1511/3741 [8:58:35<12:37:38, 20.38s/it] +2025-05-11 02:24:29 - ERROR - stderr - +2025-05-11 02:24:29 - ERROR - stderr - +2025-05-11 02:24:29 - INFO - stdout - {'loss': 0.7913, 'grad_norm': 0.6762053370475769, 'learning_rate': 1.352486411710336e-05, 'epoch': 1.21} +2025-05-11 02:24:29 - ERROR - stderr - 40%|████ | 1511/3741 [8:58:35<12:37:38, 20.38s/it] +2025-05-11 02:24:52 - ERROR - stderr - 40%|████ | 1512/3741 [8:58:58<13:08:59, 21.24s/it] +2025-05-11 02:24:52 - ERROR - stderr - +2025-05-11 02:24:52 - ERROR - stderr - +2025-05-11 02:24:52 - INFO - stdout - {'loss': 0.7498, 'grad_norm': 0.6663560271263123, 'learning_rate': 1.3516759280604423e-05, 'epoch': 1.21} +2025-05-11 02:24:52 - ERROR - stderr - 40%|████ | 1512/3741 [8:58:58<13:08:59, 21.24s/it] +2025-05-11 02:25:12 - ERROR - stderr - 40%|████ | 1513/3741 [8:59:18<12:52:34, 20.81s/it] +2025-05-11 02:25:12 - ERROR - stderr - +2025-05-11 02:25:12 - ERROR - stderr - +2025-05-11 02:25:12 - INFO - stdout - {'loss': 0.7328, 'grad_norm': 0.6616519689559937, 'learning_rate': 1.3508651807119609e-05, 'epoch': 1.21} +2025-05-11 02:25:12 - ERROR - stderr - 40%|████ | 1513/3741 [8:59:18<12:52:34, 20.81s/it] +2025-05-11 02:25:35 - ERROR - stderr - 40%|████ | 1514/3741 [8:59:41<13:18:37, 21.52s/it] +2025-05-11 02:25:35 - ERROR - stderr - +2025-05-11 02:25:35 - ERROR - stderr - +2025-05-11 02:25:35 - INFO - stdout - {'loss': 0.7758, 'grad_norm': 0.636685311794281, 'learning_rate': 1.3500541702728175e-05, 'epoch': 1.21} +2025-05-11 02:25:35 - ERROR - stderr - 40%|████ | 1514/3741 [8:59:41<13:18:37, 21.52s/it] +2025-05-11 02:25:55 - ERROR - stderr - 40%|████ | 1515/3741 [9:00:01<12:59:17, 21.00s/it] +2025-05-11 02:25:55 - ERROR - stderr - +2025-05-11 02:25:55 - ERROR - stderr - +2025-05-11 02:25:55 - INFO - stdout - {'loss': 0.7013, 'grad_norm': 0.6234886050224304, 'learning_rate': 1.3492428973511363e-05, 'epoch': 1.21} +2025-05-11 02:25:55 - ERROR - stderr - 40%|████ | 1515/3741 [9:00:01<12:59:17, 21.00s/it] +2025-05-11 02:26:19 - ERROR - stderr - 41%|████ | 1516/3741 [9:00:25<13:32:49, 21.92s/it] +2025-05-11 02:26:19 - ERROR - stderr - +2025-05-11 02:26:19 - ERROR - stderr - +2025-05-11 02:26:19 - INFO - stdout - {'loss': 0.7369, 'grad_norm': 0.6537496447563171, 'learning_rate': 1.3484313625552362e-05, 'epoch': 1.22} +2025-05-11 02:26:19 - ERROR - stderr - 41%|████ | 1516/3741 [9:00:25<13:32:49, 21.92s/it] +2025-05-11 02:26:39 - ERROR - stderr - 41%|████ | 1517/3741 [9:00:45<13:11:06, 21.34s/it] +2025-05-11 02:26:39 - ERROR - stderr - +2025-05-11 02:26:39 - ERROR - stderr - +2025-05-11 02:26:39 - INFO - stdout - {'loss': 0.7798, 'grad_norm': 0.6941004991531372, 'learning_rate': 1.3476195664936347e-05, 'epoch': 1.22} +2025-05-11 02:26:39 - ERROR - stderr - 41%|████ | 1517/3741 [9:00:45<13:11:06, 21.34s/it] +2025-05-11 02:26:59 - ERROR - stderr - 41%|████ | 1518/3741 [9:01:06<12:58:47, 21.02s/it] +2025-05-11 02:26:59 - ERROR - stderr - +2025-05-11 02:26:59 - ERROR - stderr - +2025-05-11 02:26:59 - INFO - stdout - {'loss': 0.7566, 'grad_norm': 0.670886754989624, 'learning_rate': 1.3468075097750432e-05, 'epoch': 1.22} +2025-05-11 02:26:59 - ERROR - stderr - 41%|████ | 1518/3741 [9:01:06<12:58:47, 21.02s/it] +2025-05-11 02:27:20 - ERROR - stderr - 41%|████ | 1519/3741 [9:01:27<12:59:08, 21.04s/it] +2025-05-11 02:27:20 - ERROR - stderr - +2025-05-11 02:27:20 - ERROR - stderr - +2025-05-11 02:27:20 - INFO - stdout - {'loss': 0.7695, 'grad_norm': 0.751348614692688, 'learning_rate': 1.3459951930083698e-05, 'epoch': 1.22} +2025-05-11 02:27:20 - ERROR - stderr - 41%|████ | 1519/3741 [9:01:27<12:59:08, 21.04s/it] +2025-05-11 02:27:40 - ERROR - stderr - 41%|████ | 1520/3741 [9:01:47<12:47:25, 20.73s/it] +2025-05-11 02:27:40 - ERROR - stderr - +2025-05-11 02:27:40 - ERROR - stderr - +2025-05-11 02:27:40 - INFO - stdout - {'loss': 0.7587, 'grad_norm': 0.6668398976325989, 'learning_rate': 1.345182616802718e-05, 'epoch': 1.22} +2025-05-11 02:27:40 - ERROR - stderr - 41%|████ | 1520/3741 [9:01:47<12:47:25, 20.73s/it] +2025-05-11 02:28:03 - ERROR - stderr - 41%|████ | 1521/3741 [9:02:10<13:13:16, 21.44s/it] +2025-05-11 02:28:03 - ERROR - stderr - +2025-05-11 02:28:03 - ERROR - stderr - +2025-05-11 02:28:03 - INFO - stdout - {'loss': 0.7838, 'grad_norm': 0.6931249499320984, 'learning_rate': 1.3443697817673842e-05, 'epoch': 1.22} +2025-05-11 02:28:03 - ERROR - stderr - 41%|████ | 1521/3741 [9:02:10<13:13:16, 21.44s/it] +2025-05-11 02:28:23 - ERROR - stderr - 41%|████ | 1522/3741 [9:02:29<12:51:11, 20.85s/it] +2025-05-11 02:28:23 - ERROR - stderr - +2025-05-11 02:28:23 - ERROR - stderr - +2025-05-11 02:28:23 - INFO - stdout - {'loss': 0.7395, 'grad_norm': 0.6422427892684937, 'learning_rate': 1.34355668851186e-05, 'epoch': 1.22} +2025-05-11 02:28:23 - ERROR - stderr - 41%|████ | 1522/3741 [9:02:29<12:51:11, 20.85s/it] +2025-05-11 02:28:47 - ERROR - stderr - 41%|████ | 1523/3741 [9:02:53<13:25:26, 21.79s/it] +2025-05-11 02:28:47 - ERROR - stderr - +2025-05-11 02:28:47 - ERROR - stderr - +2025-05-11 02:28:47 - INFO - stdout - {'loss': 0.8113, 'grad_norm': 0.6797451376914978, 'learning_rate': 1.3427433376458306e-05, 'epoch': 1.22} +2025-05-11 02:28:47 - ERROR - stderr - 41%|████ | 1523/3741 [9:02:53<13:25:26, 21.79s/it] +2025-05-11 02:29:06 - ERROR - stderr - 41%|████ | 1524/3741 [9:03:12<12:57:04, 21.03s/it] +2025-05-11 02:29:06 - ERROR - stderr - +2025-05-11 02:29:06 - ERROR - stderr - +2025-05-11 02:29:06 - INFO - stdout - {'loss': 0.774, 'grad_norm': 0.6311590075492859, 'learning_rate': 1.341929729779174e-05, 'epoch': 1.22} +2025-05-11 02:29:06 - ERROR - stderr - 41%|████ | 1524/3741 [9:03:12<12:57:04, 21.03s/it] +2025-05-11 02:29:26 - ERROR - stderr - 41%|████ | 1525/3741 [9:03:33<12:47:38, 20.78s/it] +2025-05-11 02:29:26 - ERROR - stderr - +2025-05-11 02:29:26 - ERROR - stderr - +2025-05-11 02:29:26 - INFO - stdout - {'loss': 0.7781, 'grad_norm': 0.6638842821121216, 'learning_rate': 1.3411158655219615e-05, 'epoch': 1.22} +2025-05-11 02:29:26 - ERROR - stderr - 41%|████ | 1525/3741 [9:03:33<12:47:38, 20.78s/it] +2025-05-11 02:29:46 - ERROR - stderr - 41%|████ | 1526/3741 [9:03:52<12:35:34, 20.47s/it] +2025-05-11 02:29:46 - ERROR - stderr - +2025-05-11 02:29:46 - ERROR - stderr - +2025-05-11 02:29:46 - INFO - stdout - {'loss': 0.7779, 'grad_norm': 0.6489601135253906, 'learning_rate': 1.3403017454844556e-05, 'epoch': 1.22} +2025-05-11 02:29:46 - ERROR - stderr - 41%|████ | 1526/3741 [9:03:52<12:35:34, 20.47s/it] +2025-05-11 02:30:06 - ERROR - stderr - 41%|████ | 1527/3741 [9:04:12<12:26:47, 20.24s/it] +2025-05-11 02:30:06 - ERROR - stderr - +2025-05-11 02:30:06 - ERROR - stderr - +2025-05-11 02:30:06 - INFO - stdout - {'loss': 0.7736, 'grad_norm': 0.6765308976173401, 'learning_rate': 1.3394873702771114e-05, 'epoch': 1.22} +2025-05-11 02:30:06 - ERROR - stderr - 41%|████ | 1527/3741 [9:04:12<12:26:47, 20.24s/it] +2025-05-11 02:30:29 - ERROR - stderr - 41%|████ | 1528/3741 [9:04:35<12:58:22, 21.10s/it] +2025-05-11 02:30:29 - ERROR - stderr - +2025-05-11 02:30:29 - ERROR - stderr - +2025-05-11 02:30:29 - INFO - stdout - {'loss': 0.7436, 'grad_norm': 0.6653009653091431, 'learning_rate': 1.3386727405105756e-05, 'epoch': 1.23} +2025-05-11 02:30:29 - ERROR - stderr - 41%|████ | 1528/3741 [9:04:35<12:58:22, 21.10s/it] +2025-05-11 02:30:48 - ERROR - stderr - 41%|████ | 1529/3741 [9:04:55<12:39:57, 20.61s/it] +2025-05-11 02:30:48 - ERROR - stderr - +2025-05-11 02:30:48 - ERROR - stderr - +2025-05-11 02:30:48 - INFO - stdout - {'loss': 0.7256, 'grad_norm': 0.6361008286476135, 'learning_rate': 1.337857856795685e-05, 'epoch': 1.23} +2025-05-11 02:30:48 - ERROR - stderr - 41%|████ | 1529/3741 [9:04:55<12:39:57, 20.61s/it] +2025-05-11 02:31:12 - ERROR - stderr - 41%|████ | 1530/3741 [9:05:19<13:17:20, 21.64s/it] +2025-05-11 02:31:12 - ERROR - stderr - +2025-05-11 02:31:12 - ERROR - stderr - +2025-05-11 02:31:12 - INFO - stdout - {'loss': 0.7511, 'grad_norm': 0.6522552967071533, 'learning_rate': 1.3370427197434673e-05, 'epoch': 1.23} +2025-05-11 02:31:12 - ERROR - stderr - 41%|████ | 1530/3741 [9:05:19<13:17:20, 21.64s/it] +2025-05-11 02:31:32 - ERROR - stderr - 41%|████ | 1531/3741 [9:05:38<12:51:33, 20.95s/it] +2025-05-11 02:31:32 - ERROR - stderr - +2025-05-11 02:31:32 - ERROR - stderr - +2025-05-11 02:31:32 - INFO - stdout - {'loss': 0.749, 'grad_norm': 0.6462847590446472, 'learning_rate': 1.3362273299651395e-05, 'epoch': 1.23} +2025-05-11 02:31:32 - ERROR - stderr - 41%|████ | 1531/3741 [9:05:38<12:51:33, 20.95s/it] +2025-05-11 02:31:54 - ERROR - stderr - 41%|████ | 1532/3741 [9:06:01<13:09:42, 21.45s/it] +2025-05-11 02:31:54 - ERROR - stderr - +2025-05-11 02:31:54 - ERROR - stderr - +2025-05-11 02:31:54 - INFO - stdout - {'loss': 0.7598, 'grad_norm': 0.6825747489929199, 'learning_rate': 1.3354116880721093e-05, 'epoch': 1.23} +2025-05-11 02:31:54 - ERROR - stderr - 41%|████ | 1532/3741 [9:06:01<13:09:42, 21.45s/it] +2025-05-11 02:32:14 - ERROR - stderr - 41%|████ | 1533/3741 [9:06:20<12:46:57, 20.84s/it] +2025-05-11 02:32:14 - ERROR - stderr - +2025-05-11 02:32:14 - ERROR - stderr - +2025-05-11 02:32:14 - INFO - stdout - {'loss': 0.7954, 'grad_norm': 0.6631978154182434, 'learning_rate': 1.334595794675973e-05, 'epoch': 1.23} +2025-05-11 02:32:14 - ERROR - stderr - 41%|████ | 1533/3741 [9:06:20<12:46:57, 20.84s/it] +2025-05-11 02:32:33 - ERROR - stderr - 41%|████ | 1534/3741 [9:06:40<12:31:14, 20.42s/it] +2025-05-11 02:32:33 - ERROR - stderr - +2025-05-11 02:32:33 - ERROR - stderr - +2025-05-11 02:32:33 - INFO - stdout - {'loss': 0.7623, 'grad_norm': 0.6733466386795044, 'learning_rate': 1.333779650388514e-05, 'epoch': 1.23} +2025-05-11 02:32:33 - ERROR - stderr - 41%|████ | 1534/3741 [9:06:40<12:31:14, 20.42s/it] +2025-05-11 02:32:55 - ERROR - stderr - 41%|████ | 1535/3741 [9:07:01<12:45:57, 20.83s/it] +2025-05-11 02:32:55 - ERROR - stderr - +2025-05-11 02:32:55 - ERROR - stderr - +2025-05-11 02:32:55 - INFO - stdout - {'loss': 0.7626, 'grad_norm': 0.6870176792144775, 'learning_rate': 1.3329632558217065e-05, 'epoch': 1.23} +2025-05-11 02:32:55 - ERROR - stderr - 41%|████ | 1535/3741 [9:07:01<12:45:57, 20.83s/it] +2025-05-11 02:33:15 - ERROR - stderr - 41%|████ | 1536/3741 [9:07:22<12:38:53, 20.65s/it] +2025-05-11 02:33:15 - ERROR - stderr - +2025-05-11 02:33:15 - ERROR - stderr - +2025-05-11 02:33:15 - INFO - stdout - {'loss': 0.7815, 'grad_norm': 0.7062235474586487, 'learning_rate': 1.33214661158771e-05, 'epoch': 1.23} +2025-05-11 02:33:15 - ERROR - stderr - 41%|████ | 1536/3741 [9:07:22<12:38:53, 20.65s/it] +2025-05-11 02:33:38 - ERROR - stderr - 41%|████ | 1537/3741 [9:07:44<12:59:22, 21.22s/it] +2025-05-11 02:33:38 - ERROR - stderr - +2025-05-11 02:33:38 - ERROR - stderr - +2025-05-11 02:33:38 - INFO - stdout - {'loss': 0.7597, 'grad_norm': 0.6793636679649353, 'learning_rate': 1.3313297182988722e-05, 'epoch': 1.23} +2025-05-11 02:33:38 - ERROR - stderr - 41%|████ | 1537/3741 [9:07:44<12:59:22, 21.22s/it] +2025-05-11 02:33:57 - ERROR - stderr - 41%|████ | 1538/3741 [9:08:03<12:37:17, 20.63s/it] +2025-05-11 02:33:57 - ERROR - stderr - +2025-05-11 02:33:57 - ERROR - stderr - +2025-05-11 02:33:57 - INFO - stdout - {'loss': 0.7883, 'grad_norm': 0.697195291519165, 'learning_rate': 1.3305125765677283e-05, 'epoch': 1.23} +2025-05-11 02:33:57 - ERROR - stderr - 41%|████ | 1538/3741 [9:08:03<12:37:17, 20.63s/it] +2025-05-11 02:34:21 - ERROR - stderr - 41%|████ | 1539/3741 [9:08:27<13:09:08, 21.50s/it] +2025-05-11 02:34:21 - ERROR - stderr - +2025-05-11 02:34:21 - ERROR - stderr - +2025-05-11 02:34:21 - INFO - stdout - {'loss': 0.7931, 'grad_norm': 0.7242034673690796, 'learning_rate': 1.3296951870069981e-05, 'epoch': 1.23} +2025-05-11 02:34:21 - ERROR - stderr - 41%|████ | 1539/3741 [9:08:27<13:09:08, 21.50s/it] +2025-05-11 02:34:40 - ERROR - stderr - 41%|████ | 1540/3741 [9:08:47<12:49:15, 20.97s/it] +2025-05-11 02:34:40 - ERROR - stderr - +2025-05-11 02:34:40 - ERROR - stderr - +2025-05-11 02:34:40 - INFO - stdout - {'loss': 0.737, 'grad_norm': 0.6695935130119324, 'learning_rate': 1.328877550229589e-05, 'epoch': 1.23} +2025-05-11 02:34:40 - ERROR - stderr - 41%|████ | 1540/3741 [9:08:47<12:49:15, 20.97s/it] +2025-05-11 02:35:00 - ERROR - stderr - 41%|████ | 1541/3741 [9:09:06<12:34:33, 20.58s/it] +2025-05-11 02:35:00 - ERROR - stderr - +2025-05-11 02:35:00 - ERROR - stderr - +2025-05-11 02:35:00 - INFO - stdout - {'loss': 0.7701, 'grad_norm': 0.6315851807594299, 'learning_rate': 1.3280596668485919e-05, 'epoch': 1.24} +2025-05-11 02:35:00 - ERROR - stderr - 41%|████ | 1541/3741 [9:09:06<12:34:33, 20.58s/it] +2025-05-11 02:35:20 - ERROR - stderr - 41%|████ | 1542/3741 [9:09:26<12:28:47, 20.43s/it] +2025-05-11 02:35:20 - ERROR - stderr - +2025-05-11 02:35:20 - ERROR - stderr - +2025-05-11 02:35:20 - INFO - stdout - {'loss': 0.7729, 'grad_norm': 0.6564438343048096, 'learning_rate': 1.3272415374772844e-05, 'epoch': 1.24} +2025-05-11 02:35:20 - ERROR - stderr - 41%|████ | 1542/3741 [9:09:26<12:28:47, 20.43s/it] +2025-05-11 02:35:40 - ERROR - stderr - 41%|████ | 1543/3741 [9:09:46<12:21:53, 20.25s/it] +2025-05-11 02:35:40 - ERROR - stderr - +2025-05-11 02:35:40 - ERROR - stderr - +2025-05-11 02:35:40 - INFO - stdout - {'loss': 0.8072, 'grad_norm': 0.6894364953041077, 'learning_rate': 1.3264231627291273e-05, 'epoch': 1.24} +2025-05-11 02:35:40 - ERROR - stderr - 41%|████ | 1543/3741 [9:09:46<12:21:53, 20.25s/it] +2025-05-11 02:36:03 - ERROR - stderr - 41%|████▏ | 1544/3741 [9:10:10<12:56:12, 21.20s/it] +2025-05-11 02:36:03 - ERROR - stderr - +2025-05-11 02:36:03 - ERROR - stderr - +2025-05-11 02:36:03 - INFO - stdout - {'loss': 0.752, 'grad_norm': 0.7009897828102112, 'learning_rate': 1.325604543217766e-05, 'epoch': 1.24} +2025-05-11 02:36:03 - ERROR - stderr - 41%|████▏ | 1544/3741 [9:10:10<12:56:12, 21.20s/it] +2025-05-11 02:36:23 - ERROR - stderr - 41%|████▏ | 1545/3741 [9:10:30<12:41:22, 20.80s/it] +2025-05-11 02:36:23 - ERROR - stderr - +2025-05-11 02:36:23 - ERROR - stderr - +2025-05-11 02:36:23 - INFO - stdout - {'loss': 0.7707, 'grad_norm': 0.7217838764190674, 'learning_rate': 1.3247856795570295e-05, 'epoch': 1.24} +2025-05-11 02:36:23 - ERROR - stderr - 41%|████▏ | 1545/3741 [9:10:30<12:41:22, 20.80s/it] +2025-05-11 02:36:47 - ERROR - stderr - 41%|████▏ | 1546/3741 [9:10:53<13:15:49, 21.75s/it] +2025-05-11 02:36:47 - ERROR - stderr - +2025-05-11 02:36:47 - ERROR - stderr - +2025-05-11 02:36:47 - INFO - stdout - {'loss': 0.7444, 'grad_norm': 0.6767851710319519, 'learning_rate': 1.3239665723609294e-05, 'epoch': 1.24} +2025-05-11 02:36:47 - ERROR - stderr - 41%|████▏ | 1546/3741 [9:10:53<13:15:49, 21.75s/it] +2025-05-11 02:37:07 - ERROR - stderr - 41%|████▏ | 1547/3741 [9:11:13<12:52:37, 21.13s/it] +2025-05-11 02:37:07 - ERROR - stderr - +2025-05-11 02:37:07 - ERROR - stderr - +2025-05-11 02:37:07 - INFO - stdout - {'loss': 0.7341, 'grad_norm': 0.68946373462677, 'learning_rate': 1.3231472222436605e-05, 'epoch': 1.24} +2025-05-11 02:37:07 - ERROR - stderr - 41%|████▏ | 1547/3741 [9:11:13<12:52:37, 21.13s/it] +2025-05-11 02:37:27 - ERROR - stderr - 41%|████▏ | 1548/3741 [9:11:33<12:42:10, 20.85s/it] +2025-05-11 02:37:27 - ERROR - stderr - +2025-05-11 02:37:27 - ERROR - stderr - +2025-05-11 02:37:27 - INFO - stdout - {'loss': 0.7759, 'grad_norm': 0.7371004223823547, 'learning_rate': 1.3223276298195988e-05, 'epoch': 1.24} +2025-05-11 02:37:27 - ERROR - stderr - 41%|████▏ | 1548/3741 [9:11:33<12:42:10, 20.85s/it] +2025-05-11 02:37:46 - ERROR - stderr - 41%|████▏ | 1549/3741 [9:11:53<12:25:51, 20.42s/it] +2025-05-11 02:37:46 - ERROR - stderr - +2025-05-11 02:37:46 - ERROR - stderr - +2025-05-11 02:37:46 - INFO - stdout - {'loss': 0.7692, 'grad_norm': 0.6482488512992859, 'learning_rate': 1.3215077957033032e-05, 'epoch': 1.24} +2025-05-11 02:37:46 - ERROR - stderr - 41%|████▏ | 1549/3741 [9:11:53<12:25:51, 20.42s/it] +2025-05-11 02:38:06 - ERROR - stderr - 41%|████▏ | 1550/3741 [9:12:12<12:14:18, 20.11s/it] +2025-05-11 02:38:06 - ERROR - stderr - +2025-05-11 02:38:06 - ERROR - stderr - +2025-05-11 02:38:06 - INFO - stdout - {'loss': 0.7739, 'grad_norm': 0.6388265490531921, 'learning_rate': 1.3206877205095133e-05, 'epoch': 1.24} +2025-05-11 02:38:06 - ERROR - stderr - 41%|████▏ | 1550/3741 [9:12:12<12:14:18, 20.11s/it] +2025-05-11 02:38:29 - ERROR - stderr - 41%|████▏ | 1551/3741 [9:12:35<12:44:45, 20.95s/it] +2025-05-11 02:38:29 - ERROR - stderr - +2025-05-11 02:38:29 - ERROR - stderr - +2025-05-11 02:38:29 - INFO - stdout - {'loss': 0.8046, 'grad_norm': 0.7499321699142456, 'learning_rate': 1.3198674048531488e-05, 'epoch': 1.24} +2025-05-11 02:38:29 - ERROR - stderr - 41%|████▏ | 1551/3741 [9:12:35<12:44:45, 20.95s/it] +2025-05-11 02:38:48 - ERROR - stderr - 41%|████▏ | 1552/3741 [9:12:55<12:29:22, 20.54s/it] +2025-05-11 02:38:48 - ERROR - stderr - +2025-05-11 02:38:48 - ERROR - stderr - +2025-05-11 02:38:48 - INFO - stdout - {'loss': 0.7477, 'grad_norm': 0.6427833437919617, 'learning_rate': 1.3190468493493107e-05, 'epoch': 1.24} +2025-05-11 02:38:48 - ERROR - stderr - 41%|████▏ | 1552/3741 [9:12:55<12:29:22, 20.54s/it] +2025-05-11 02:39:12 - ERROR - stderr - 42%|████▏ | 1553/3741 [9:13:19<13:08:25, 21.62s/it] +2025-05-11 02:39:12 - ERROR - stderr - +2025-05-11 02:39:12 - ERROR - stderr - +2025-05-11 02:39:12 - INFO - stdout - {'loss': 0.7773, 'grad_norm': 0.6992602944374084, 'learning_rate': 1.3182260546132795e-05, 'epoch': 1.25} +2025-05-11 02:39:12 - ERROR - stderr - 42%|████▏ | 1553/3741 [9:13:19<13:08:25, 21.62s/it] +2025-05-11 02:39:32 - ERROR - stderr - 42%|████▏ | 1554/3741 [9:13:39<12:48:19, 21.08s/it] +2025-05-11 02:39:32 - ERROR - stderr - +2025-05-11 02:39:32 - ERROR - stderr - +2025-05-11 02:39:32 - INFO - stdout - {'loss': 0.7649, 'grad_norm': 0.6634381413459778, 'learning_rate': 1.3174050212605147e-05, 'epoch': 1.25} +2025-05-11 02:39:32 - ERROR - stderr - 42%|████▏ | 1554/3741 [9:13:39<12:48:19, 21.08s/it] +2025-05-11 02:39:52 - ERROR - stderr - 42%|████▏ | 1555/3741 [9:13:59<12:35:05, 20.73s/it] +2025-05-11 02:39:52 - ERROR - stderr - +2025-05-11 02:39:52 - ERROR - stderr - +2025-05-11 02:39:52 - INFO - stdout - {'loss': 0.7866, 'grad_norm': 0.6472319960594177, 'learning_rate': 1.316583749906656e-05, 'epoch': 1.25} +2025-05-11 02:39:52 - ERROR - stderr - 42%|████▏ | 1555/3741 [9:13:59<12:35:05, 20.73s/it] +2025-05-11 02:40:12 - ERROR - stderr - 42%|████▏ | 1556/3741 [9:14:18<12:24:07, 20.43s/it] +2025-05-11 02:40:12 - ERROR - stderr - +2025-05-11 02:40:12 - ERROR - stderr - +2025-05-11 02:40:12 - INFO - stdout - {'loss': 0.7582, 'grad_norm': 0.6691563725471497, 'learning_rate': 1.3157622411675195e-05, 'epoch': 1.25} +2025-05-11 02:40:12 - ERROR - stderr - 42%|████▏ | 1556/3741 [9:14:18<12:24:07, 20.43s/it] +2025-05-11 02:40:32 - ERROR - stderr - 42%|████▏ | 1557/3741 [9:14:38<12:18:06, 20.28s/it] +2025-05-11 02:40:32 - ERROR - stderr - +2025-05-11 02:40:32 - ERROR - stderr - +2025-05-11 02:40:32 - INFO - stdout - {'loss': 0.7923, 'grad_norm': 0.7087457776069641, 'learning_rate': 1.3149404956591008e-05, 'epoch': 1.25} +2025-05-11 02:40:32 - ERROR - stderr - 42%|████▏ | 1557/3741 [9:14:38<12:18:06, 20.28s/it] +2025-05-11 02:40:55 - ERROR - stderr - 42%|████▏ | 1558/3741 [9:15:01<12:50:58, 21.19s/it] +2025-05-11 02:40:55 - ERROR - stderr - +2025-05-11 02:40:55 - ERROR - stderr - +2025-05-11 02:40:55 - INFO - stdout - {'loss': 0.7327, 'grad_norm': 0.6310862898826599, 'learning_rate': 1.3141185139975728e-05, 'epoch': 1.25} +2025-05-11 02:40:55 - ERROR - stderr - 42%|████▏ | 1558/3741 [9:15:02<12:50:58, 21.19s/it] +2025-05-11 02:41:15 - ERROR - stderr - 42%|████▏ | 1559/3741 [9:15:22<12:37:56, 20.84s/it] +2025-05-11 02:41:15 - ERROR - stderr - +2025-05-11 02:41:15 - ERROR - stderr - +2025-05-11 02:41:15 - INFO - stdout - {'loss': 0.8019, 'grad_norm': 0.6904287338256836, 'learning_rate': 1.3132962967992854e-05, 'epoch': 1.25} +2025-05-11 02:41:15 - ERROR - stderr - 42%|████▏ | 1559/3741 [9:15:22<12:37:56, 20.84s/it] +2025-05-11 02:41:39 - ERROR - stderr - 42%|████▏ | 1560/3741 [9:15:46<13:13:42, 21.83s/it] +2025-05-11 02:41:39 - ERROR - stderr - +2025-05-11 02:41:39 - ERROR - stderr - +2025-05-11 02:41:39 - INFO - stdout - {'loss': 0.7593, 'grad_norm': 0.6679447889328003, 'learning_rate': 1.3124738446807652e-05, 'epoch': 1.25} +2025-05-11 02:41:39 - ERROR - stderr - 42%|████▏ | 1560/3741 [9:15:46<13:13:42, 21.83s/it] +2025-05-11 02:41:59 - ERROR - stderr - 42%|████▏ | 1561/3741 [9:16:05<12:49:52, 21.19s/it] +2025-05-11 02:41:59 - ERROR - stderr - +2025-05-11 02:41:59 - ERROR - stderr - +2025-05-11 02:41:59 - INFO - stdout - {'loss': 0.7633, 'grad_norm': 0.6390910744667053, 'learning_rate': 1.3116511582587144e-05, 'epoch': 1.25} +2025-05-11 02:41:59 - ERROR - stderr - 42%|████▏ | 1561/3741 [9:16:05<12:49:52, 21.19s/it] +2025-05-11 02:42:21 - ERROR - stderr - 42%|████▏ | 1562/3741 [9:16:27<12:59:30, 21.46s/it] +2025-05-11 02:42:21 - ERROR - stderr - +2025-05-11 02:42:21 - ERROR - stderr - +2025-05-11 02:42:21 - INFO - stdout - {'loss': 0.7773, 'grad_norm': 0.7109375, 'learning_rate': 1.3108282381500113e-05, 'epoch': 1.25} +2025-05-11 02:42:21 - ERROR - stderr - 42%|████▏ | 1562/3741 [9:16:27<12:59:30, 21.46s/it] +2025-05-11 02:42:41 - ERROR - stderr - 42%|████▏ | 1563/3741 [9:16:47<12:40:43, 20.96s/it] +2025-05-11 02:42:41 - ERROR - stderr - +2025-05-11 02:42:41 - ERROR - stderr - +2025-05-11 02:42:41 - INFO - stdout - {'loss': 0.7661, 'grad_norm': 0.7146701812744141, 'learning_rate': 1.3100050849717102e-05, 'epoch': 1.25} +2025-05-11 02:42:41 - ERROR - stderr - 42%|████▏ | 1563/3741 [9:16:47<12:40:43, 20.96s/it] +2025-05-11 02:43:01 - ERROR - stderr - 42%|████▏ | 1564/3741 [9:17:07<12:25:45, 20.55s/it] +2025-05-11 02:43:01 - ERROR - stderr - +2025-05-11 02:43:01 - ERROR - stderr - +2025-05-11 02:43:01 - INFO - stdout - {'loss': 0.7478, 'grad_norm': 0.6369656920433044, 'learning_rate': 1.309181699341038e-05, 'epoch': 1.25} +2025-05-11 02:43:01 - ERROR - stderr - 42%|████▏ | 1564/3741 [9:17:07<12:25:45, 20.55s/it] +2025-05-11 02:43:22 - ERROR - stderr - 42%|████▏ | 1565/3741 [9:17:29<12:38:14, 20.91s/it] +2025-05-11 02:43:22 - ERROR - stderr - +2025-05-11 02:43:22 - ERROR - stderr - +2025-05-11 02:43:22 - INFO - stdout - {'loss': 0.7648, 'grad_norm': 0.6547572016716003, 'learning_rate': 1.3083580818753985e-05, 'epoch': 1.26} +2025-05-11 02:43:22 - ERROR - stderr - 42%|████▏ | 1565/3741 [9:17:29<12:38:14, 20.91s/it] +2025-05-11 02:43:42 - ERROR - stderr - 42%|████▏ | 1566/3741 [9:17:48<12:24:57, 20.55s/it] +2025-05-11 02:43:42 - ERROR - stderr - +2025-05-11 02:43:42 - ERROR - stderr - +2025-05-11 02:43:42 - INFO - stdout - {'loss': 0.7727, 'grad_norm': 0.6653978228569031, 'learning_rate': 1.3075342331923675e-05, 'epoch': 1.26} +2025-05-11 02:43:42 - ERROR - stderr - 42%|████▏ | 1566/3741 [9:17:48<12:24:57, 20.55s/it] +2025-05-11 02:44:05 - ERROR - stderr - 42%|████▏ | 1567/3741 [9:18:12<12:55:24, 21.40s/it] +2025-05-11 02:44:05 - ERROR - stderr - +2025-05-11 02:44:05 - ERROR - stderr - +2025-05-11 02:44:05 - INFO - stdout - {'loss': 0.7463, 'grad_norm': 0.6517826914787292, 'learning_rate': 1.3067101539096952e-05, 'epoch': 1.26} +2025-05-11 02:44:05 - ERROR - stderr - 42%|████▏ | 1567/3741 [9:18:12<12:55:24, 21.40s/it] +2025-05-11 02:44:25 - ERROR - stderr - 42%|��███▏ | 1568/3741 [9:18:31<12:34:02, 20.82s/it] +2025-05-11 02:44:25 - ERROR - stderr - +2025-05-11 02:44:25 - ERROR - stderr - +2025-05-11 02:44:25 - INFO - stdout - {'loss': 0.7474, 'grad_norm': 0.673172116279602, 'learning_rate': 1.305885844645304e-05, 'epoch': 1.26} +2025-05-11 02:44:25 - ERROR - stderr - 42%|████▏ | 1568/3741 [9:18:31<12:34:02, 20.82s/it] +2025-05-11 02:44:48 - ERROR - stderr - 42%|████▏ | 1569/3741 [9:18:54<12:58:36, 21.51s/it] +2025-05-11 02:44:48 - ERROR - stderr - +2025-05-11 02:44:48 - ERROR - stderr - +2025-05-11 02:44:48 - INFO - stdout - {'loss': 0.7819, 'grad_norm': 0.6843745112419128, 'learning_rate': 1.3050613060172893e-05, 'epoch': 1.26} +2025-05-11 02:44:48 - ERROR - stderr - 42%|████▏ | 1569/3741 [9:18:54<12:58:36, 21.51s/it] +2025-05-11 02:45:07 - ERROR - stderr - 42%|████▏ | 1570/3741 [9:19:14<12:36:24, 20.91s/it] +2025-05-11 02:45:07 - ERROR - stderr - +2025-05-11 02:45:07 - ERROR - stderr - +2025-05-11 02:45:07 - INFO - stdout - {'loss': 0.7617, 'grad_norm': 0.6570084095001221, 'learning_rate': 1.304236538643918e-05, 'epoch': 1.26} +2025-05-11 02:45:07 - ERROR - stderr - 42%|████▏ | 1570/3741 [9:19:14<12:36:24, 20.91s/it] +2025-05-11 02:45:28 - ERROR - stderr - 42%|████▏ | 1571/3741 [9:19:34<12:31:04, 20.77s/it] +2025-05-11 02:45:28 - ERROR - stderr - +2025-05-11 02:45:28 - ERROR - stderr - +2025-05-11 02:45:28 - INFO - stdout - {'loss': 0.7473, 'grad_norm': 0.6739295125007629, 'learning_rate': 1.3034115431436286e-05, 'epoch': 1.26} +2025-05-11 02:45:28 - ERROR - stderr - 42%|████▏ | 1571/3741 [9:19:34<12:31:04, 20.77s/it] +2025-05-11 02:45:47 - ERROR - stderr - 42%|████▏ | 1572/3741 [9:19:54<12:16:19, 20.37s/it] +2025-05-11 02:45:47 - ERROR - stderr - +2025-05-11 02:45:47 - ERROR - stderr - +2025-05-11 02:45:47 - INFO - stdout - {'loss': 0.7476, 'grad_norm': 0.6281715035438538, 'learning_rate': 1.3025863201350315e-05, 'epoch': 1.26} +2025-05-11 02:45:47 - ERROR - stderr - 42%|████▏ | 1572/3741 [9:19:54<12:16:19, 20.37s/it] +2025-05-11 02:46:07 - ERROR - stderr - 42%|████▏ | 1573/3741 [9:20:13<12:05:07, 20.07s/it] +2025-05-11 02:46:07 - ERROR - stderr - +2025-05-11 02:46:07 - ERROR - stderr - +2025-05-11 02:46:07 - INFO - stdout - {'loss': 0.7579, 'grad_norm': 0.6386378407478333, 'learning_rate': 1.3017608702369065e-05, 'epoch': 1.26} +2025-05-11 02:46:07 - ERROR - stderr - 42%|████▏ | 1573/3741 [9:20:13<12:05:07, 20.07s/it] +2025-05-11 02:46:30 - ERROR - stderr - 42%|████▏ | 1574/3741 [9:20:36<12:36:04, 20.93s/it] +2025-05-11 02:46:30 - ERROR - stderr - +2025-05-11 02:46:30 - ERROR - stderr - +2025-05-11 02:46:30 - INFO - stdout - {'loss': 0.7787, 'grad_norm': 0.6800333857536316, 'learning_rate': 1.300935194068204e-05, 'epoch': 1.26} +2025-05-11 02:46:30 - ERROR - stderr - 42%|████▏ | 1574/3741 [9:20:36<12:36:04, 20.93s/it] +2025-05-11 02:46:49 - ERROR - stderr - 42%|████▏ | 1575/3741 [9:20:56<12:22:56, 20.58s/it] +2025-05-11 02:46:49 - ERROR - stderr - +2025-05-11 02:46:49 - ERROR - stderr - +2025-05-11 02:46:49 - INFO - stdout - {'loss': 0.7985, 'grad_norm': 0.6754662394523621, 'learning_rate': 1.3001092922480445e-05, 'epoch': 1.26} +2025-05-11 02:46:49 - ERROR - stderr - 42%|████▏ | 1575/3741 [9:20:56<12:22:56, 20.58s/it] +2025-05-11 02:47:12 - ERROR - stderr - 42%|████▏ | 1576/3741 [9:21:19<12:47:56, 21.28s/it] +2025-05-11 02:47:12 - ERROR - stderr - +2025-05-11 02:47:12 - ERROR - stderr - +2025-05-11 02:47:12 - INFO - stdout - {'loss': 0.7681, 'grad_norm': 0.666191041469574, 'learning_rate': 1.2992831653957173e-05, 'epoch': 1.26} +2025-05-11 02:47:12 - ERROR - stderr - 42%|████▏ | 1576/3741 [9:21:19<12:47:56, 21.28s/it] +2025-05-11 02:47:32 - ERROR - stderr - 42%|████▏ | 1577/3741 [9:21:39<12:32:51, 20.87s/it] +2025-05-11 02:47:32 - ERROR - stderr - +2025-05-11 02:47:32 - ERROR - stderr - +2025-05-11 02:47:32 - INFO - stdout - {'loss': 0.7185, 'grad_norm': 0.636799693107605, 'learning_rate': 1.2984568141306797e-05, 'epoch': 1.26} +2025-05-11 02:47:32 - ERROR - stderr - 42%|████▏ | 1577/3741 [9:21:39<12:32:51, 20.87s/it] +2025-05-11 02:47:53 - ERROR - stderr - 42%|████▏ | 1578/3741 [9:21:59<12:26:02, 20.69s/it] +2025-05-11 02:47:53 - ERROR - stderr - +2025-05-11 02:47:53 - ERROR - stderr - +2025-05-11 02:47:53 - INFO - stdout - {'loss': 0.8076, 'grad_norm': 0.6819969415664673, 'learning_rate': 1.2976302390725586e-05, 'epoch': 1.27} +2025-05-11 02:47:53 - ERROR - stderr - 42%|████▏ | 1578/3741 [9:21:59<12:26:02, 20.69s/it] +2025-05-11 02:48:12 - ERROR - stderr - 42%|████▏ | 1579/3741 [9:22:19<12:16:32, 20.44s/it] +2025-05-11 02:48:12 - ERROR - stderr - +2025-05-11 02:48:12 - ERROR - stderr - +2025-05-11 02:48:12 - INFO - stdout - {'loss': 0.77, 'grad_norm': 0.658902645111084, 'learning_rate': 1.296803440841148e-05, 'epoch': 1.27} +2025-05-11 02:48:12 - ERROR - stderr - 42%|████▏ | 1579/3741 [9:22:19<12:16:32, 20.44s/it] +2025-05-11 02:48:32 - ERROR - stderr - 42%|████▏ | 1580/3741 [9:22:39<12:10:19, 20.28s/it] +2025-05-11 02:48:32 - ERROR - stderr - +2025-05-11 02:48:32 - ERROR - stderr - +2025-05-11 02:48:32 - INFO - stdout - {'loss': 0.7159, 'grad_norm': 0.6477823853492737, 'learning_rate': 1.29597642005641e-05, 'epoch': 1.27} +2025-05-11 02:48:32 - ERROR - stderr - 42%|████▏ | 1580/3741 [9:22:39<12:10:19, 20.28s/it] +2025-05-11 02:48:55 - ERROR - stderr - 42%|████▏ | 1581/3741 [9:23:01<12:33:30, 20.93s/it] +2025-05-11 02:48:55 - ERROR - stderr - +2025-05-11 02:48:55 - ERROR - stderr - +2025-05-11 02:48:55 - INFO - stdout - {'loss': 0.7573, 'grad_norm': 0.6573794484138489, 'learning_rate': 1.2951491773384722e-05, 'epoch': 1.27} +2025-05-11 02:48:55 - ERROR - stderr - 42%|████▏ | 1581/3741 [9:23:01<12:33:30, 20.93s/it] +2025-05-11 02:49:15 - ERROR - stderr - 42%|████▏ | 1582/3741 [9:23:21<12:23:58, 20.68s/it] +2025-05-11 02:49:15 - ERROR - stderr - +2025-05-11 02:49:15 - ERROR - stderr - +2025-05-11 02:49:15 - INFO - stdout - {'loss': 0.7136, 'grad_norm': 0.6093468070030212, 'learning_rate': 1.2943217133076294e-05, 'epoch': 1.27} +2025-05-11 02:49:15 - ERROR - stderr - 42%|████▏ | 1582/3741 [9:23:21<12:23:58, 20.68s/it] +2025-05-11 02:49:39 - ERROR - stderr - 42%|████▏ | 1583/3741 [9:23:45<12:59:53, 21.68s/it] +2025-05-11 02:49:39 - ERROR - stderr - +2025-05-11 02:49:39 - ERROR - stderr - +2025-05-11 02:49:39 - INFO - stdout - {'loss': 0.7904, 'grad_norm': 0.6893898844718933, 'learning_rate': 1.2934940285843425e-05, 'epoch': 1.27} +2025-05-11 02:49:39 - ERROR - stderr - 42%|████▏ | 1583/3741 [9:23:45<12:59:53, 21.68s/it] +2025-05-11 02:49:58 - ERROR - stderr - 42%|████▏ | 1584/3741 [9:24:05<12:35:53, 21.03s/it] +2025-05-11 02:49:58 - ERROR - stderr - +2025-05-11 02:49:58 - ERROR - stderr - +2025-05-11 02:49:58 - INFO - stdout - {'loss': 0.7664, 'grad_norm': 0.6822935938835144, 'learning_rate': 1.2926661237892377e-05, 'epoch': 1.27} +2025-05-11 02:49:58 - ERROR - stderr - 42%|████▏ | 1584/3741 [9:24:05<12:35:53, 21.03s/it] +2025-05-11 02:50:21 - ERROR - stderr - 42%|████▏ | 1585/3741 [9:24:28<12:58:06, 21.65s/it] +2025-05-11 02:50:21 - ERROR - stderr - +2025-05-11 02:50:21 - ERROR - stderr - +2025-05-11 02:50:21 - INFO - stdout - {'loss': 0.7504, 'grad_norm': 0.6702585220336914, 'learning_rate': 1.2918379995431062e-05, 'epoch': 1.27} +2025-05-11 02:50:21 - ERROR - stderr - 42%|████▏ | 1585/3741 [9:24:28<12:58:06, 21.65s/it] +2025-05-11 02:50:41 - ERROR - stderr - 42%|████▏ | 1586/3741 [9:24:47<12:36:28, 21.06s/it] +2025-05-11 02:50:41 - ERROR - stderr - +2025-05-11 02:50:41 - ERROR - stderr - +2025-05-11 02:50:41 - INFO - stdout - {'loss': 0.7665, 'grad_norm': 0.7012314200401306, 'learning_rate': 1.2910096564669037e-05, 'epoch': 1.27} +2025-05-11 02:50:41 - ERROR - stderr - 42%|████▏ | 1586/3741 [9:24:47<12:36:28, 21.06s/it] +2025-05-11 02:51:01 - ERROR - stderr - 42%|████▏ | 1587/3741 [9:25:07<12:21:23, 20.65s/it] +2025-05-11 02:51:01 - ERROR - stderr - +2025-05-11 02:51:01 - ERROR - stderr - +2025-05-11 02:51:01 - INFO - stdout - {'loss': 0.7426, 'grad_norm': 0.6500260233879089, 'learning_rate': 1.2901810951817499e-05, 'epoch': 1.27} +2025-05-11 02:51:01 - ERROR - stderr - 42%|████▏ | 1587/3741 [9:25:07<12:21:23, 20.65s/it] +2025-05-11 02:51:21 - ERROR - stderr - 42%|████▏ | 1588/3741 [9:25:28<12:19:51, 20.62s/it] +2025-05-11 02:51:21 - ERROR - stderr - +2025-05-11 02:51:21 - ERROR - stderr - +2025-05-11 02:51:21 - INFO - stdout - {'loss': 0.7817, 'grad_norm': 0.6530648469924927, 'learning_rate': 1.2893523163089285e-05, 'epoch': 1.27} +2025-05-11 02:51:21 - ERROR - stderr - 42%|████▏ | 1588/3741 [9:25:28<12:19:51, 20.62s/it] +2025-05-11 02:51:41 - ERROR - stderr - 42%|████▏ | 1589/3741 [9:25:47<12:07:13, 20.28s/it] +2025-05-11 02:51:41 - ERROR - stderr - +2025-05-11 02:51:41 - ERROR - stderr - +2025-05-11 02:51:41 - INFO - stdout - {'loss': 0.75, 'grad_norm': 0.6491802334785461, 'learning_rate': 1.2885233204698866e-05, 'epoch': 1.27} +2025-05-11 02:51:41 - ERROR - stderr - 42%|████▏ | 1589/3741 [9:25:47<12:07:13, 20.28s/it] +2025-05-11 02:52:04 - ERROR - stderr - 43%|████▎ | 1590/3741 [9:26:10<12:38:59, 21.17s/it] +2025-05-11 02:52:04 - ERROR - stderr - +2025-05-11 02:52:04 - ERROR - stderr - +2025-05-11 02:52:04 - INFO - stdout - {'loss': 0.7574, 'grad_norm': 0.6376941204071045, 'learning_rate': 1.2876941082862324e-05, 'epoch': 1.28} +2025-05-11 02:52:04 - ERROR - stderr - 43%|████▎ | 1590/3741 [9:26:10<12:38:59, 21.17s/it] +2025-05-11 02:52:24 - ERROR - stderr - 43%|████▎ | 1591/3741 [9:26:30<12:22:58, 20.73s/it] +2025-05-11 02:52:24 - ERROR - stderr - +2025-05-11 02:52:24 - ERROR - stderr - +2025-05-11 02:52:24 - INFO - stdout - {'loss': 0.7617, 'grad_norm': 0.6957758665084839, 'learning_rate': 1.2868646803797384e-05, 'epoch': 1.28} +2025-05-11 02:52:24 - ERROR - stderr - 43%|████▎ | 1591/3741 [9:26:30<12:22:58, 20.73s/it] +2025-05-11 02:52:47 - ERROR - stderr - 43%|████▎ | 1592/3741 [9:26:53<12:50:06, 21.50s/it] +2025-05-11 02:52:47 - ERROR - stderr - +2025-05-11 02:52:47 - ERROR - stderr - +2025-05-11 02:52:47 - INFO - stdout - {'loss': 0.7365, 'grad_norm': 0.6394297480583191, 'learning_rate': 1.2860350373723374e-05, 'epoch': 1.28} +2025-05-11 02:52:47 - ERROR - stderr - 43%|████▎ | 1592/3741 [9:26:53<12:50:06, 21.50s/it] +2025-05-11 02:53:07 - ERROR - stderr - 43%|████▎ | 1593/3741 [9:27:13<12:28:31, 20.91s/it] +2025-05-11 02:53:07 - ERROR - stderr - +2025-05-11 02:53:07 - ERROR - stderr - +2025-05-11 02:53:07 - INFO - stdout - {'loss': 0.7461, 'grad_norm': 0.6853082180023193, 'learning_rate': 1.2852051798861243e-05, 'epoch': 1.28} +2025-05-11 02:53:07 - ERROR - stderr - 43%|████▎ | 1593/3741 [9:27:13<12:28:31, 20.91s/it] +2025-05-11 02:53:26 - ERROR - stderr - 43%|████▎ | 1594/3741 [9:27:32<12:12:35, 20.47s/it] +2025-05-11 02:53:26 - ERROR - stderr - +2025-05-11 02:53:26 - ERROR - stderr - +2025-05-11 02:53:26 - INFO - stdout - {'loss': 0.7031, 'grad_norm': 0.6423301100730896, 'learning_rate': 1.2843751085433539e-05, 'epoch': 1.28} +2025-05-11 02:53:26 - ERROR - stderr - 43%|████▎ | 1594/3741 [9:27:32<12:12:35, 20.47s/it] +2025-05-11 02:53:47 - ERROR - stderr - 43%|████▎ | 1595/3741 [9:27:53<12:16:46, 20.60s/it] +2025-05-11 02:53:47 - ERROR - stderr - +2025-05-11 02:53:47 - ERROR - stderr - +2025-05-11 02:53:47 - INFO - stdout - {'loss': 0.7431, 'grad_norm': 0.6504702568054199, 'learning_rate': 1.2835448239664425e-05, 'epoch': 1.28} +2025-05-11 02:53:47 - ERROR - stderr - 43%|████▎ | 1595/3741 [9:27:53<12:16:46, 20.60s/it] +2025-05-11 02:54:07 - ERROR - stderr - 43%|████▎ | 1596/3741 [9:28:13<12:05:31, 20.29s/it] +2025-05-11 02:54:07 - ERROR - stderr - +2025-05-11 02:54:07 - ERROR - stderr - +2025-05-11 02:54:07 - INFO - stdout - {'loss': 0.7593, 'grad_norm': 0.6869426369667053, 'learning_rate': 1.2827143267779658e-05, 'epoch': 1.28} +2025-05-11 02:54:07 - ERROR - stderr - 43%|████▎ | 1596/3741 [9:28:13<12:05:31, 20.29s/it] +2025-05-11 02:54:30 - ERROR - stderr - 43%|████▎ | 1597/3741 [9:28:36<12:36:09, 21.16s/it] +2025-05-11 02:54:30 - ERROR - stderr - +2025-05-11 02:54:30 - ERROR - stderr - +2025-05-11 02:54:30 - INFO - stdout - {'loss': 0.7978, 'grad_norm': 0.708145022392273, 'learning_rate': 1.2818836176006586e-05, 'epoch': 1.28} +2025-05-11 02:54:30 - ERROR - stderr - 43%|████▎ | 1597/3741 [9:28:36<12:36:09, 21.16s/it] +2025-05-11 02:54:50 - ERROR - stderr - 43%|████▎ | 1598/3741 [9:28:56<12:22:33, 20.79s/it] +2025-05-11 02:54:50 - ERROR - stderr - +2025-05-11 02:54:50 - ERROR - stderr - +2025-05-11 02:54:50 - INFO - stdout - {'loss': 0.7948, 'grad_norm': 0.6440792679786682, 'learning_rate': 1.2810526970574151e-05, 'epoch': 1.28} +2025-05-11 02:54:50 - ERROR - stderr - 43%|████▎ | 1598/3741 [9:28:56<12:22:33, 20.79s/it] +2025-05-11 02:55:13 - ERROR - stderr - 43%|████▎ | 1599/3741 [9:29:19<12:50:30, 21.58s/it] +2025-05-11 02:55:13 - ERROR - stderr - +2025-05-11 02:55:13 - ERROR - stderr - +2025-05-11 02:55:13 - INFO - stdout - {'loss': 0.7789, 'grad_norm': 0.6792011260986328, 'learning_rate': 1.2802215657712876e-05, 'epoch': 1.28} +2025-05-11 02:55:13 - ERROR - stderr - 43%|████▎ | 1599/3741 [9:29:19<12:50:30, 21.58s/it] +2025-05-11 02:55:33 - ERROR - stderr - 43%|████▎ | 1600/3741 [9:29:39<12:27:51, 20.96s/it] +2025-05-11 02:55:33 - ERROR - stderr - +2025-05-11 02:55:33 - ERROR - stderr - +2025-05-11 02:55:33 - INFO - stdout - {'loss': 0.7309, 'grad_norm': 0.6505358815193176, 'learning_rate': 1.2793902243654868e-05, 'epoch': 1.28} +2025-05-11 02:55:33 - ERROR - stderr - 43%|████▎ | 1600/3741 [9:29:39<12:27:51, 20.96s/it] +2025-05-11 02:55:53 - ERROR - stderr - 43%|████▎ | 1601/3741 [9:29:59<12:16:51, 20.66s/it] +2025-05-11 02:55:53 - ERROR - stderr - +2025-05-11 02:55:53 - ERROR - stderr - +2025-05-11 02:55:53 - INFO - stdout - {'loss': 0.7607, 'grad_norm': 0.6658748388290405, 'learning_rate': 1.278558673463381e-05, 'epoch': 1.28} +2025-05-11 02:55:53 - ERROR - stderr - 43%|████▎ | 1601/3741 [9:29:59<12:16:51, 20.66s/it] +2025-05-11 02:56:12 - ERROR - stderr - 43%|████▎ | 1602/3741 [9:30:18<12:04:51, 20.33s/it] +2025-05-11 02:56:12 - ERROR - stderr - +2025-05-11 02:56:12 - ERROR - stderr - +2025-05-11 02:56:12 - INFO - stdout - {'loss': 0.758, 'grad_norm': 0.6865249276161194, 'learning_rate': 1.2777269136884952e-05, 'epoch': 1.28} +2025-05-11 02:56:12 - ERROR - stderr - 43%|████▎ | 1602/3741 [9:30:18<12:04:51, 20.33s/it] +2025-05-11 02:56:32 - ERROR - stderr - 43%|████▎ | 1603/3741 [9:30:38<11:55:04, 20.07s/it] +2025-05-11 02:56:32 - ERROR - stderr - +2025-05-11 02:56:32 - ERROR - stderr - +2025-05-11 02:56:32 - INFO - stdout - {'loss': 0.7318, 'grad_norm': 0.6584889888763428, 'learning_rate': 1.2768949456645108e-05, 'epoch': 1.29} +2025-05-11 02:56:32 - ERROR - stderr - 43%|████▎ | 1603/3741 [9:30:38<11:55:04, 20.07s/it] +2025-05-11 02:56:54 - ERROR - stderr - 43%|████▎ | 1604/3741 [9:31:01<12:22:51, 20.86s/it] +2025-05-11 02:56:54 - ERROR - stderr - +2025-05-11 02:56:54 - ERROR - stderr - +2025-05-11 02:56:54 - INFO - stdout - {'loss': 0.7606, 'grad_norm': 0.6798214912414551, 'learning_rate': 1.2760627700152664e-05, 'epoch': 1.29} +2025-05-11 02:56:54 - ERROR - stderr - 43%|████▎ | 1604/3741 [9:31:01<12:22:51, 20.86s/it] +2025-05-11 02:57:14 - ERROR - stderr - 43%|████▎ | 1605/3741 [9:31:20<12:09:23, 20.49s/it] +2025-05-11 02:57:14 - ERROR - stderr - +2025-05-11 02:57:14 - ERROR - stderr - +2025-05-11 02:57:14 - INFO - stdout - {'loss': 0.8089, 'grad_norm': 0.6833958625793457, 'learning_rate': 1.275230387364755e-05, 'epoch': 1.29} +2025-05-11 02:57:14 - ERROR - stderr - 43%|████▎ | 1605/3741 [9:31:20<12:09:23, 20.49s/it] +2025-05-11 02:57:38 - ERROR - stderr - 43%|████▎ | 1606/3741 [9:31:44<12:46:26, 21.54s/it] +2025-05-11 02:57:38 - ERROR - stderr - +2025-05-11 02:57:38 - ERROR - stderr - +2025-05-11 02:57:38 - INFO - stdout - {'loss': 0.7443, 'grad_norm': 0.6360597014427185, 'learning_rate': 1.2743977983371268e-05, 'epoch': 1.29} +2025-05-11 02:57:38 - ERROR - stderr - 43%|████▎ | 1606/3741 [9:31:44<12:46:26, 21.54s/it] +2025-05-11 02:57:57 - ERROR - stderr - 43%|████▎ | 1607/3741 [9:32:04<12:23:32, 20.91s/it] +2025-05-11 02:57:57 - ERROR - stderr - +2025-05-11 02:57:57 - ERROR - stderr - +2025-05-11 02:57:57 - INFO - stdout - {'loss': 0.7916, 'grad_norm': 0.7116873860359192, 'learning_rate': 1.2735650035566836e-05, 'epoch': 1.29} +2025-05-11 02:57:57 - ERROR - stderr - 43%|████▎ | 1607/3741 [9:32:04<12:23:32, 20.91s/it] +2025-05-11 02:58:18 - ERROR - stderr - 43%|████▎ | 1608/3741 [9:32:24<12:15:40, 20.69s/it] +2025-05-11 02:58:18 - ERROR - stderr - +2025-05-11 02:58:18 - ERROR - stderr - +2025-05-11 02:58:18 - INFO - stdout - {'loss': 0.7767, 'grad_norm': 0.6559532880783081, 'learning_rate': 1.2727320036478843e-05, 'epoch': 1.29} +2025-05-11 02:58:18 - ERROR - stderr - 43%|████▎ | 1608/3741 [9:32:24<12:15:40, 20.69s/it] +2025-05-11 02:58:38 - ERROR - stderr - 43%|████▎ | 1609/3741 [9:32:44<12:07:36, 20.48s/it] +2025-05-11 02:58:38 - ERROR - stderr - +2025-05-11 02:58:38 - ERROR - stderr - +2025-05-11 02:58:38 - INFO - stdout - {'loss': 0.7425, 'grad_norm': 0.6576822400093079, 'learning_rate': 1.2718987992353403e-05, 'epoch': 1.29} +2025-05-11 02:58:38 - ERROR - stderr - 43%|████▎ | 1609/3741 [9:32:44<12:07:36, 20.48s/it] +2025-05-11 02:58:57 - ERROR - stderr - 43%|████▎ | 1610/3741 [9:33:03<11:55:11, 20.14s/it] +2025-05-11 02:58:57 - ERROR - stderr - +2025-05-11 02:58:57 - ERROR - stderr - +2025-05-11 02:58:57 - INFO - stdout - {'loss': 0.7506, 'grad_norm': 0.6480294466018677, 'learning_rate': 1.2710653909438172e-05, 'epoch': 1.29} +2025-05-11 02:58:57 - ERROR - stderr - 43%|████▎ | 1610/3741 [9:33:03<11:55:11, 20.14s/it] +2025-05-11 02:59:19 - ERROR - stderr - 43%|████▎ | 1611/3741 [9:33:25<12:11:50, 20.62s/it] +2025-05-11 02:59:19 - ERROR - stderr - +2025-05-11 02:59:19 - ERROR - stderr - +2025-05-11 02:59:19 - INFO - stdout - {'loss': 0.7592, 'grad_norm': 0.6556183099746704, 'learning_rate': 1.2702317793982327e-05, 'epoch': 1.29} +2025-05-11 02:59:19 - ERROR - stderr - 43%|████▎ | 1611/3741 [9:33:25<12:11:50, 20.62s/it] +2025-05-11 02:59:38 - ERROR - stderr - 43%|████▎ | 1612/3741 [9:33:44<12:00:26, 20.30s/it] +2025-05-11 02:59:38 - ERROR - stderr - +2025-05-11 02:59:38 - ERROR - stderr - +2025-05-11 02:59:38 - INFO - stdout - {'loss': 0.7769, 'grad_norm': 0.7214966416358948, 'learning_rate': 1.2693979652236564e-05, 'epoch': 1.29} +2025-05-11 02:59:38 - ERROR - stderr - 43%|████▎ | 1612/3741 [9:33:44<12:00:26, 20.30s/it] +2025-05-11 03:00:01 - ERROR - stderr - 43%|████▎ | 1613/3741 [9:34:07<12:26:39, 21.05s/it] +2025-05-11 03:00:01 - ERROR - stderr - +2025-05-11 03:00:01 - ERROR - stderr - +2025-05-11 03:00:01 - INFO - stdout - {'loss': 0.7976, 'grad_norm': 0.6807628273963928, 'learning_rate': 1.2685639490453113e-05, 'epoch': 1.29} +2025-05-11 03:00:01 - ERROR - stderr - 43%|████▎ | 1613/3741 [9:34:07<12:26:39, 21.05s/it] +2025-05-11 03:00:20 - ERROR - stderr - 43%|████▎ | 1614/3741 [9:34:27<12:08:19, 20.54s/it] +2025-05-11 03:00:20 - ERROR - stderr - +2025-05-11 03:00:20 - ERROR - stderr - +2025-05-11 03:00:20 - INFO - stdout - {'loss': 0.7919, 'grad_norm': 0.6581088900566101, 'learning_rate': 1.2677297314885708e-05, 'epoch': 1.29} +2025-05-11 03:00:20 - ERROR - stderr - 43%|████▎ | 1614/3741 [9:34:27<12:08:19, 20.54s/it] +2025-05-11 03:00:43 - ERROR - stderr - 43%|████▎ | 1615/3741 [9:34:50<12:35:49, 21.33s/it] +2025-05-11 03:00:43 - ERROR - stderr - +2025-05-11 03:00:43 - ERROR - stderr - +2025-05-11 03:00:43 - INFO - stdout - {'loss': 0.774, 'grad_norm': 0.6764626502990723, 'learning_rate': 1.2668953131789599e-05, 'epoch': 1.3} +2025-05-11 03:00:43 - ERROR - stderr - 43%|████▎ | 1615/3741 [9:34:50<12:35:49, 21.33s/it] +2025-05-11 03:01:03 - ERROR - stderr - 43%|████▎ | 1616/3741 [9:35:09<12:13:59, 20.72s/it] +2025-05-11 03:01:03 - ERROR - stderr - +2025-05-11 03:01:03 - ERROR - stderr - +2025-05-11 03:01:03 - INFO - stdout - {'loss': 0.7757, 'grad_norm': 0.696811854839325, 'learning_rate': 1.2660606947421537e-05, 'epoch': 1.3} +2025-05-11 03:01:03 - ERROR - stderr - 43%|████▎ | 1616/3741 [9:35:09<12:13:59, 20.72s/it] +2025-05-11 03:01:23 - ERROR - stderr - 43%|████▎ | 1617/3741 [9:35:29<12:04:23, 20.46s/it] +2025-05-11 03:01:23 - ERROR - stderr - +2025-05-11 03:01:23 - ERROR - stderr - +2025-05-11 03:01:23 - INFO - stdout - {'loss': 0.7861, 'grad_norm': 0.6721716523170471, 'learning_rate': 1.2652258768039775e-05, 'epoch': 1.3} +2025-05-11 03:01:23 - ERROR - stderr - 43%|████▎ | 1617/3741 [9:35:29<12:04:23, 20.46s/it] +2025-05-11 03:01:42 - ERROR - stderr - 43%|████▎ | 1618/3741 [9:35:48<11:50:59, 20.09s/it] +2025-05-11 03:01:42 - ERROR - stderr - +2025-05-11 03:01:42 - ERROR - stderr - +2025-05-11 03:01:42 - INFO - stdout - {'loss': 0.7621, 'grad_norm': 0.7006967663764954, 'learning_rate': 1.2643908599904063e-05, 'epoch': 1.3} +2025-05-11 03:01:42 - ERROR - stderr - 43%|████▎ | 1618/3741 [9:35:48<11:50:59, 20.09s/it] +2025-05-11 03:02:01 - ERROR - stderr - 43%|████▎ | 1619/3741 [9:36:08<11:44:03, 19.91s/it] +2025-05-11 03:02:01 - ERROR - stderr - +2025-05-11 03:02:01 - ERROR - stderr - +2025-05-11 03:02:01 - INFO - stdout - {'loss': 0.7318, 'grad_norm': 0.630478024482727, 'learning_rate': 1.2635556449275641e-05, 'epoch': 1.3} +2025-05-11 03:02:01 - ERROR - stderr - 43%|████▎ | 1619/3741 [9:36:08<11:44:03, 19.91s/it] +2025-05-11 03:02:23 - ERROR - stderr - 43%|████▎ | 1620/3741 [9:36:30<12:05:05, 20.51s/it] +2025-05-11 03:02:23 - ERROR - stderr - +2025-05-11 03:02:23 - ERROR - stderr - +2025-05-11 03:02:23 - INFO - stdout - {'loss': 0.8096, 'grad_norm': 0.6607829332351685, 'learning_rate': 1.2627202322417235e-05, 'epoch': 1.3} +2025-05-11 03:02:23 - ERROR - stderr - 43%|████▎ | 1620/3741 [9:36:30<12:05:05, 20.51s/it] +2025-05-11 03:02:43 - ERROR - stderr - 43%|████▎ | 1621/3741 [9:36:49<11:54:20, 20.22s/it] +2025-05-11 03:02:43 - ERROR - stderr - +2025-05-11 03:02:43 - ERROR - stderr - +2025-05-11 03:02:43 - INFO - stdout - {'loss': 0.7783, 'grad_norm': 0.6839569807052612, 'learning_rate': 1.2618846225593057e-05, 'epoch': 1.3} +2025-05-11 03:02:43 - ERROR - stderr - 43%|████▎ | 1621/3741 [9:36:49<11:54:20, 20.22s/it] +2025-05-11 03:03:06 - ERROR - stderr - 43%|████▎ | 1622/3741 [9:37:12<12:22:23, 21.02s/it] +2025-05-11 03:03:06 - ERROR - stderr - +2025-05-11 03:03:06 - ERROR - stderr - +2025-05-11 03:03:06 - INFO - stdout - {'loss': 0.7737, 'grad_norm': 0.6771467328071594, 'learning_rate': 1.2610488165068793e-05, 'epoch': 1.3} +2025-05-11 03:03:06 - ERROR - stderr - 43%|████▎ | 1622/3741 [9:37:12<12:22:23, 21.02s/it] +2025-05-11 03:03:25 - ERROR - stderr - 43%|████▎ | 1623/3741 [9:37:31<12:05:07, 20.54s/it] +2025-05-11 03:03:25 - ERROR - stderr - +2025-05-11 03:03:25 - ERROR - stderr - +2025-05-11 03:03:25 - INFO - stdout - {'loss': 0.7958, 'grad_norm': 0.6908150315284729, 'learning_rate': 1.2602128147111597e-05, 'epoch': 1.3} +2025-05-11 03:03:25 - ERROR - stderr - 43%|████▎ | 1623/3741 [9:37:31<12:05:07, 20.54s/it] +2025-05-11 03:03:45 - ERROR - stderr - 43%|████▎ | 1624/3741 [9:37:51<11:55:41, 20.28s/it] +2025-05-11 03:03:45 - ERROR - stderr - +2025-05-11 03:03:45 - ERROR - stderr - +2025-05-11 03:03:45 - INFO - stdout - {'loss': 0.7927, 'grad_norm': 0.731979489326477, 'learning_rate': 1.2593766177990096e-05, 'epoch': 1.3} +2025-05-11 03:03:45 - ERROR - stderr - 43%|████▎ | 1624/3741 [9:37:51<11:55:41, 20.28s/it] +2025-05-11 03:04:04 - ERROR - stderr - 43%|████▎ | 1625/3741 [9:38:11<11:46:27, 20.03s/it] +2025-05-11 03:04:04 - ERROR - stderr - +2025-05-11 03:04:04 - ERROR - stderr - +2025-05-11 03:04:04 - INFO - stdout - {'loss': 0.7429, 'grad_norm': 0.6440238952636719, 'learning_rate': 1.2585402263974383e-05, 'epoch': 1.3} +2025-05-11 03:04:04 - ERROR - stderr - 43%|████▎ | 1625/3741 [9:38:11<11:46:27, 20.03s/it] +2025-05-11 03:04:24 - ERROR - stderr - 43%|████▎ | 1626/3741 [9:38:30<11:40:15, 19.87s/it] +2025-05-11 03:04:24 - ERROR - stderr - +2025-05-11 03:04:24 - ERROR - stderr - +2025-05-11 03:04:24 - INFO - stdout - {'loss': 0.758, 'grad_norm': 0.6557809114456177, 'learning_rate': 1.2577036411336003e-05, 'epoch': 1.3} +2025-05-11 03:04:24 - ERROR - stderr - 43%|████▎ | 1626/3741 [9:38:30<11:40:15, 19.87s/it] +2025-05-11 03:04:46 - ERROR - stderr - 43%|████▎ | 1627/3741 [9:38:53<12:10:37, 20.74s/it] +2025-05-11 03:04:47 - ERROR - stderr - +2025-05-11 03:04:47 - ERROR - stderr - +2025-05-11 03:04:47 - INFO - stdout - {'loss': 0.7308, 'grad_norm': 0.670251727104187, 'learning_rate': 1.256866862634796e-05, 'epoch': 1.3} +2025-05-11 03:04:47 - ERROR - stderr - 43%|████▎ | 1627/3741 [9:38:53<12:10:37, 20.74s/it] +2025-05-11 03:05:06 - ERROR - stderr - 44%|████▎ | 1628/3741 [9:39:12<11:57:13, 20.37s/it] +2025-05-11 03:05:06 - ERROR - stderr - +2025-05-11 03:05:06 - ERROR - stderr - +2025-05-11 03:05:06 - INFO - stdout - {'loss': 0.7549, 'grad_norm': 0.6871209144592285, 'learning_rate': 1.2560298915284699e-05, 'epoch': 1.31} +2025-05-11 03:05:06 - ERROR - stderr - 44%|████▎ | 1628/3741 [9:39:12<11:57:13, 20.37s/it] +2025-05-11 03:05:30 - ERROR - stderr - 44%|████▎ | 1629/3741 [9:39:36<12:30:12, 21.31s/it] +2025-05-11 03:05:30 - ERROR - stderr - +2025-05-11 03:05:30 - ERROR - stderr - +2025-05-11 03:05:30 - INFO - stdout - {'loss': 0.7439, 'grad_norm': 0.6390516757965088, 'learning_rate': 1.2551927284422117e-05, 'epoch': 1.31} +2025-05-11 03:05:30 - ERROR - stderr - 44%|████▎ | 1629/3741 [9:39:36<12:30:12, 21.31s/it] +2025-05-11 03:05:49 - ERROR - stderr - 44%|████▎ | 1630/3741 [9:39:55<12:09:52, 20.74s/it] +2025-05-11 03:05:49 - ERROR - stderr - +2025-05-11 03:05:49 - ERROR - stderr - +2025-05-11 03:05:49 - INFO - stdout - {'loss': 0.7618, 'grad_norm': 0.6146913170814514, 'learning_rate': 1.2543553740037546e-05, 'epoch': 1.31} +2025-05-11 03:05:49 - ERROR - stderr - 44%|████▎ | 1630/3741 [9:39:55<12:09:52, 20.74s/it] +2025-05-11 03:06:12 - ERROR - stderr - 44%|████▎ | 1631/3741 [9:40:19<12:38:34, 21.57s/it] +2025-05-11 03:06:12 - ERROR - stderr - +2025-05-11 03:06:12 - ERROR - stderr - +2025-05-11 03:06:12 - INFO - stdout - {'loss': 0.7575, 'grad_norm': 0.6606637835502625, 'learning_rate': 1.2535178288409761e-05, 'epoch': 1.31} +2025-05-11 03:06:12 - ERROR - stderr - 44%|████▎ | 1631/3741 [9:40:19<12:38:34, 21.57s/it] +2025-05-11 03:06:32 - ERROR - stderr - 44%|████▎ | 1632/3741 [9:40:38<12:18:14, 21.00s/it] +2025-05-11 03:06:32 - ERROR - stderr - +2025-05-11 03:06:32 - ERROR - stderr - +2025-05-11 03:06:32 - INFO - stdout - {'loss': 0.8049, 'grad_norm': 0.6702629327774048, 'learning_rate': 1.2526800935818956e-05, 'epoch': 1.31} +2025-05-11 03:06:32 - ERROR - stderr - 44%|████▎ | 1632/3741 [9:40:38<12:18:14, 21.00s/it] +2025-05-11 03:06:56 - ERROR - stderr - 44%|████▎ | 1633/3741 [9:41:02<12:45:49, 21.80s/it] +2025-05-11 03:06:56 - ERROR - stderr - +2025-05-11 03:06:56 - ERROR - stderr - +2025-05-11 03:06:56 - INFO - stdout - {'loss': 0.7453, 'grad_norm': 0.6716954112052917, 'learning_rate': 1.2518421688546757e-05, 'epoch': 1.31} +2025-05-11 03:06:56 - ERROR - stderr - 44%|████▎ | 1633/3741 [9:41:02<12:45:49, 21.80s/it] +2025-05-11 03:07:15 - ERROR - stderr - 44%|████▎ | 1634/3741 [9:41:22<12:21:44, 21.12s/it] +2025-05-11 03:07:15 - ERROR - stderr - +2025-05-11 03:07:15 - ERROR - stderr - +2025-05-11 03:07:15 - INFO - stdout - {'loss': 0.7823, 'grad_norm': 0.6508925557136536, 'learning_rate': 1.2510040552876204e-05, 'epoch': 1.31} +2025-05-11 03:07:15 - ERROR - stderr - 44%|████▎ | 1634/3741 [9:41:22<12:21:44, 21.12s/it] +2025-05-11 03:07:39 - ERROR - stderr - 44%|████▎ | 1635/3741 [9:41:45<12:46:31, 21.84s/it] +2025-05-11 03:07:39 - ERROR - stderr - +2025-05-11 03:07:39 - ERROR - stderr - +2025-05-11 03:07:39 - INFO - stdout - {'loss': 0.7895, 'grad_norm': 0.8851492404937744, 'learning_rate': 1.2501657535091765e-05, 'epoch': 1.31} +2025-05-11 03:07:39 - ERROR - stderr - 44%|████▎ | 1635/3741 [9:41:45<12:46:31, 21.84s/it] +2025-05-11 03:07:58 - ERROR - stderr - 44%|████▎ | 1636/3741 [9:42:05<12:20:41, 21.11s/it] +2025-05-11 03:07:58 - ERROR - stderr - +2025-05-11 03:07:58 - ERROR - stderr - +2025-05-11 03:07:58 - INFO - stdout - {'loss': 0.7168, 'grad_norm': 0.648134171962738, 'learning_rate': 1.2493272641479311e-05, 'epoch': 1.31} +2025-05-11 03:07:58 - ERROR - stderr - 44%|████▎ | 1636/3741 [9:42:05<12:20:41, 21.11s/it] +2025-05-11 03:08:18 - ERROR - stderr - 44%|████▍ | 1637/3741 [9:42:24<12:04:48, 20.67s/it] +2025-05-11 03:08:18 - ERROR - stderr - +2025-05-11 03:08:18 - ERROR - stderr - +2025-05-11 03:08:18 - INFO - stdout - {'loss': 0.7674, 'grad_norm': 0.6762018799781799, 'learning_rate': 1.2484885878326114e-05, 'epoch': 1.31} +2025-05-11 03:08:18 - ERROR - stderr - 44%|████▍ | 1637/3741 [9:42:24<12:04:48, 20.67s/it] +2025-05-11 03:08:37 - ERROR - stderr - 44%|████▍ | 1638/3741 [9:42:44<11:53:02, 20.34s/it] +2025-05-11 03:08:37 - ERROR - stderr - +2025-05-11 03:08:37 - ERROR - stderr - +2025-05-11 03:08:37 - INFO - stdout - {'loss': 0.7295, 'grad_norm': 0.6717413067817688, 'learning_rate': 1.247649725192086e-05, 'epoch': 1.31} +2025-05-11 03:08:37 - ERROR - stderr - 44%|████▍ | 1638/3741 [9:42:44<11:53:02, 20.34s/it] +2025-05-11 03:08:57 - ERROR - stderr - 44%|████▍ | 1639/3741 [9:43:03<11:44:05, 20.10s/it] +2025-05-11 03:08:57 - ERROR - stderr - +2025-05-11 03:08:57 - ERROR - stderr - +2025-05-11 03:08:57 - INFO - stdout - {'loss': 0.7754, 'grad_norm': 0.7482190728187561, 'learning_rate': 1.246810676855363e-05, 'epoch': 1.31} +2025-05-11 03:08:57 - ERROR - stderr - 44%|████▍ | 1639/3741 [9:43:03<11:44:05, 20.10s/it] +2025-05-11 03:09:18 - ERROR - stderr - 44%|████▍ | 1640/3741 [9:43:25<11:55:44, 20.44s/it] +2025-05-11 03:09:18 - ERROR - stderr - +2025-05-11 03:09:18 - ERROR - stderr - +2025-05-11 03:09:18 - INFO - stdout - {'loss': 0.7827, 'grad_norm': 0.6837365627288818, 'learning_rate': 1.2459714434515888e-05, 'epoch': 1.32} +2025-05-11 03:09:18 - ERROR - stderr - 44%|████▍ | 1640/3741 [9:43:25<11:55:44, 20.44s/it] +2025-05-11 03:09:38 - ERROR - stderr - 44%|████▍ | 1641/3741 [9:43:44<11:46:07, 20.17s/it] +2025-05-11 03:09:38 - ERROR - stderr - +2025-05-11 03:09:38 - ERROR - stderr - +2025-05-11 03:09:38 - INFO - stdout - {'loss': 0.785, 'grad_norm': 0.6977565288543701, 'learning_rate': 1.2451320256100497e-05, 'epoch': 1.32} +2025-05-11 03:09:38 - ERROR - stderr - 44%|████▍ | 1641/3741 [9:43:44<11:46:07, 20.17s/it] +2025-05-11 03:10:00 - ERROR - stderr - 44%|████▍ | 1642/3741 [9:44:06<12:07:22, 20.79s/it] +2025-05-11 03:10:00 - ERROR - stderr - +2025-05-11 03:10:00 - ERROR - stderr - +2025-05-11 03:10:00 - INFO - stdout - {'loss': 0.7275, 'grad_norm': 0.6693125367164612, 'learning_rate': 1.2442924239601692e-05, 'epoch': 1.32} +2025-05-11 03:10:00 - ERROR - stderr - 44%|████▍ | 1642/3741 [9:44:06<12:07:22, 20.79s/it] +2025-05-11 03:10:19 - ERROR - stderr - 44%|████▍ | 1643/3741 [9:44:26<11:53:17, 20.40s/it] +2025-05-11 03:10:20 - ERROR - stderr - +2025-05-11 03:10:20 - ERROR - stderr - +2025-05-11 03:10:20 - INFO - stdout - {'loss': 0.8095, 'grad_norm': 0.7254882454872131, 'learning_rate': 1.2434526391315095e-05, 'epoch': 1.32} +2025-05-11 03:10:20 - ERROR - stderr - 44%|████▍ | 1643/3741 [9:44:26<11:53:17, 20.40s/it] +2025-05-11 03:10:43 - ERROR - stderr - 44%|████▍ | 1644/3741 [9:44:49<12:20:24, 21.18s/it] +2025-05-11 03:10:43 - ERROR - stderr - +2025-05-11 03:10:43 - ERROR - stderr - +2025-05-11 03:10:43 - INFO - stdout - {'loss': 0.7396, 'grad_norm': 0.6936510801315308, 'learning_rate': 1.2426126717537704e-05, 'epoch': 1.32} +2025-05-11 03:10:43 - ERROR - stderr - 44%|████▍ | 1644/3741 [9:44:49<12:20:24, 21.18s/it] +2025-05-11 03:11:02 - ERROR - stderr - 44%|████▍ | 1645/3741 [9:45:08<12:03:59, 20.72s/it] +2025-05-11 03:11:02 - ERROR - stderr - +2025-05-11 03:11:02 - ERROR - stderr - +2025-05-11 03:11:02 - INFO - stdout - {'loss': 0.7378, 'grad_norm': 0.6720640063285828, 'learning_rate': 1.2417725224567872e-05, 'epoch': 1.32} +2025-05-11 03:11:02 - ERROR - stderr - 44%|████▍ | 1645/3741 [9:45:08<12:03:59, 20.72s/it] +2025-05-11 03:11:25 - ERROR - stderr - 44%|████▍ | 1646/3741 [9:45:31<12:26:13, 21.37s/it] +2025-05-11 03:11:25 - ERROR - stderr - +2025-05-11 03:11:25 - ERROR - stderr - +2025-05-11 03:11:25 - INFO - stdout - {'loss': 0.7372, 'grad_norm': 0.655472457408905, 'learning_rate': 1.2409321918705329e-05, 'epoch': 1.32} +2025-05-11 03:11:25 - ERROR - stderr - 44%|████▍ | 1646/3741 [9:45:31<12:26:13, 21.37s/it] +2025-05-11 03:11:45 - ERROR - stderr - 44%|████▍ | 1647/3741 [9:45:51<12:07:07, 20.83s/it] +2025-05-11 03:11:45 - ERROR - stderr - +2025-05-11 03:11:45 - ERROR - stderr - +2025-05-11 03:11:45 - INFO - stdout - {'loss': 0.7659, 'grad_norm': 0.7163864970207214, 'learning_rate': 1.2400916806251157e-05, 'epoch': 1.32} +2025-05-11 03:11:45 - ERROR - stderr - 44%|████▍ | 1647/3741 [9:45:51<12:07:07, 20.83s/it] +2025-05-11 03:12:06 - ERROR - stderr - 44%|████▍ | 1648/3741 [9:46:12<12:08:46, 20.89s/it] +2025-05-11 03:12:06 - ERROR - stderr - +2025-05-11 03:12:06 - ERROR - stderr - +2025-05-11 03:12:06 - INFO - stdout - {'loss': 0.7724, 'grad_norm': 0.6562886238098145, 'learning_rate': 1.2392509893507799e-05, 'epoch': 1.32} +2025-05-11 03:12:06 - ERROR - stderr - 44%|████▍ | 1648/3741 [9:46:12<12:08:46, 20.89s/it] +2025-05-11 03:12:25 - ERROR - stderr - 44%|��███▍ | 1649/3741 [9:46:31<11:52:46, 20.44s/it] +2025-05-11 03:12:25 - ERROR - stderr - +2025-05-11 03:12:25 - ERROR - stderr - +2025-05-11 03:12:25 - INFO - stdout - {'loss': 0.7505, 'grad_norm': 0.6730007529258728, 'learning_rate': 1.2384101186779042e-05, 'epoch': 1.32} +2025-05-11 03:12:25 - ERROR - stderr - 44%|████▍ | 1649/3741 [9:46:31<11:52:46, 20.44s/it] +2025-05-11 03:12:44 - ERROR - stderr - 44%|████▍ | 1650/3741 [9:46:51<11:40:38, 20.10s/it] +2025-05-11 03:12:44 - ERROR - stderr - +2025-05-11 03:12:44 - ERROR - stderr - +2025-05-11 03:12:44 - INFO - stdout - {'loss': 0.784, 'grad_norm': 0.7325595617294312, 'learning_rate': 1.2375690692370022e-05, 'epoch': 1.32} +2025-05-11 03:12:44 - ERROR - stderr - 44%|████▍ | 1650/3741 [9:46:51<11:40:38, 20.10s/it] +2025-05-11 03:13:06 - ERROR - stderr - 44%|████▍ | 1651/3741 [9:47:12<11:54:13, 20.50s/it] +2025-05-11 03:13:06 - ERROR - stderr - +2025-05-11 03:13:06 - ERROR - stderr - +2025-05-11 03:13:06 - INFO - stdout - {'loss': 0.7435, 'grad_norm': 0.6311853528022766, 'learning_rate': 1.2367278416587216e-05, 'epoch': 1.32} +2025-05-11 03:13:06 - ERROR - stderr - 44%|████▍ | 1651/3741 [9:47:12<11:54:13, 20.50s/it] +2025-05-11 03:13:25 - ERROR - stderr - 44%|████▍ | 1652/3741 [9:47:32<11:42:52, 20.19s/it] +2025-05-11 03:13:25 - ERROR - stderr - +2025-05-11 03:13:25 - ERROR - stderr - +2025-05-11 03:13:25 - INFO - stdout - {'loss': 0.7545, 'grad_norm': 0.67484050989151, 'learning_rate': 1.235886436573843e-05, 'epoch': 1.32} +2025-05-11 03:13:25 - ERROR - stderr - 44%|████▍ | 1652/3741 [9:47:32<11:42:52, 20.19s/it] +2025-05-11 03:13:47 - ERROR - stderr - 44%|████▍ | 1653/3741 [9:47:53<11:59:57, 20.69s/it] +2025-05-11 03:13:47 - ERROR - stderr - +2025-05-11 03:13:47 - ERROR - stderr - +2025-05-11 03:13:47 - INFO - stdout - {'loss': 0.7657, 'grad_norm': 0.7450650930404663, 'learning_rate': 1.235044854613281e-05, 'epoch': 1.33} +2025-05-11 03:13:47 - ERROR - stderr - 44%|████▍ | 1653/3741 [9:47:53<11:59:57, 20.69s/it] +2025-05-11 03:14:07 - ERROR - stderr - 44%|████▍ | 1654/3741 [9:48:13<11:48:21, 20.37s/it] +2025-05-11 03:14:07 - ERROR - stderr - +2025-05-11 03:14:07 - ERROR - stderr - +2025-05-11 03:14:07 - INFO - stdout - {'loss': 0.7257, 'grad_norm': 0.6910774111747742, 'learning_rate': 1.2342030964080822e-05, 'epoch': 1.33} +2025-05-11 03:14:07 - ERROR - stderr - 44%|████▍ | 1654/3741 [9:48:13<11:48:21, 20.37s/it] +2025-05-11 03:14:28 - ERROR - stderr - 44%|████▍ | 1655/3741 [9:48:35<12:00:31, 20.72s/it] +2025-05-11 03:14:28 - ERROR - stderr - +2025-05-11 03:14:28 - ERROR - stderr - +2025-05-11 03:14:28 - INFO - stdout - {'loss': 0.7467, 'grad_norm': 0.6374717354774475, 'learning_rate': 1.2333611625894254e-05, 'epoch': 1.33} +2025-05-11 03:14:28 - ERROR - stderr - 44%|████▍ | 1655/3741 [9:48:35<12:00:31, 20.72s/it] +2025-05-11 03:14:48 - ERROR - stderr - 44%|████▍ | 1656/3741 [9:48:54<11:51:12, 20.47s/it] +2025-05-11 03:14:48 - ERROR - stderr - +2025-05-11 03:14:48 - ERROR - stderr - +2025-05-11 03:14:48 - INFO - stdout - {'loss': 0.7808, 'grad_norm': 0.6719356775283813, 'learning_rate': 1.2325190537886221e-05, 'epoch': 1.33} +2025-05-11 03:14:48 - ERROR - stderr - 44%|████▍ | 1656/3741 [9:48:54<11:51:12, 20.47s/it] +2025-05-11 03:15:10 - ERROR - stderr - 44%|████▍ | 1657/3741 [9:49:16<12:06:31, 20.92s/it] +2025-05-11 03:15:10 - ERROR - stderr - +2025-05-11 03:15:10 - ERROR - stderr - +2025-05-11 03:15:10 - INFO - stdout - {'loss': 0.7416, 'grad_norm': 0.7069136500358582, 'learning_rate': 1.231676770637113e-05, 'epoch': 1.33} +2025-05-11 03:15:10 - ERROR - stderr - 44%|████▍ | 1657/3741 [9:49:16<12:06:31, 20.92s/it] +2025-05-11 03:15:30 - ERROR - stderr - 44%|████▍ | 1658/3741 [9:49:36<11:53:19, 20.55s/it] +2025-05-11 03:15:30 - ERROR - stderr - +2025-05-11 03:15:30 - ERROR - stderr - +2025-05-11 03:15:30 - INFO - stdout - {'loss': 0.7745, 'grad_norm': 0.6578108668327332, 'learning_rate': 1.2308343137664716e-05, 'epoch': 1.33} +2025-05-11 03:15:30 - ERROR - stderr - 44%|████▍ | 1658/3741 [9:49:36<11:53:19, 20.55s/it] +2025-05-11 03:15:50 - ERROR - stderr - 44%|████▍ | 1659/3741 [9:49:56<11:46:40, 20.37s/it] +2025-05-11 03:15:50 - ERROR - stderr - +2025-05-11 03:15:50 - ERROR - stderr - +2025-05-11 03:15:50 - INFO - stdout - {'loss': 0.7411, 'grad_norm': 0.684407114982605, 'learning_rate': 1.2299916838084001e-05, 'epoch': 1.33} +2025-05-11 03:15:50 - ERROR - stderr - 44%|████▍ | 1659/3741 [9:49:56<11:46:40, 20.37s/it] +2025-05-11 03:16:09 - ERROR - stderr - 44%|████▍ | 1660/3741 [9:50:16<11:37:50, 20.12s/it] +2025-05-11 03:16:09 - ERROR - stderr - +2025-05-11 03:16:09 - ERROR - stderr - +2025-05-11 03:16:09 - INFO - stdout - {'loss': 0.7518, 'grad_norm': 0.6682063341140747, 'learning_rate': 1.2291488813947315e-05, 'epoch': 1.33} +2025-05-11 03:16:09 - ERROR - stderr - 44%|████▍ | 1660/3741 [9:50:16<11:37:50, 20.12s/it] +2025-05-11 03:16:29 - ERROR - stderr - 44%|████▍ | 1661/3741 [9:50:35<11:32:48, 19.98s/it] +2025-05-11 03:16:29 - ERROR - stderr - +2025-05-11 03:16:29 - ERROR - stderr - +2025-05-11 03:16:29 - INFO - stdout - {'loss': 0.7582, 'grad_norm': 0.6518343091011047, 'learning_rate': 1.2283059071574278e-05, 'epoch': 1.33} +2025-05-11 03:16:29 - ERROR - stderr - 44%|████▍ | 1661/3741 [9:50:35<11:32:48, 19.98s/it] +2025-05-11 03:16:48 - ERROR - stderr - 44%|████▍ | 1662/3741 [9:50:55<11:27:23, 19.84s/it] +2025-05-11 03:16:48 - ERROR - stderr - +2025-05-11 03:16:48 - ERROR - stderr - +2025-05-11 03:16:48 - INFO - stdout - {'loss': 0.7964, 'grad_norm': 0.6990170478820801, 'learning_rate': 1.2274627617285797e-05, 'epoch': 1.33} +2025-05-11 03:16:48 - ERROR - stderr - 44%|████▍ | 1662/3741 [9:50:55<11:27:23, 19.84s/it] +2025-05-11 03:17:08 - ERROR - stderr - 44%|████▍ | 1663/3741 [9:51:14<11:23:29, 19.74s/it] +2025-05-11 03:17:08 - ERROR - stderr - +2025-05-11 03:17:08 - ERROR - stderr - +2025-05-11 03:17:08 - INFO - stdout - {'loss': 0.7903, 'grad_norm': 0.6665741205215454, 'learning_rate': 1.2266194457404061e-05, 'epoch': 1.33} +2025-05-11 03:17:08 - ERROR - stderr - 44%|████▍ | 1663/3741 [9:51:14<11:23:29, 19.74s/it] +2025-05-11 03:17:29 - ERROR - stderr - 44%|████▍ | 1664/3741 [9:51:35<11:36:05, 20.11s/it] +2025-05-11 03:17:29 - ERROR - stderr - +2025-05-11 03:17:29 - ERROR - stderr - +2025-05-11 03:17:29 - INFO - stdout - {'loss': 0.7422, 'grad_norm': 0.6582921743392944, 'learning_rate': 1.2257759598252543e-05, 'epoch': 1.33} +2025-05-11 03:17:29 - ERROR - stderr - 44%|████▍ | 1664/3741 [9:51:35<11:36:05, 20.11s/it] +2025-05-11 03:17:48 - ERROR - stderr - 45%|████▍ | 1665/3741 [9:51:55<11:29:18, 19.92s/it] +2025-05-11 03:17:48 - ERROR - stderr - +2025-05-11 03:17:48 - ERROR - stderr - +2025-05-11 03:17:48 - INFO - stdout - {'loss': 0.7956, 'grad_norm': 0.6946974396705627, 'learning_rate': 1.224932304615599e-05, 'epoch': 1.34} +2025-05-11 03:17:48 - ERROR - stderr - 45%|████▍ | 1665/3741 [9:51:55<11:29:18, 19.92s/it] +2025-05-11 03:18:12 - ERROR - stderr - 45%|████▍ | 1666/3741 [9:52:18<12:03:12, 20.91s/it] +2025-05-11 03:18:12 - ERROR - stderr - +2025-05-11 03:18:12 - ERROR - stderr - +2025-05-11 03:18:12 - INFO - stdout - {'loss': 0.772, 'grad_norm': 0.6991271376609802, 'learning_rate': 1.2240884807440413e-05, 'epoch': 1.34} +2025-05-11 03:18:12 - ERROR - stderr - 45%|████▍ | 1666/3741 [9:52:18<12:03:12, 20.91s/it] +2025-05-11 03:18:31 - ERROR - stderr - 45%|████▍ | 1667/3741 [9:52:38<11:51:33, 20.58s/it] +2025-05-11 03:18:31 - ERROR - stderr - +2025-05-11 03:18:31 - ERROR - stderr - +2025-05-11 03:18:31 - INFO - stdout - {'loss': 0.7572, 'grad_norm': 0.6818379759788513, 'learning_rate': 1.223244488843309e-05, 'epoch': 1.34} +2025-05-11 03:18:31 - ERROR - stderr - 45%|████▍ | 1667/3741 [9:52:38<11:51:33, 20.58s/it] +2025-05-11 03:18:53 - ERROR - stderr - 45%|████▍ | 1668/3741 [9:52:59<12:02:45, 20.92s/it] +2025-05-11 03:18:53 - ERROR - stderr - +2025-05-11 03:18:53 - ERROR - stderr - +2025-05-11 03:18:53 - INFO - stdout - {'loss': 0.736, 'grad_norm': 0.6668140292167664, 'learning_rate': 1.2224003295462561e-05, 'epoch': 1.34} +2025-05-11 03:18:53 - ERROR - stderr - 45%|████▍ | 1668/3741 [9:52:59<12:02:45, 20.92s/it] +2025-05-11 03:19:13 - ERROR - stderr - 45%|████▍ | 1669/3741 [9:53:19<11:47:41, 20.49s/it] +2025-05-11 03:19:13 - ERROR - stderr - +2025-05-11 03:19:13 - ERROR - stderr - +2025-05-11 03:19:13 - INFO - stdout - {'loss': 0.7643, 'grad_norm': 0.6249803304672241, 'learning_rate': 1.221556003485862e-05, 'epoch': 1.34} +2025-05-11 03:19:13 - ERROR - stderr - 45%|████▍ | 1669/3741 [9:53:19<11:47:41, 20.49s/it] +2025-05-11 03:19:32 - ERROR - stderr - 45%|████▍ | 1670/3741 [9:53:39<11:40:38, 20.30s/it] +2025-05-11 03:19:33 - ERROR - stderr - +2025-05-11 03:19:33 - ERROR - stderr - +2025-05-11 03:19:33 - INFO - stdout - {'loss': 0.7568, 'grad_norm': 0.6772146821022034, 'learning_rate': 1.2207115112952313e-05, 'epoch': 1.34} +2025-05-11 03:19:33 - ERROR - stderr - 45%|████▍ | 1670/3741 [9:53:39<11:40:38, 20.30s/it] +2025-05-11 03:19:52 - ERROR - stderr - 45%|████▍ | 1671/3741 [9:53:58<11:32:40, 20.08s/it] +2025-05-11 03:19:52 - ERROR - stderr - +2025-05-11 03:19:52 - ERROR - stderr - +2025-05-11 03:19:52 - INFO - stdout - {'loss': 0.8018, 'grad_norm': 0.6962553858757019, 'learning_rate': 1.2198668536075924e-05, 'epoch': 1.34} +2025-05-11 03:19:52 - ERROR - stderr - 45%|████▍ | 1671/3741 [9:53:58<11:32:40, 20.08s/it] +2025-05-11 03:19:52 - INFO - stdout - WARNING: tokenization mismatch: 3167 vs. 3185. (ignored) +2025-05-11 03:20:12 - ERROR - stderr - 45%|████▍ | 1672/3741 [9:54:18<11:26:50, 19.92s/it] +2025-05-11 03:20:12 - ERROR - stderr - +2025-05-11 03:20:12 - ERROR - stderr - +2025-05-11 03:20:12 - INFO - stdout - {'loss': 0.7296, 'grad_norm': 0.6322622299194336, 'learning_rate': 1.2190220310562992e-05, 'epoch': 1.34} +2025-05-11 03:20:12 - ERROR - stderr - 45%|████▍ | 1672/3741 [9:54:18<11:26:50, 19.92s/it] +2025-05-11 03:20:32 - ERROR - stderr - 45%|████▍ | 1673/3741 [9:54:38<11:32:57, 20.11s/it] +2025-05-11 03:20:32 - ERROR - stderr - +2025-05-11 03:20:32 - ERROR - stderr - +2025-05-11 03:20:32 - INFO - stdout - {'loss': 0.8024, 'grad_norm': 0.6803750991821289, 'learning_rate': 1.218177044274828e-05, 'epoch': 1.34} +2025-05-11 03:20:32 - ERROR - stderr - 45%|████▍ | 1673/3741 [9:54:38<11:32:57, 20.11s/it] +2025-05-11 03:20:52 - ERROR - stderr - 45%|████▍ | 1674/3741 [9:54:58<11:26:22, 19.92s/it] +2025-05-11 03:20:52 - ERROR - stderr - +2025-05-11 03:20:52 - ERROR - stderr - +2025-05-11 03:20:52 - INFO - stdout - {'loss': 0.7697, 'grad_norm': 0.6757524013519287, 'learning_rate': 1.217331893896779e-05, 'epoch': 1.34} +2025-05-11 03:20:52 - ERROR - stderr - 45%|████▍ | 1674/3741 [9:54:58<11:26:22, 19.92s/it] +2025-05-11 03:21:14 - ERROR - stderr - 45%|████▍ | 1675/3741 [9:55:20<11:48:57, 20.59s/it] +2025-05-11 03:21:14 - ERROR - stderr - +2025-05-11 03:21:14 - ERROR - stderr - +2025-05-11 03:21:14 - INFO - stdout - {'loss': 0.7728, 'grad_norm': 0.7142401337623596, 'learning_rate': 1.2164865805558738e-05, 'epoch': 1.34} +2025-05-11 03:21:14 - ERROR - stderr - 45%|████▍ | 1675/3741 [9:55:20<11:48:57, 20.59s/it] +2025-05-11 03:21:34 - ERROR - stderr - 45%|████▍ | 1676/3741 [9:55:40<11:42:54, 20.42s/it] +2025-05-11 03:21:34 - ERROR - stderr - +2025-05-11 03:21:34 - ERROR - stderr - +2025-05-11 03:21:34 - INFO - stdout - {'loss': 0.7395, 'grad_norm': 0.631919801235199, 'learning_rate': 1.215641104885958e-05, 'epoch': 1.34} +2025-05-11 03:21:34 - ERROR - stderr - 45%|████▍ | 1676/3741 [9:55:40<11:42:54, 20.42s/it] +2025-05-11 03:21:56 - ERROR - stderr - 45%|████▍ | 1677/3741 [9:56:02<11:57:31, 20.86s/it] +2025-05-11 03:21:56 - ERROR - stderr - +2025-05-11 03:21:56 - ERROR - stderr - +2025-05-11 03:21:56 - INFO - stdout - {'loss': 0.7703, 'grad_norm': 0.6723388433456421, 'learning_rate': 1.2147954675209982e-05, 'epoch': 1.34} +2025-05-11 03:21:56 - ERROR - stderr - 45%|████▍ | 1677/3741 [9:56:02<11:57:31, 20.86s/it] +2025-05-11 03:22:15 - ERROR - stderr - 45%|████▍ | 1678/3741 [9:56:21<11:42:33, 20.43s/it] +2025-05-11 03:22:15 - ERROR - stderr - +2025-05-11 03:22:15 - ERROR - stderr - +2025-05-11 03:22:15 - INFO - stdout - {'loss': 0.7829, 'grad_norm': 0.654248833656311, 'learning_rate': 1.2139496690950813e-05, 'epoch': 1.35} +2025-05-11 03:22:15 - ERROR - stderr - 45%|████▍ | 1678/3741 [9:56:21<11:42:33, 20.43s/it] +2025-05-11 03:22:35 - ERROR - stderr - 45%|████▍ | 1679/3741 [9:56:41<11:36:05, 20.25s/it] +2025-05-11 03:22:35 - ERROR - stderr - +2025-05-11 03:22:35 - ERROR - stderr - +2025-05-11 03:22:35 - INFO - stdout - {'loss': 0.7534, 'grad_norm': 0.6374508738517761, 'learning_rate': 1.2131037102424165e-05, 'epoch': 1.35} +2025-05-11 03:22:35 - ERROR - stderr - 45%|████▍ | 1679/3741 [9:56:41<11:36:05, 20.25s/it] +2025-05-11 03:22:54 - ERROR - stderr - 45%|████▍ | 1680/3741 [9:57:01<11:26:27, 19.98s/it] +2025-05-11 03:22:54 - ERROR - stderr - +2025-05-11 03:22:54 - ERROR - stderr - +2025-05-11 03:22:54 - INFO - stdout - {'loss': 0.7589, 'grad_norm': 0.6467137932777405, 'learning_rate': 1.2122575915973321e-05, 'epoch': 1.35} +2025-05-11 03:22:54 - ERROR - stderr - 45%|████▍ | 1680/3741 [9:57:01<11:26:27, 19.98s/it] +2025-05-11 03:23:14 - ERROR - stderr - 45%|████▍ | 1681/3741 [9:57:20<11:21:41, 19.86s/it] +2025-05-11 03:23:14 - ERROR - stderr - +2025-05-11 03:23:14 - ERROR - stderr - +2025-05-11 03:23:14 - INFO - stdout - {'loss': 0.7685, 'grad_norm': 0.686490535736084, 'learning_rate': 1.2114113137942767e-05, 'epoch': 1.35} +2025-05-11 03:23:14 - ERROR - stderr - 45%|████▍ | 1681/3741 [9:57:20<11:21:41, 19.86s/it] +2025-05-11 03:23:35 - ERROR - stderr - 45%|████▍ | 1682/3741 [9:57:42<11:38:53, 20.37s/it] +2025-05-11 03:23:35 - ERROR - stderr - +2025-05-11 03:23:35 - ERROR - stderr - +2025-05-11 03:23:35 - INFO - stdout - {'loss': 0.7302, 'grad_norm': 0.6674497127532959, 'learning_rate': 1.2105648774678188e-05, 'epoch': 1.35} +2025-05-11 03:23:35 - ERROR - stderr - 45%|████▍ | 1682/3741 [9:57:42<11:38:53, 20.37s/it] +2025-05-11 03:23:55 - ERROR - stderr - 45%|████▍ | 1683/3741 [9:58:01<11:30:35, 20.13s/it] +2025-05-11 03:23:55 - ERROR - stderr - +2025-05-11 03:23:55 - ERROR - stderr - +2025-05-11 03:23:55 - INFO - stdout - {'loss': 0.7784, 'grad_norm': 0.6758518218994141, 'learning_rate': 1.2097182832526443e-05, 'epoch': 1.35} +2025-05-11 03:23:55 - ERROR - stderr - 45%|████▍ | 1683/3741 [9:58:01<11:30:35, 20.13s/it] +2025-05-11 03:24:16 - ERROR - stderr - 45%|████▌ | 1684/3741 [9:58:23<11:41:45, 20.47s/it] +2025-05-11 03:24:16 - ERROR - stderr - +2025-05-11 03:24:16 - ERROR - stderr - +2025-05-11 03:24:16 - INFO - stdout - {'loss': 0.745, 'grad_norm': 0.663431704044342, 'learning_rate': 1.2088715317835589e-05, 'epoch': 1.35} +2025-05-11 03:24:16 - ERROR - stderr - 45%|████▌ | 1684/3741 [9:58:23<11:41:45, 20.47s/it] +2025-05-11 03:24:36 - ERROR - stderr - 45%|████▌ | 1685/3741 [9:58:42<11:31:20, 20.18s/it] +2025-05-11 03:24:36 - ERROR - stderr - +2025-05-11 03:24:36 - ERROR - stderr - +2025-05-11 03:24:36 - INFO - stdout - {'loss': 0.8056, 'grad_norm': 0.6859851479530334, 'learning_rate': 1.2080246236954856e-05, 'epoch': 1.35} +2025-05-11 03:24:36 - ERROR - stderr - 45%|████▌ | 1685/3741 [9:58:42<11:31:20, 20.18s/it] +2025-05-11 03:24:57 - ERROR - stderr - 45%|████▌ | 1686/3741 [9:59:04<11:44:57, 20.58s/it] +2025-05-11 03:24:57 - ERROR - stderr - +2025-05-11 03:24:57 - ERROR - stderr - +2025-05-11 03:24:57 - INFO - stdout - {'loss': 0.7598, 'grad_norm': 0.6677384972572327, 'learning_rate': 1.2071775596234647e-05, 'epoch': 1.35} +2025-05-11 03:24:57 - ERROR - stderr - 45%|████▌ | 1686/3741 [9:59:04<11:44:57, 20.58s/it] +2025-05-11 03:25:17 - ERROR - stderr - 45%|████▌ | 1687/3741 [9:59:23<11:33:27, 20.26s/it] +2025-05-11 03:25:17 - ERROR - stderr - +2025-05-11 03:25:17 - ERROR - stderr - +2025-05-11 03:25:17 - INFO - stdout - {'loss': 0.8167, 'grad_norm': 0.7199987769126892, 'learning_rate': 1.2063303402026545e-05, 'epoch': 1.35} +2025-05-11 03:25:17 - ERROR - stderr - 45%|████▌ | 1687/3741 [9:59:23<11:33:27, 20.26s/it] +2025-05-11 03:25:39 - ERROR - stderr - 45%|████▌ | 1688/3741 [9:59:45<11:49:29, 20.74s/it] +2025-05-11 03:25:39 - ERROR - stderr - +2025-05-11 03:25:39 - ERROR - stderr - +2025-05-11 03:25:39 - INFO - stdout - {'loss': 0.7508, 'grad_norm': 0.6905441284179688, 'learning_rate': 1.2054829660683281e-05, 'epoch': 1.35} +2025-05-11 03:25:39 - ERROR - stderr - 45%|████▌ | 1688/3741 [9:59:45<11:49:29, 20.74s/it] +2025-05-11 03:25:58 - ERROR - stderr - 45%|████▌ | 1689/3741 [10:00:05<11:37:31, 20.40s/it] +2025-05-11 03:25:58 - ERROR - stderr - +2025-05-11 03:25:58 - ERROR - stderr - +2025-05-11 03:25:58 - INFO - stdout - {'loss': 0.7208, 'grad_norm': 0.6765400171279907, 'learning_rate': 1.2046354378558753e-05, 'epoch': 1.35} +2025-05-11 03:25:58 - ERROR - stderr - 45%|████▌ | 1689/3741 [10:00:05<11:37:31, 20.40s/it] +2025-05-11 03:26:21 - ERROR - stderr - 45%|████▌ | 1690/3741 [10:00:27<11:58:25, 21.02s/it] +2025-05-11 03:26:21 - ERROR - stderr - +2025-05-11 03:26:21 - ERROR - stderr - +2025-05-11 03:26:21 - INFO - stdout - {'loss': 0.7675, 'grad_norm': 0.6659426093101501, 'learning_rate': 1.2037877562008025e-05, 'epoch': 1.36} +2025-05-11 03:26:21 - ERROR - stderr - 45%|████▌ | 1690/3741 [10:00:27<11:58:25, 21.02s/it] +2025-05-11 03:26:40 - ERROR - stderr - 45%|████▌ | 1691/3741 [10:00:47<11:43:04, 20.58s/it] +2025-05-11 03:26:40 - ERROR - stderr - +2025-05-11 03:26:40 - ERROR - stderr - +2025-05-11 03:26:40 - INFO - stdout - {'loss': 0.753, 'grad_norm': 0.6474359035491943, 'learning_rate': 1.2029399217387299e-05, 'epoch': 1.36} +2025-05-11 03:26:40 - ERROR - stderr - 45%|████▌ | 1691/3741 [10:00:47<11:43:04, 20.58s/it] +2025-05-11 03:27:01 - ERROR - stderr - 45%|████▌ | 1692/3741 [10:01:07<11:39:55, 20.50s/it] +2025-05-11 03:27:01 - ERROR - stderr - +2025-05-11 03:27:01 - ERROR - stderr - +2025-05-11 03:27:01 - INFO - stdout - {'loss': 0.7842, 'grad_norm': 0.7105376124382019, 'learning_rate': 1.2020919351053927e-05, 'epoch': 1.36} +2025-05-11 03:27:01 - ERROR - stderr - 45%|████▌ | 1692/3741 [10:01:07<11:39:55, 20.50s/it] +2025-05-11 03:27:20 - ERROR - stderr - 45%|████▌ | 1693/3741 [10:01:26<11:29:00, 20.19s/it] +2025-05-11 03:27:20 - ERROR - stderr - +2025-05-11 03:27:20 - ERROR - stderr - +2025-05-11 03:27:20 - INFO - stdout - {'loss': 0.7136, 'grad_norm': 0.6748633980751038, 'learning_rate': 1.2012437969366397e-05, 'epoch': 1.36} +2025-05-11 03:27:20 - ERROR - stderr - 45%|████▌ | 1693/3741 [10:01:26<11:29:00, 20.19s/it] +2025-05-11 03:27:20 - INFO - stdout - WARNING: tokenization mismatch: 1 vs. 3126. (ignored) +2025-05-11 03:27:40 - ERROR - stderr - 45%|████▌ | 1694/3741 [10:01:46<11:23:51, 20.04s/it] +2025-05-11 03:27:40 - ERROR - stderr - +2025-05-11 03:27:40 - ERROR - stderr - +2025-05-11 03:27:40 - INFO - stdout - {'loss': 0.7817, 'grad_norm': 0.6597528457641602, 'learning_rate': 1.2003955078684344e-05, 'epoch': 1.36} +2025-05-11 03:27:40 - ERROR - stderr - 45%|████▌ | 1694/3741 [10:01:46<11:23:51, 20.04s/it] +2025-05-11 03:28:00 - ERROR - stderr - 45%|████▌ | 1695/3741 [10:02:06<11:23:19, 20.04s/it] +2025-05-11 03:28:00 - ERROR - stderr - +2025-05-11 03:28:00 - ERROR - stderr - +2025-05-11 03:28:00 - INFO - stdout - {'loss': 0.7433, 'grad_norm': 0.6678489446640015, 'learning_rate': 1.1995470685368527e-05, 'epoch': 1.36} +2025-05-11 03:28:00 - ERROR - stderr - 45%|████▌ | 1695/3741 [10:02:06<11:23:19, 20.04s/it] +2025-05-11 03:28:20 - ERROR - stderr - 45%|████▌ | 1696/3741 [10:02:26<11:21:05, 19.98s/it] +2025-05-11 03:28:20 - ERROR - stderr - +2025-05-11 03:28:20 - ERROR - stderr - +2025-05-11 03:28:20 - INFO - stdout - {'loss': 0.7635, 'grad_norm': 0.7042926549911499, 'learning_rate': 1.1986984795780829e-05, 'epoch': 1.36} +2025-05-11 03:28:20 - ERROR - stderr - 45%|████▌ | 1696/3741 [10:02:26<11:21:05, 19.98s/it] +2025-05-11 03:28:41 - ERROR - stderr - 45%|████▌ | 1697/3741 [10:02:47<11:34:40, 20.39s/it] +2025-05-11 03:28:41 - ERROR - stderr - +2025-05-11 03:28:41 - ERROR - stderr - +2025-05-11 03:28:41 - INFO - stdout - {'loss': 0.703, 'grad_norm': 0.6309202313423157, 'learning_rate': 1.1978497416284265e-05, 'epoch': 1.36} +2025-05-11 03:28:41 - ERROR - stderr - 45%|████▌ | 1697/3741 [10:02:47<11:34:40, 20.39s/it] +2025-05-11 03:29:01 - ERROR - stderr - 45%|████▌ | 1698/3741 [10:03:07<11:26:17, 20.16s/it] +2025-05-11 03:29:01 - ERROR - stderr - +2025-05-11 03:29:01 - ERROR - stderr - +2025-05-11 03:29:01 - INFO - stdout - {'loss': 0.7649, 'grad_norm': 0.7144943475723267, 'learning_rate': 1.1970008553242955e-05, 'epoch': 1.36} +2025-05-11 03:29:01 - ERROR - stderr - 45%|████▌ | 1698/3741 [10:03:07<11:26:17, 20.16s/it] +2025-05-11 03:29:22 - ERROR - stderr - 45%|████▌ | 1699/3741 [10:03:29<11:43:24, 20.67s/it] +2025-05-11 03:29:22 - ERROR - stderr - +2025-05-11 03:29:22 - ERROR - stderr - +2025-05-11 03:29:22 - INFO - stdout - {'loss': 0.7039, 'grad_norm': 0.64893639087677, 'learning_rate': 1.196151821302214e-05, 'epoch': 1.36} +2025-05-11 03:29:22 - ERROR - stderr - 45%|████▌ | 1699/3741 [10:03:29<11:43:24, 20.67s/it] +2025-05-11 03:29:42 - ERROR - stderr - 45%|████▌ | 1700/3741 [10:03:48<11:31:41, 20.33s/it] +2025-05-11 03:29:42 - ERROR - stderr - +2025-05-11 03:29:42 - ERROR - stderr - +2025-05-11 03:29:42 - INFO - stdout - {'loss': 0.7479, 'grad_norm': 0.8356136083602905, 'learning_rate': 1.1953026401988172e-05, 'epoch': 1.36} +2025-05-11 03:29:42 - ERROR - stderr - 45%|████▌ | 1700/3741 [10:03:48<11:31:41, 20.33s/it] +2025-05-11 03:30:04 - ERROR - stderr - 45%|████▌ | 1701/3741 [10:04:10<11:46:19, 20.77s/it] +2025-05-11 03:30:04 - ERROR - stderr - +2025-05-11 03:30:04 - ERROR - stderr - +2025-05-11 03:30:04 - INFO - stdout - {'loss': 0.7319, 'grad_norm': 0.6598641276359558, 'learning_rate': 1.1944533126508491e-05, 'epoch': 1.36} +2025-05-11 03:30:04 - ERROR - stderr - 45%|████▌ | 1701/3741 [10:04:10<11:46:19, 20.77s/it] +2025-05-11 03:30:23 - ERROR - stderr - 45%|████▌ | 1702/3741 [10:04:30<11:32:54, 20.39s/it] +2025-05-11 03:30:23 - ERROR - stderr - +2025-05-11 03:30:23 - ERROR - stderr - +2025-05-11 03:30:23 - INFO - stdout - {'loss': 0.7221, 'grad_norm': 0.6679075360298157, 'learning_rate': 1.193603839295165e-05, 'epoch': 1.36} +2025-05-11 03:30:23 - ERROR - stderr - 45%|████▌ | 1702/3741 [10:04:30<11:32:54, 20.39s/it] +2025-05-11 03:30:45 - ERROR - stderr - 46%|████▌ | 1703/3741 [10:04:51<11:46:40, 20.80s/it] +2025-05-11 03:30:45 - ERROR - stderr - +2025-05-11 03:30:45 - ERROR - stderr - +2025-05-11 03:30:45 - INFO - stdout - {'loss': 0.7633, 'grad_norm': 0.6554751396179199, 'learning_rate': 1.1927542207687287e-05, 'epoch': 1.37} +2025-05-11 03:30:45 - ERROR - stderr - 46%|████▌ | 1703/3741 [10:04:51<11:46:40, 20.80s/it] +2025-05-11 03:31:05 - ERROR - stderr - 46%|████▌ | 1704/3741 [10:05:11<11:36:13, 20.51s/it] +2025-05-11 03:31:05 - ERROR - stderr - +2025-05-11 03:31:05 - ERROR - stderr - +2025-05-11 03:31:05 - INFO - stdout - {'loss': 0.7493, 'grad_norm': 0.6698046922683716, 'learning_rate': 1.1919044577086135e-05, 'epoch': 1.37} +2025-05-11 03:31:05 - ERROR - stderr - 46%|████▌ | 1704/3741 [10:05:11<11:36:13, 20.51s/it] +2025-05-11 03:31:24 - ERROR - stderr - 46%|████▌ | 1705/3741 [10:05:31<11:26:21, 20.23s/it] +2025-05-11 03:31:24 - ERROR - stderr - +2025-05-11 03:31:24 - ERROR - stderr - +2025-05-11 03:31:24 - INFO - stdout - {'loss': 0.7613, 'grad_norm': 0.7508684396743774, 'learning_rate': 1.191054550752e-05, 'epoch': 1.37} +2025-05-11 03:31:24 - ERROR - stderr - 46%|████▌ | 1705/3741 [10:05:31<11:26:21, 20.23s/it] +2025-05-11 03:31:44 - ERROR - stderr - 46%|████▌ | 1706/3741 [10:05:50<11:20:27, 20.06s/it] +2025-05-11 03:31:44 - ERROR - stderr - +2025-05-11 03:31:44 - ERROR - stderr - +2025-05-11 03:31:44 - INFO - stdout - {'loss': 0.7632, 'grad_norm': 0.6737611293792725, 'learning_rate': 1.1902045005361775e-05, 'epoch': 1.37} +2025-05-11 03:31:44 - ERROR - stderr - 46%|████▌ | 1706/3741 [10:05:50<11:20:27, 20.06s/it] +2025-05-11 03:32:04 - ERROR - stderr - 46%|████▌ | 1707/3741 [10:06:10<11:14:18, 19.89s/it] +2025-05-11 03:32:04 - ERROR - stderr - +2025-05-11 03:32:04 - ERROR - stderr - +2025-05-11 03:32:04 - INFO - stdout - {'loss': 0.7383, 'grad_norm': 0.6801590919494629, 'learning_rate': 1.1893543076985434e-05, 'epoch': 1.37} +2025-05-11 03:32:04 - ERROR - stderr - 46%|████▌ | 1707/3741 [10:06:10<11:14:18, 19.89s/it] +2025-05-11 03:32:24 - ERROR - stderr - 46%|████▌ | 1708/3741 [10:06:31<11:21:28, 20.11s/it] +2025-05-11 03:32:24 - ERROR - stderr - +2025-05-11 03:32:24 - ERROR - stderr - +2025-05-11 03:32:24 - INFO - stdout - {'loss': 0.7311, 'grad_norm': 0.6327788233757019, 'learning_rate': 1.1885039728766006e-05, 'epoch': 1.37} +2025-05-11 03:32:24 - ERROR - stderr - 46%|████▌ | 1708/3741 [10:06:31<11:21:28, 20.11s/it] +2025-05-11 03:32:44 - ERROR - stderr - 46%|████▌ | 1709/3741 [10:06:50<11:17:37, 20.01s/it] +2025-05-11 03:32:44 - ERROR - stderr - +2025-05-11 03:32:44 - ERROR - stderr - +2025-05-11 03:32:44 - INFO - stdout - {'loss': 0.7488, 'grad_norm': 0.6967527270317078, 'learning_rate': 1.187653496707959e-05, 'epoch': 1.37} +2025-05-11 03:32:44 - ERROR - stderr - 46%|████▌ | 1709/3741 [10:06:50<11:17:37, 20.01s/it] +2025-05-11 03:33:07 - ERROR - stderr - 46%|████▌ | 1710/3741 [10:07:13<11:46:47, 20.88s/it] +2025-05-11 03:33:07 - ERROR - stderr - +2025-05-11 03:33:07 - ERROR - stderr - +2025-05-11 03:33:07 - INFO - stdout - {'loss': 0.7433, 'grad_norm': 0.6776365041732788, 'learning_rate': 1.1868028798303346e-05, 'epoch': 1.37} +2025-05-11 03:33:07 - ERROR - stderr - 46%|████▌ | 1710/3741 [10:07:13<11:46:47, 20.88s/it] +2025-05-11 03:33:26 - ERROR - stderr - 46%|████▌ | 1711/3741 [10:07:33<11:30:57, 20.42s/it] +2025-05-11 03:33:26 - ERROR - stderr - +2025-05-11 03:33:26 - ERROR - stderr - +2025-05-11 03:33:26 - INFO - stdout - {'loss': 0.7492, 'grad_norm': 0.6518088579177856, 'learning_rate': 1.1859521228815495e-05, 'epoch': 1.37} +2025-05-11 03:33:26 - ERROR - stderr - 46%|████▌ | 1711/3741 [10:07:33<11:30:57, 20.42s/it] +2025-05-11 03:33:50 - ERROR - stderr - 46%|████▌ | 1712/3741 [10:07:56<11:59:45, 21.28s/it] +2025-05-11 03:33:50 - ERROR - stderr - +2025-05-11 03:33:50 - ERROR - stderr - +2025-05-11 03:33:50 - INFO - stdout - {'loss': 0.7494, 'grad_norm': 0.7493876814842224, 'learning_rate': 1.1851012264995296e-05, 'epoch': 1.37} +2025-05-11 03:33:50 - ERROR - stderr - 46%|████▌ | 1712/3741 [10:07:56<11:59:45, 21.28s/it] +2025-05-11 03:34:09 - ERROR - stderr - 46%|████▌ | 1713/3741 [10:08:15<11:41:37, 20.76s/it] +2025-05-11 03:34:09 - ERROR - stderr - +2025-05-11 03:34:09 - ERROR - stderr - +2025-05-11 03:34:09 - INFO - stdout - {'loss': 0.7581, 'grad_norm': 0.6971762180328369, 'learning_rate': 1.1842501913223066e-05, 'epoch': 1.37} +2025-05-11 03:34:09 - ERROR - stderr - 46%|████▌ | 1713/3741 [10:08:15<11:41:37, 20.76s/it] +2025-05-11 03:34:30 - ERROR - stderr - 46%|████▌ | 1714/3741 [10:08:36<11:42:32, 20.80s/it] +2025-05-11 03:34:30 - ERROR - stderr - +2025-05-11 03:34:30 - ERROR - stderr - +2025-05-11 03:34:30 - INFO - stdout - {'loss': 0.7684, 'grad_norm': 0.6640205979347229, 'learning_rate': 1.1833990179880148e-05, 'epoch': 1.37} +2025-05-11 03:34:30 - ERROR - stderr - 46%|████▌ | 1714/3741 [10:08:36<11:42:32, 20.80s/it] +2025-05-11 03:34:50 - ERROR - stderr - 46%|████▌ | 1715/3741 [10:08:56<11:30:36, 20.45s/it] +2025-05-11 03:34:50 - ERROR - stderr - +2025-05-11 03:34:50 - ERROR - stderr - +2025-05-11 03:34:50 - INFO - stdout - {'loss': 0.7265, 'grad_norm': 0.6524538397789001, 'learning_rate': 1.1825477071348937e-05, 'epoch': 1.38} +2025-05-11 03:34:50 - ERROR - stderr - 46%|████▌ | 1715/3741 [10:08:56<11:30:36, 20.45s/it] +2025-05-11 03:35:10 - ERROR - stderr - 46%|████▌ | 1716/3741 [10:09:16<11:25:15, 20.30s/it] +2025-05-11 03:35:10 - ERROR - stderr - +2025-05-11 03:35:10 - ERROR - stderr - +2025-05-11 03:35:10 - INFO - stdout - {'loss': 0.806, 'grad_norm': 0.7143647074699402, 'learning_rate': 1.1816962594012849e-05, 'epoch': 1.38} +2025-05-11 03:35:10 - ERROR - stderr - 46%|████▌ | 1716/3741 [10:09:16<11:25:15, 20.30s/it] +2025-05-11 03:35:30 - ERROR - stderr - 46%|████▌ | 1717/3741 [10:09:36<11:24:23, 20.29s/it] +2025-05-11 03:35:30 - ERROR - stderr - +2025-05-11 03:35:30 - ERROR - stderr - +2025-05-11 03:35:30 - INFO - stdout - {'loss': 0.7126, 'grad_norm': 0.605280876159668, 'learning_rate': 1.1808446754256329e-05, 'epoch': 1.38} +2025-05-11 03:35:30 - ERROR - stderr - 46%|████▌ | 1717/3741 [10:09:36<11:24:23, 20.29s/it] +2025-05-11 03:35:50 - ERROR - stderr - 46%|████▌ | 1718/3741 [10:09:56<11:18:30, 20.12s/it] +2025-05-11 03:35:50 - ERROR - stderr - +2025-05-11 03:35:50 - ERROR - stderr - +2025-05-11 03:35:50 - INFO - stdout - {'loss': 0.7651, 'grad_norm': 0.6456320881843567, 'learning_rate': 1.1799929558464843e-05, 'epoch': 1.38} +2025-05-11 03:35:50 - ERROR - stderr - 46%|████▌ | 1718/3741 [10:09:56<11:18:30, 20.12s/it] +2025-05-11 03:36:11 - ERROR - stderr - 46%|████▌ | 1719/3741 [10:10:18<11:35:45, 20.65s/it] +2025-05-11 03:36:11 - ERROR - stderr - +2025-05-11 03:36:11 - ERROR - stderr - +2025-05-11 03:36:11 - INFO - stdout - {'loss': 0.804, 'grad_norm': 0.7090603709220886, 'learning_rate': 1.1791411013024873e-05, 'epoch': 1.38} +2025-05-11 03:36:11 - ERROR - stderr - 46%|████▌ | 1719/3741 [10:10:18<11:35:45, 20.65s/it] +2025-05-11 03:36:31 - ERROR - stderr - 46%|████▌ | 1720/3741 [10:10:38<11:26:40, 20.39s/it] +2025-05-11 03:36:31 - ERROR - stderr - +2025-05-11 03:36:31 - ERROR - stderr - +2025-05-11 03:36:31 - INFO - stdout - {'loss': 0.7683, 'grad_norm': 0.6659294962882996, 'learning_rate': 1.178289112432392e-05, 'epoch': 1.38} +2025-05-11 03:36:31 - ERROR - stderr - 46%|████▌ | 1720/3741 [10:10:38<11:26:40, 20.39s/it] +2025-05-11 03:36:53 - ERROR - stderr - 46%|████▌ | 1721/3741 [10:11:00<11:43:37, 20.90s/it] +2025-05-11 03:36:53 - ERROR - stderr - +2025-05-11 03:36:53 - ERROR - stderr - +2025-05-11 03:36:53 - INFO - stdout - {'loss': 0.7351, 'grad_norm': 0.6606502532958984, 'learning_rate': 1.1774369898750484e-05, 'epoch': 1.38} +2025-05-11 03:36:53 - ERROR - stderr - 46%|████▌ | 1721/3741 [10:11:00<11:43:37, 20.90s/it] +2025-05-11 03:37:13 - ERROR - stderr - 46%|████▌ | 1722/3741 [10:11:19<11:29:10, 20.48s/it] +2025-05-11 03:37:13 - ERROR - stderr - +2025-05-11 03:37:13 - ERROR - stderr - +2025-05-11 03:37:13 - INFO - stdout - {'loss': 0.7632, 'grad_norm': 0.6632405519485474, 'learning_rate': 1.176584734269407e-05, 'epoch': 1.38} +2025-05-11 03:37:13 - ERROR - stderr - 46%|████▌ | 1722/3741 [10:11:19<11:29:10, 20.48s/it] +2025-05-11 03:37:35 - ERROR - stderr - 46%|████▌ | 1723/3741 [10:11:41<11:44:24, 20.94s/it] +2025-05-11 03:37:35 - ERROR - stderr - +2025-05-11 03:37:35 - ERROR - stderr - +2025-05-11 03:37:35 - INFO - stdout - {'loss': 0.7277, 'grad_norm': 0.6848816275596619, 'learning_rate': 1.1757323462545177e-05, 'epoch': 1.38} +2025-05-11 03:37:35 - ERROR - stderr - 46%|████▌ | 1723/3741 [10:11:41<11:44:24, 20.94s/it] +2025-05-11 03:37:55 - ERROR - stderr - 46%|████▌ | 1724/3741 [10:12:01<11:32:49, 20.61s/it] +2025-05-11 03:37:55 - ERROR - stderr - +2025-05-11 03:37:55 - ERROR - stderr - +2025-05-11 03:37:55 - INFO - stdout - {'loss': 0.7468, 'grad_norm': 0.6847621202468872, 'learning_rate': 1.1748798264695305e-05, 'epoch': 1.38} +2025-05-11 03:37:55 - ERROR - stderr - 46%|████▌ | 1724/3741 [10:12:01<11:32:49, 20.61s/it] +2025-05-11 03:38:14 - ERROR - stderr - 46%|████▌ | 1725/3741 [10:12:21<11:21:28, 20.28s/it] +2025-05-11 03:38:14 - ERROR - stderr - +2025-05-11 03:38:14 - ERROR - stderr - +2025-05-11 03:38:14 - INFO - stdout - {'loss': 0.7894, 'grad_norm': 0.7737583518028259, 'learning_rate': 1.1740271755536939e-05, 'epoch': 1.38} +2025-05-11 03:38:14 - ERROR - stderr - 46%|████▌ | 1725/3741 [10:12:21<11:21:28, 20.28s/it] +2025-05-11 03:38:34 - ERROR - stderr - 46%|████▌ | 1726/3741 [10:12:40<11:13:21, 20.05s/it] +2025-05-11 03:38:34 - ERROR - stderr - +2025-05-11 03:38:34 - ERROR - stderr - +2025-05-11 03:38:34 - INFO - stdout - {'loss': 0.7865, 'grad_norm': 0.6926838755607605, 'learning_rate': 1.173174394146354e-05, 'epoch': 1.38} +2025-05-11 03:38:34 - ERROR - stderr - 46%|████▌ | 1726/3741 [10:12:40<11:13:21, 20.05s/it] +2025-05-11 03:38:53 - ERROR - stderr - 46%|████▌ | 1727/3741 [10:13:00<11:09:04, 19.93s/it] +2025-05-11 03:38:53 - ERROR - stderr - +2025-05-11 03:38:53 - ERROR - stderr - +2025-05-11 03:38:53 - INFO - stdout - {'loss': 0.7498, 'grad_norm': 0.7265371680259705, 'learning_rate': 1.172321482886956e-05, 'epoch': 1.38} +2025-05-11 03:38:53 - ERROR - stderr - 46%|████▌ | 1727/3741 [10:13:00<11:09:04, 19.93s/it] +2025-05-11 03:39:13 - ERROR - stderr - 46%|████▌ | 1728/3741 [10:13:19<11:05:51, 19.85s/it] +2025-05-11 03:39:13 - ERROR - stderr - +2025-05-11 03:39:13 - ERROR - stderr - +2025-05-11 03:39:13 - INFO - stdout - {'loss': 0.7666, 'grad_norm': 0.6517270803451538, 'learning_rate': 1.1714684424150413e-05, 'epoch': 1.39} +2025-05-11 03:39:13 - ERROR - stderr - 46%|████▌ | 1728/3741 [10:13:19<11:05:51, 19.85s/it] +2025-05-11 03:39:32 - ERROR - stderr - 46%|████▌ | 1729/3741 [10:13:39<11:00:52, 19.71s/it] +2025-05-11 03:39:32 - ERROR - stderr - +2025-05-11 03:39:32 - ERROR - stderr - +2025-05-11 03:39:32 - INFO - stdout - {'loss': 0.7528, 'grad_norm': 0.7151353359222412, 'learning_rate': 1.1706152733702489e-05, 'epoch': 1.39} +2025-05-11 03:39:32 - ERROR - stderr - 46%|████▌ | 1729/3741 [10:13:39<11:00:52, 19.71s/it] +2025-05-11 03:39:54 - ERROR - stderr - 46%|████▌ | 1730/3741 [10:14:00<11:17:57, 20.23s/it] +2025-05-11 03:39:54 - ERROR - stderr - +2025-05-11 03:39:54 - ERROR - stderr - +2025-05-11 03:39:54 - INFO - stdout - {'loss': 0.7428, 'grad_norm': 0.6981160044670105, 'learning_rate': 1.1697619763923143e-05, 'epoch': 1.39} +2025-05-11 03:39:54 - ERROR - stderr - 46%|████▌ | 1730/3741 [10:14:00<11:17:57, 20.23s/it] +2025-05-11 03:40:14 - ERROR - stderr - 46%|████▋ | 1731/3741 [10:14:20<11:11:46, 20.05s/it] +2025-05-11 03:40:14 - ERROR - stderr - +2025-05-11 03:40:14 - ERROR - stderr - +2025-05-11 03:40:14 - INFO - stdout - {'loss': 0.7919, 'grad_norm': 0.7034778594970703, 'learning_rate': 1.168908552121068e-05, 'epoch': 1.39} +2025-05-11 03:40:14 - ERROR - stderr - 46%|████▋ | 1731/3741 [10:14:20<11:11:46, 20.05s/it] +2025-05-11 03:40:36 - ERROR - stderr - 46%|████▋ | 1732/3741 [10:14:42<11:33:14, 20.70s/it] +2025-05-11 03:40:36 - ERROR - stderr - +2025-05-11 03:40:36 - ERROR - stderr - +2025-05-11 03:40:36 - INFO - stdout - {'loss': 0.7575, 'grad_norm': 0.6916285753250122, 'learning_rate': 1.1680550011964374e-05, 'epoch': 1.39} +2025-05-11 03:40:36 - ERROR - stderr - 46%|████▋ | 1732/3741 [10:14:42<11:33:14, 20.70s/it] +2025-05-11 03:40:56 - ERROR - stderr - 46%|████▋ | 1733/3741 [10:15:02<11:24:08, 20.44s/it] +2025-05-11 03:40:56 - ERROR - stderr - +2025-05-11 03:40:56 - ERROR - stderr - +2025-05-11 03:40:56 - INFO - stdout - {'loss': 0.7571, 'grad_norm': 0.660354495048523, 'learning_rate': 1.167201324258443e-05, 'epoch': 1.39} +2025-05-11 03:40:56 - ERROR - stderr - 46%|████▋ | 1733/3741 [10:15:02<11:24:08, 20.44s/it] +2025-05-11 03:41:18 - ERROR - stderr - 46%|████▋ | 1734/3741 [10:15:24<11:39:54, 20.92s/it] +2025-05-11 03:41:18 - ERROR - stderr - +2025-05-11 03:41:18 - ERROR - stderr - +2025-05-11 03:41:18 - INFO - stdout - {'loss': 0.7792, 'grad_norm': 0.7056530117988586, 'learning_rate': 1.166347521947202e-05, 'epoch': 1.39} +2025-05-11 03:41:18 - ERROR - stderr - 46%|████▋ | 1734/3741 [10:15:24<11:39:54, 20.92s/it] +2025-05-11 03:41:38 - ERROR - stderr - 46%|████▋ | 1735/3741 [10:15:44<11:30:26, 20.65s/it] +2025-05-11 03:41:38 - ERROR - stderr - +2025-05-11 03:41:38 - ERROR - stderr - +2025-05-11 03:41:38 - INFO - stdout - {'loss': 0.725, 'grad_norm': 0.6491231918334961, 'learning_rate': 1.1654935949029234e-05, 'epoch': 1.39} +2025-05-11 03:41:38 - ERROR - stderr - 46%|████▋ | 1735/3741 [10:15:44<11:30:26, 20.65s/it] +2025-05-11 03:42:00 - ERROR - stderr - 46%|████▋ | 1736/3741 [10:16:06<11:48:30, 21.20s/it] +2025-05-11 03:42:00 - ERROR - stderr - +2025-05-11 03:42:00 - ERROR - stderr - +2025-05-11 03:42:00 - INFO - stdout - {'loss': 0.791, 'grad_norm': 0.681361198425293, 'learning_rate': 1.1646395437659112e-05, 'epoch': 1.39} +2025-05-11 03:42:00 - ERROR - stderr - 46%|████▋ | 1736/3741 [10:16:06<11:48:30, 21.20s/it] +2025-05-11 03:42:20 - ERROR - stderr - 46%|████▋ | 1737/3741 [10:16:26<11:30:33, 20.68s/it] +2025-05-11 03:42:20 - ERROR - stderr - +2025-05-11 03:42:20 - ERROR - stderr - +2025-05-11 03:42:20 - INFO - stdout - {'loss': 0.769, 'grad_norm': 0.6471366286277771, 'learning_rate': 1.1637853691765625e-05, 'epoch': 1.39} +2025-05-11 03:42:20 - ERROR - stderr - 46%|████▋ | 1737/3741 [10:16:26<11:30:33, 20.68s/it] +2025-05-11 03:42:39 - ERROR - stderr - 46%|████▋ | 1738/3741 [10:16:45<11:18:58, 20.34s/it] +2025-05-11 03:42:39 - ERROR - stderr - +2025-05-11 03:42:39 - ERROR - stderr - +2025-05-11 03:42:39 - INFO - stdout - {'loss': 0.7581, 'grad_norm': 0.7128544449806213, 'learning_rate': 1.162931071775366e-05, 'epoch': 1.39} +2025-05-11 03:42:39 - ERROR - stderr - 46%|████▋ | 1738/3741 [10:16:45<11:18:58, 20.34s/it] +2025-05-11 03:42:59 - ERROR - stderr - 46%|████▋ | 1739/3741 [10:17:05<11:13:57, 20.20s/it] +2025-05-11 03:42:59 - ERROR - stderr - +2025-05-11 03:42:59 - ERROR - stderr - +2025-05-11 03:42:59 - INFO - stdout - {'loss': 0.7889, 'grad_norm': 0.6705619692802429, 'learning_rate': 1.162076652202903e-05, 'epoch': 1.39} +2025-05-11 03:42:59 - ERROR - stderr - 46%|████▋ | 1739/3741 [10:17:05<11:13:57, 20.20s/it] +2025-05-11 03:43:19 - ERROR - stderr - 47%|████▋ | 1740/3741 [10:17:25<11:11:54, 20.15s/it] +2025-05-11 03:43:19 - ERROR - stderr - +2025-05-11 03:43:19 - ERROR - stderr - +2025-05-11 03:43:19 - INFO - stdout - {'loss': 0.7282, 'grad_norm': 0.642647922039032, 'learning_rate': 1.1612221110998463e-05, 'epoch': 1.4} +2025-05-11 03:43:19 - ERROR - stderr - 47%|████▋ | 1740/3741 [10:17:25<11:11:54, 20.15s/it] +2025-05-11 03:43:41 - ERROR - stderr - 47%|████▋ | 1741/3741 [10:17:47<11:25:24, 20.56s/it] +2025-05-11 03:43:41 - ERROR - stderr - +2025-05-11 03:43:41 - ERROR - stderr - +2025-05-11 03:43:41 - INFO - stdout - {'loss': 0.7476, 'grad_norm': 0.6899245381355286, 'learning_rate': 1.16036744910696e-05, 'epoch': 1.4} +2025-05-11 03:43:41 - ERROR - stderr - 47%|████▋ | 1741/3741 [10:17:47<11:25:24, 20.56s/it] +2025-05-11 03:44:00 - ERROR - stderr - 47%|████▋ | 1742/3741 [10:18:06<11:13:14, 20.21s/it] +2025-05-11 03:44:00 - ERROR - stderr - +2025-05-11 03:44:00 - ERROR - stderr - +2025-05-11 03:44:00 - INFO - stdout - {'loss': 0.7197, 'grad_norm': 0.6138514876365662, 'learning_rate': 1.1595126668650993e-05, 'epoch': 1.4} +2025-05-11 03:44:00 - ERROR - stderr - 47%|████▋ | 1742/3741 [10:18:06<11:13:14, 20.21s/it] +2025-05-11 03:44:22 - ERROR - stderr - 47%|████▋ | 1743/3741 [10:18:28<11:29:32, 20.71s/it] +2025-05-11 03:44:22 - ERROR - stderr - +2025-05-11 03:44:22 - ERROR - stderr - +2025-05-11 03:44:22 - INFO - stdout - {'loss': 0.7255, 'grad_norm': 0.7220246195793152, 'learning_rate': 1.1586577650152084e-05, 'epoch': 1.4} +2025-05-11 03:44:22 - ERROR - stderr - 47%|████▋ | 1743/3741 [10:18:28<11:29:32, 20.71s/it] +2025-05-11 03:44:41 - ERROR - stderr - 47%|████▋ | 1744/3741 [10:18:48<11:17:44, 20.36s/it] +2025-05-11 03:44:41 - ERROR - stderr - +2025-05-11 03:44:41 - ERROR - stderr - +2025-05-11 03:44:41 - INFO - stdout - {'loss': 0.7612, 'grad_norm': 0.6729558706283569, 'learning_rate': 1.1578027441983219e-05, 'epoch': 1.4} +2025-05-11 03:44:41 - ERROR - stderr - 47%|████▋ | 1744/3741 [10:18:48<11:17:44, 20.36s/it] +2025-05-11 03:45:04 - ERROR - stderr - 47%|████▋ | 1745/3741 [10:19:10<11:38:26, 21.00s/it] +2025-05-11 03:45:04 - ERROR - stderr - +2025-05-11 03:45:04 - ERROR - stderr - +2025-05-11 03:45:04 - INFO - stdout - {'loss': 0.7713, 'grad_norm': 0.6875990629196167, 'learning_rate': 1.1569476050555637e-05, 'epoch': 1.4} +2025-05-11 03:45:04 - ERROR - stderr - 47%|████▋ | 1745/3741 [10:19:10<11:38:26, 21.00s/it] +2025-05-11 03:45:23 - ERROR - stderr - 47%|████▋ | 1746/3741 [10:19:30<11:22:35, 20.53s/it] +2025-05-11 03:45:23 - ERROR - stderr - +2025-05-11 03:45:23 - ERROR - stderr - +2025-05-11 03:45:23 - INFO - stdout - {'loss': 0.7852, 'grad_norm': 0.6855756044387817, 'learning_rate': 1.156092348228146e-05, 'epoch': 1.4} +2025-05-11 03:45:23 - ERROR - stderr - 47%|████▋ | 1746/3741 [10:19:30<11:22:35, 20.53s/it] +2025-05-11 03:45:45 - ERROR - stderr - 47%|████▋ | 1747/3741 [10:19:51<11:35:34, 20.93s/it] +2025-05-11 03:45:45 - ERROR - stderr - +2025-05-11 03:45:45 - ERROR - stderr - +2025-05-11 03:45:45 - INFO - stdout - {'loss': 0.7576, 'grad_norm': 0.666982114315033, 'learning_rate': 1.1552369743573699e-05, 'epoch': 1.4} +2025-05-11 03:45:45 - ERROR - stderr - 47%|████▋ | 1747/3741 [10:19:51<11:35:34, 20.93s/it] +2025-05-11 03:46:05 - ERROR - stderr - 47%|████▋ | 1748/3741 [10:20:11<11:19:49, 20.47s/it] +2025-05-11 03:46:05 - ERROR - stderr - +2025-05-11 03:46:05 - ERROR - stderr - +2025-05-11 03:46:05 - INFO - stdout - {'loss': 0.7467, 'grad_norm': 0.6429185271263123, 'learning_rate': 1.1543814840846237e-05, 'epoch': 1.4} +2025-05-11 03:46:05 - ERROR - stderr - 47%|████▋ | 1748/3741 [10:20:11<11:19:49, 20.47s/it] +2025-05-11 03:46:25 - ERROR - stderr - 47%|████▋ | 1749/3741 [10:20:31<11:15:21, 20.34s/it] +2025-05-11 03:46:25 - ERROR - stderr - +2025-05-11 03:46:25 - ERROR - stderr - +2025-05-11 03:46:25 - INFO - stdout - {'loss': 0.7397, 'grad_norm': 0.6782676577568054, 'learning_rate': 1.153525878051383e-05, 'epoch': 1.4} +2025-05-11 03:46:25 - ERROR - stderr - 47%|████▋ | 1749/3741 [10:20:31<11:15:21, 20.34s/it] +2025-05-11 03:46:44 - ERROR - stderr - 47%|████▋ | 1750/3741 [10:20:50<11:07:50, 20.13s/it] +2025-05-11 03:46:44 - ERROR - stderr - +2025-05-11 03:46:44 - ERROR - stderr - +2025-05-11 03:46:44 - INFO - stdout - {'loss': 0.7627, 'grad_norm': 0.6263085007667542, 'learning_rate': 1.1526701568992102e-05, 'epoch': 1.4} +2025-05-11 03:46:44 - ERROR - stderr - 47%|████▋ | 1750/3741 [10:20:51<11:07:50, 20.13s/it] +2025-05-11 03:47:04 - ERROR - stderr - 47%|████▋ | 1751/3741 [10:21:10<11:02:07, 19.96s/it] +2025-05-11 03:47:04 - ERROR - stderr - +2025-05-11 03:47:04 - ERROR - stderr - +2025-05-11 03:47:04 - INFO - stdout - {'loss': 0.7476, 'grad_norm': 0.7163293361663818, 'learning_rate': 1.1518143212697547e-05, 'epoch': 1.4} +2025-05-11 03:47:04 - ERROR - stderr - 47%|████▋ | 1751/3741 [10:21:10<11:02:07, 19.96s/it] +2025-05-11 03:47:26 - ERROR - stderr - 47%|████▋ | 1752/3741 [10:21:32<11:25:44, 20.69s/it] +2025-05-11 03:47:26 - ERROR - stderr - +2025-05-11 03:47:26 - ERROR - stderr - +2025-05-11 03:47:26 - INFO - stdout - {'loss': 0.7012, 'grad_norm': 0.6758559942245483, 'learning_rate': 1.1509583718047508e-05, 'epoch': 1.4} +2025-05-11 03:47:26 - ERROR - stderr - 47%|████▋ | 1752/3741 [10:21:32<11:25:44, 20.69s/it] +2025-05-11 03:47:45 - ERROR - stderr - 47%|████▋ | 1753/3741 [10:21:52<11:11:44, 20.27s/it] +2025-05-11 03:47:45 - ERROR - stderr - +2025-05-11 03:47:45 - ERROR - stderr - +2025-05-11 03:47:45 - INFO - stdout - {'loss': 0.7344, 'grad_norm': 0.6683815717697144, 'learning_rate': 1.1501023091460187e-05, 'epoch': 1.41} +2025-05-11 03:47:45 - ERROR - stderr - 47%|████▋ | 1753/3741 [10:21:52<11:11:44, 20.27s/it] +2025-05-11 03:48:08 - ERROR - stderr - 47%|████▋ | 1754/3741 [10:22:14<11:29:59, 20.84s/it] +2025-05-11 03:48:08 - ERROR - stderr - +2025-05-11 03:48:08 - ERROR - stderr - +2025-05-11 03:48:08 - INFO - stdout - {'loss': 0.7192, 'grad_norm': 0.7088032364845276, 'learning_rate': 1.149246133935463e-05, 'epoch': 1.41} +2025-05-11 03:48:08 - ERROR - stderr - 47%|████▋ | 1754/3741 [10:22:14<11:29:59, 20.84s/it] +2025-05-11 03:48:29 - ERROR - stderr - 47%|████▋ | 1755/3741 [10:22:35<11:37:05, 21.06s/it] +2025-05-11 03:48:29 - ERROR - stderr - +2025-05-11 03:48:29 - ERROR - stderr - +2025-05-11 03:48:29 - INFO - stdout - {'loss': 0.7312, 'grad_norm': 0.6811531782150269, 'learning_rate': 1.1483898468150736e-05, 'epoch': 1.41} +2025-05-11 03:48:29 - ERROR - stderr - 47%|████▋ | 1755/3741 [10:22:36<11:37:05, 21.06s/it] +2025-05-11 03:48:51 - ERROR - stderr - 47%|████▋ | 1756/3741 [10:22:58<11:46:32, 21.36s/it] +2025-05-11 03:48:51 - ERROR - stderr - +2025-05-11 03:48:51 - ERROR - stderr - +2025-05-11 03:48:51 - INFO - stdout - {'loss': 0.7249, 'grad_norm': 0.6816032528877258, 'learning_rate': 1.1475334484269234e-05, 'epoch': 1.41} +2025-05-11 03:48:51 - ERROR - stderr - 47%|████▋ | 1756/3741 [10:22:58<11:46:32, 21.36s/it] +2025-05-11 03:49:11 - ERROR - stderr - 47%|████▋ | 1757/3741 [10:23:17<11:29:21, 20.85s/it] +2025-05-11 03:49:11 - ERROR - stderr - +2025-05-11 03:49:11 - ERROR - stderr - +2025-05-11 03:49:11 - INFO - stdout - {'loss': 0.7065, 'grad_norm': 0.6524032354354858, 'learning_rate': 1.146676939413169e-05, 'epoch': 1.41} +2025-05-11 03:49:11 - ERROR - stderr - 47%|████▋ | 1757/3741 [10:23:17<11:29:21, 20.85s/it] +2025-05-11 03:49:31 - ERROR - stderr - 47%|████▋ | 1758/3741 [10:23:37<11:17:13, 20.49s/it] +2025-05-11 03:49:31 - ERROR - stderr - +2025-05-11 03:49:31 - ERROR - stderr - +2025-05-11 03:49:31 - INFO - stdout - {'loss': 0.7836, 'grad_norm': 0.6719584465026855, 'learning_rate': 1.1458203204160503e-05, 'epoch': 1.41} +2025-05-11 03:49:31 - ERROR - stderr - 47%|████▋ | 1758/3741 [10:23:37<11:17:13, 20.49s/it] +2025-05-11 03:49:50 - ERROR - stderr - 47%|████▋ | 1759/3741 [10:23:56<11:08:22, 20.23s/it] +2025-05-11 03:49:50 - ERROR - stderr - +2025-05-11 03:49:50 - ERROR - stderr - +2025-05-11 03:49:50 - INFO - stdout - {'loss': 0.7608, 'grad_norm': 0.6611328721046448, 'learning_rate': 1.1449635920778894e-05, 'epoch': 1.41} +2025-05-11 03:49:50 - ERROR - stderr - 47%|████▋ | 1759/3741 [10:23:57<11:08:22, 20.23s/it] +2025-05-11 03:50:10 - ERROR - stderr - 47%|████▋ | 1760/3741 [10:24:16<11:01:59, 20.05s/it] +2025-05-11 03:50:10 - ERROR - stderr - +2025-05-11 03:50:10 - ERROR - stderr - +2025-05-11 03:50:10 - INFO - stdout - {'loss': 0.7607, 'grad_norm': 0.6723156571388245, 'learning_rate': 1.14410675504109e-05, 'epoch': 1.41} +2025-05-11 03:50:10 - ERROR - stderr - 47%|████▋ | 1760/3741 [10:24:16<11:01:59, 20.05s/it] +2025-05-11 03:50:32 - ERROR - stderr - 47%|████▋ | 1761/3741 [10:24:39<11:26:08, 20.79s/it] +2025-05-11 03:50:32 - ERROR - stderr - +2025-05-11 03:50:32 - ERROR - stderr - +2025-05-11 03:50:32 - INFO - stdout - {'loss': 0.7835, 'grad_norm': 0.7079909443855286, 'learning_rate': 1.143249809948138e-05, 'epoch': 1.41} +2025-05-11 03:50:32 - ERROR - stderr - 47%|████▋ | 1761/3741 [10:24:39<11:26:08, 20.79s/it] +2025-05-11 03:50:52 - ERROR - stderr - 47%|████▋ | 1762/3741 [10:24:58<11:13:48, 20.43s/it] +2025-05-11 03:50:52 - ERROR - stderr - +2025-05-11 03:50:52 - ERROR - stderr - +2025-05-11 03:50:52 - INFO - stdout - {'loss': 0.7477, 'grad_norm': 0.6389018893241882, 'learning_rate': 1.1423927574415998e-05, 'epoch': 1.41} +2025-05-11 03:50:52 - ERROR - stderr - 47%|████▋ | 1762/3741 [10:24:58<11:13:48, 20.43s/it] +2025-05-11 03:51:14 - ERROR - stderr - 47%|████▋ | 1763/3741 [10:25:21<11:34:42, 21.07s/it] +2025-05-11 03:51:15 - ERROR - stderr - +2025-05-11 03:51:15 - ERROR - stderr - +2025-05-11 03:51:15 - INFO - stdout - {'loss': 0.7811, 'grad_norm': 0.7065446972846985, 'learning_rate': 1.1415355981641229e-05, 'epoch': 1.41} +2025-05-11 03:51:15 - ERROR - stderr - 47%|████▋ | 1763/3741 [10:25:21<11:34:42, 21.07s/it] +2025-05-11 03:51:34 - ERROR - stderr - 47%|████▋ | 1764/3741 [10:25:40<11:19:37, 20.63s/it] +2025-05-11 03:51:34 - ERROR - stderr - +2025-05-11 03:51:34 - ERROR - stderr - +2025-05-11 03:51:34 - INFO - stdout - {'loss': 0.7824, 'grad_norm': 0.6564798951148987, 'learning_rate': 1.1406783327584345e-05, 'epoch': 1.41} +2025-05-11 03:51:34 - ERROR - stderr - 47%|████▋ | 1764/3741 [10:25:40<11:19:37, 20.63s/it] +2025-05-11 03:51:57 - ERROR - stderr - 47%|████▋ | 1765/3741 [10:26:03<11:43:22, 21.36s/it] +2025-05-11 03:51:57 - ERROR - stderr - +2025-05-11 03:51:57 - ERROR - stderr - +2025-05-11 03:51:57 - INFO - stdout - {'loss': 0.7373, 'grad_norm': 0.6832185983657837, 'learning_rate': 1.139820961867341e-05, 'epoch': 1.42} +2025-05-11 03:51:57 - ERROR - stderr - 47%|████▋ | 1765/3741 [10:26:03<11:43:22, 21.36s/it] +2025-05-11 03:52:17 - ERROR - stderr - 47%|████▋ | 1766/3741 [10:26:23<11:29:35, 20.95s/it] +2025-05-11 03:52:17 - ERROR - stderr - +2025-05-11 03:52:17 - ERROR - stderr - +2025-05-11 03:52:17 - INFO - stdout - {'loss': 0.7716, 'grad_norm': 0.6919088363647461, 'learning_rate': 1.1389634861337284e-05, 'epoch': 1.42} +2025-05-11 03:52:17 - ERROR - stderr - 47%|████▋ | 1766/3741 [10:26:23<11:29:35, 20.95s/it] +2025-05-11 03:52:41 - ERROR - stderr - 47%|████▋ | 1767/3741 [10:26:47<11:57:00, 21.79s/it] +2025-05-11 03:52:41 - ERROR - stderr - +2025-05-11 03:52:41 - ERROR - stderr - +2025-05-11 03:52:41 - INFO - stdout - {'loss': 0.757, 'grad_norm': 0.6253665685653687, 'learning_rate': 1.1381059062005617e-05, 'epoch': 1.42} +2025-05-11 03:52:41 - ERROR - stderr - 47%|████▋ | 1767/3741 [10:26:47<11:57:00, 21.79s/it] +2025-05-11 03:53:01 - ERROR - stderr - 47%|████▋ | 1768/3741 [10:27:07<11:35:13, 21.14s/it] +2025-05-11 03:53:01 - ERROR - stderr - +2025-05-11 03:53:01 - ERROR - stderr - +2025-05-11 03:53:01 - INFO - stdout - {'loss': 0.7826, 'grad_norm': 0.7172982692718506, 'learning_rate': 1.137248222710883e-05, 'epoch': 1.42} +2025-05-11 03:53:01 - ERROR - stderr - 47%|████▋ | 1768/3741 [10:27:07<11:35:13, 21.14s/it] +2025-05-11 03:53:24 - ERROR - stderr - 47%|████▋ | 1769/3741 [10:27:30<11:54:17, 21.73s/it] +2025-05-11 03:53:24 - ERROR - stderr - +2025-05-11 03:53:24 - ERROR - stderr - +2025-05-11 03:53:24 - INFO - stdout - {'loss': 0.7525, 'grad_norm': 0.6555379629135132, 'learning_rate': 1.1363904363078126e-05, 'epoch': 1.42} +2025-05-11 03:53:24 - ERROR - stderr - 47%|████▋ | 1769/3741 [10:27:30<11:54:17, 21.73s/it] +2025-05-11 03:53:44 - ERROR - stderr - 47%|████▋ | 1770/3741 [10:27:50<11:40:39, 21.33s/it] +2025-05-11 03:53:44 - ERROR - stderr - +2025-05-11 03:53:44 - ERROR - stderr - +2025-05-11 03:53:44 - INFO - stdout - {'loss': 0.7448, 'grad_norm': 0.6733851432800293, 'learning_rate': 1.135532547634548e-05, 'epoch': 1.42} +2025-05-11 03:53:44 - ERROR - stderr - 47%|████▋ | 1770/3741 [10:27:50<11:40:39, 21.33s/it] +2025-05-11 03:54:06 - ERROR - stderr - 47%|████▋ | 1771/3741 [10:28:13<11:50:06, 21.63s/it] +2025-05-11 03:54:06 - ERROR - stderr - +2025-05-11 03:54:06 - ERROR - stderr - +2025-05-11 03:54:06 - INFO - stdout - {'loss': 0.7659, 'grad_norm': 0.6609330177307129, 'learning_rate': 1.1346745573343636e-05, 'epoch': 1.42} +2025-05-11 03:54:06 - ERROR - stderr - 47%|████▋ | 1771/3741 [10:28:13<11:50:06, 21.63s/it] +2025-05-11 03:54:26 - ERROR - stderr - 47%|████▋ | 1772/3741 [10:28:32<11:30:21, 21.04s/it] +2025-05-11 03:54:26 - ERROR - stderr - +2025-05-11 03:54:26 - ERROR - stderr - +2025-05-11 03:54:26 - INFO - stdout - {'loss': 0.7557, 'grad_norm': 0.6952093839645386, 'learning_rate': 1.13381646605061e-05, 'epoch': 1.42} +2025-05-11 03:54:26 - ERROR - stderr - 47%|████▋ | 1772/3741 [10:28:32<11:30:21, 21.04s/it] +2025-05-11 03:54:48 - ERROR - stderr - 47%|████▋ | 1773/3741 [10:28:54<11:41:07, 21.38s/it] +2025-05-11 03:54:48 - ERROR - stderr - +2025-05-11 03:54:48 - ERROR - stderr - +2025-05-11 03:54:48 - INFO - stdout - {'loss': 0.7528, 'grad_norm': 0.7016155123710632, 'learning_rate': 1.1329582744267125e-05, 'epoch': 1.42} +2025-05-11 03:54:48 - ERROR - stderr - 47%|████▋ | 1773/3741 [10:28:54<11:41:07, 21.38s/it] +2025-05-11 03:55:08 - ERROR - stderr - 47%|████▋ | 1774/3741 [10:29:14<11:25:13, 20.90s/it] +2025-05-11 03:55:08 - ERROR - stderr - +2025-05-11 03:55:08 - ERROR - stderr - +2025-05-11 03:55:08 - INFO - stdout - {'loss': 0.755, 'grad_norm': 0.6713129281997681, 'learning_rate': 1.1320999831061727e-05, 'epoch': 1.42} +2025-05-11 03:55:08 - ERROR - stderr - 47%|████▋ | 1774/3741 [10:29:14<11:25:13, 20.90s/it] +2025-05-11 03:55:09 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5807 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 03:55:09 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5807 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 03:55:28 - ERROR - stderr - 47%|████▋ | 1775/3741 [10:29:34<11:16:11, 20.64s/it] +2025-05-11 03:55:28 - ERROR - stderr - +2025-05-11 03:55:28 - ERROR - stderr - +2025-05-11 03:55:28 - INFO - stdout - {'loss': 0.7445, 'grad_norm': 0.6963139176368713, 'learning_rate': 1.1312415927325668e-05, 'epoch': 1.42} +2025-05-11 03:55:28 - ERROR - stderr - 47%|████▋ | 1775/3741 [10:29:34<11:16:11, 20.64s/it] +2025-05-11 03:55:52 - ERROR - stderr - 47%|████▋ | 1776/3741 [10:29:58<11:49:02, 21.65s/it] +2025-05-11 03:55:52 - ERROR - stderr - +2025-05-11 03:55:52 - ERROR - stderr - +2025-05-11 03:55:52 - INFO - stdout - {'loss': 0.7368, 'grad_norm': 0.6338862180709839, 'learning_rate': 1.1303831039495452e-05, 'epoch': 1.42} +2025-05-11 03:55:52 - ERROR - stderr - 47%|████▋ | 1776/3741 [10:29:58<11:49:02, 21.65s/it] +2025-05-11 03:56:11 - ERROR - stderr - 48%|████▊ | 1777/3741 [10:30:18<11:27:36, 21.01s/it] +2025-05-11 03:56:12 - ERROR - stderr - +2025-05-11 03:56:12 - ERROR - stderr - +2025-05-11 03:56:12 - INFO - stdout - {'loss': 0.7628, 'grad_norm': 0.6802634596824646, 'learning_rate': 1.1295245174008317e-05, 'epoch': 1.43} +2025-05-11 03:56:12 - ERROR - stderr - 48%|████▊ | 1777/3741 [10:30:18<11:27:36, 21.01s/it] +2025-05-11 03:56:31 - ERROR - stderr - 48%|████▊ | 1778/3741 [10:30:38<11:14:53, 20.63s/it] +2025-05-11 03:56:31 - ERROR - stderr - +2025-05-11 03:56:31 - ERROR - stderr - +2025-05-11 03:56:31 - INFO - stdout - {'loss': 0.7874, 'grad_norm': 0.6717724204063416, 'learning_rate': 1.1286658337302243e-05, 'epoch': 1.43} +2025-05-11 03:56:31 - ERROR - stderr - 48%|████▊ | 1778/3741 [10:30:38<11:14:53, 20.63s/it] +2025-05-11 03:56:51 - ERROR - stderr - 48%|████▊ | 1779/3741 [10:30:57<11:04:58, 20.34s/it] +2025-05-11 03:56:51 - ERROR - stderr - +2025-05-11 03:56:51 - ERROR - stderr - +2025-05-11 03:56:51 - INFO - stdout - {'loss': 0.7606, 'grad_norm': 0.6719787120819092, 'learning_rate': 1.1278070535815927e-05, 'epoch': 1.43} +2025-05-11 03:56:51 - ERROR - stderr - 48%|████▊ | 1779/3741 [10:30:57<11:04:58, 20.34s/it] +2025-05-11 03:57:11 - ERROR - stderr - 48%|████▊ | 1780/3741 [10:31:17<10:58:23, 20.14s/it] +2025-05-11 03:57:11 - ERROR - stderr - +2025-05-11 03:57:11 - ERROR - stderr - +2025-05-11 03:57:11 - INFO - stdout - {'loss': 0.7203, 'grad_norm': 0.6636870503425598, 'learning_rate': 1.1269481775988793e-05, 'epoch': 1.43} +2025-05-11 03:57:11 - ERROR - stderr - 48%|████▊ | 1780/3741 [10:31:17<10:58:23, 20.14s/it] +2025-05-11 03:57:30 - ERROR - stderr - 48%|████▊ | 1781/3741 [10:31:36<10:49:42, 19.89s/it] +2025-05-11 03:57:30 - ERROR - stderr - +2025-05-11 03:57:30 - ERROR - stderr - +2025-05-11 03:57:30 - INFO - stdout - {'loss': 0.7462, 'grad_norm': 0.6615099310874939, 'learning_rate': 1.1260892064260995e-05, 'epoch': 1.43} +2025-05-11 03:57:30 - ERROR - stderr - 48%|████▊ | 1781/3741 [10:31:36<10:49:42, 19.89s/it] +2025-05-11 03:57:50 - ERROR - stderr - 48%|████▊ | 1782/3741 [10:31:56<10:48:09, 19.85s/it] +2025-05-11 03:57:50 - ERROR - stderr - +2025-05-11 03:57:50 - ERROR - stderr - +2025-05-11 03:57:50 - INFO - stdout - {'loss': 0.7762, 'grad_norm': 0.703061044216156, 'learning_rate': 1.1252301407073386e-05, 'epoch': 1.43} +2025-05-11 03:57:50 - ERROR - stderr - 48%|████▊ | 1782/3741 [10:31:56<10:48:09, 19.85s/it] +2025-05-11 03:58:09 - ERROR - stderr - 48%|████▊ | 1783/3741 [10:32:16<10:46:18, 19.81s/it] +2025-05-11 03:58:09 - ERROR - stderr - +2025-05-11 03:58:09 - ERROR - stderr - +2025-05-11 03:58:09 - INFO - stdout - {'loss': 0.7658, 'grad_norm': 0.743383526802063, 'learning_rate': 1.124370981086753e-05, 'epoch': 1.43} +2025-05-11 03:58:09 - ERROR - stderr - 48%|████▊ | 1783/3741 [10:32:16<10:46:18, 19.81s/it] +2025-05-11 03:58:30 - ERROR - stderr - 48%|████▊ | 1784/3741 [10:32:37<10:57:53, 20.17s/it] +2025-05-11 03:58:30 - ERROR - stderr - +2025-05-11 03:58:30 - ERROR - stderr - +2025-05-11 03:58:30 - INFO - stdout - {'loss': 0.7852, 'grad_norm': 0.6267064809799194, 'learning_rate': 1.1235117282085704e-05, 'epoch': 1.43} +2025-05-11 03:58:30 - ERROR - stderr - 48%|████▊ | 1784/3741 [10:32:37<10:57:53, 20.17s/it] +2025-05-11 03:58:50 - ERROR - stderr - 48%|████▊ | 1785/3741 [10:32:57<10:54:23, 20.07s/it] +2025-05-11 03:58:50 - ERROR - stderr - +2025-05-11 03:58:50 - ERROR - stderr - +2025-05-11 03:58:50 - INFO - stdout - {'loss': 0.7605, 'grad_norm': 0.6929275393486023, 'learning_rate': 1.1226523827170876e-05, 'epoch': 1.43} +2025-05-11 03:58:50 - ERROR - stderr - 48%|████▊ | 1785/3741 [10:32:57<10:54:23, 20.07s/it] +2025-05-11 03:59:11 - ERROR - stderr - 48%|████▊ | 1786/3741 [10:33:18<11:03:28, 20.36s/it] +2025-05-11 03:59:11 - ERROR - stderr - +2025-05-11 03:59:11 - ERROR - stderr - +2025-05-11 03:59:11 - INFO - stdout - {'loss': 0.7657, 'grad_norm': 0.6413291692733765, 'learning_rate': 1.121792945256671e-05, 'epoch': 1.43} +2025-05-11 03:59:11 - ERROR - stderr - 48%|████▊ | 1786/3741 [10:33:18<11:03:28, 20.36s/it] +2025-05-11 03:59:31 - ERROR - stderr - 48%|████▊ | 1787/3741 [10:33:37<10:57:25, 20.19s/it] +2025-05-11 03:59:31 - ERROR - stderr - +2025-05-11 03:59:31 - ERROR - stderr - +2025-05-11 03:59:31 - INFO - stdout - {'loss': 0.7738, 'grad_norm': 0.6685841083526611, 'learning_rate': 1.1209334164717562e-05, 'epoch': 1.43} +2025-05-11 03:59:31 - ERROR - stderr - 48%|████▊ | 1787/3741 [10:33:37<10:57:25, 20.19s/it] +2025-05-11 03:59:53 - ERROR - stderr - 48%|████▊ | 1788/3741 [10:33:59<11:12:01, 20.65s/it] +2025-05-11 03:59:53 - ERROR - stderr - +2025-05-11 03:59:53 - ERROR - stderr - +2025-05-11 03:59:53 - INFO - stdout - {'loss': 0.7591, 'grad_norm': 0.697259247303009, 'learning_rate': 1.1200737970068476e-05, 'epoch': 1.43} +2025-05-11 03:59:53 - ERROR - stderr - 48%|████▊ | 1788/3741 [10:33:59<11:12:01, 20.65s/it] +2025-05-11 04:00:12 - ERROR - stderr - 48%|████▊ | 1789/3741 [10:34:18<10:59:20, 20.27s/it] +2025-05-11 04:00:12 - ERROR - stderr - +2025-05-11 04:00:12 - ERROR - stderr - +2025-05-11 04:00:12 - INFO - stdout - {'loss': 0.7636, 'grad_norm': 0.6685236096382141, 'learning_rate': 1.1192140875065167e-05, 'epoch': 1.43} +2025-05-11 04:00:12 - ERROR - stderr - 48%|████▊ | 1789/3741 [10:34:18<10:59:20, 20.27s/it] +2025-05-11 04:00:34 - ERROR - stderr - 48%|████▊ | 1790/3741 [10:34:40<11:10:47, 20.63s/it] +2025-05-11 04:00:34 - ERROR - stderr - +2025-05-11 04:00:34 - ERROR - stderr - +2025-05-11 04:00:34 - INFO - stdout - {'loss': 0.7549, 'grad_norm': 0.6757694482803345, 'learning_rate': 1.1183542886154027e-05, 'epoch': 1.44} +2025-05-11 04:00:34 - ERROR - stderr - 48%|████▊ | 1790/3741 [10:34:40<11:10:47, 20.63s/it] +2025-05-11 04:00:53 - ERROR - stderr - 48%|████▊ | 1791/3741 [10:34:59<10:59:18, 20.29s/it] +2025-05-11 04:00:53 - ERROR - stderr - +2025-05-11 04:00:53 - ERROR - stderr - +2025-05-11 04:00:53 - INFO - stdout - {'loss': 0.7774, 'grad_norm': 0.6777142286300659, 'learning_rate': 1.1174944009782123e-05, 'epoch': 1.44} +2025-05-11 04:00:53 - ERROR - stderr - 48%|████▊ | 1791/3741 [10:34:59<10:59:18, 20.29s/it] +2025-05-11 04:01:13 - ERROR - stderr - 48%|████▊ | 1792/3741 [10:35:19<10:53:45, 20.13s/it] +2025-05-11 04:01:13 - ERROR - stderr - +2025-05-11 04:01:13 - ERROR - stderr - +2025-05-11 04:01:13 - INFO - stdout - {'loss': 0.7841, 'grad_norm': 0.7203055620193481, 'learning_rate': 1.1166344252397187e-05, 'epoch': 1.44} +2025-05-11 04:01:13 - ERROR - stderr - 48%|████▊ | 1792/3741 [10:35:19<10:53:45, 20.13s/it] +2025-05-11 04:01:32 - ERROR - stderr - 48%|████▊ | 1793/3741 [10:35:39<10:47:42, 19.95s/it] +2025-05-11 04:01:32 - ERROR - stderr - +2025-05-11 04:01:32 - ERROR - stderr - +2025-05-11 04:01:32 - INFO - stdout - {'loss': 0.7389, 'grad_norm': 0.6814801096916199, 'learning_rate': 1.1157743620447611e-05, 'epoch': 1.44} +2025-05-11 04:01:32 - ERROR - stderr - 48%|████▊ | 1793/3741 [10:35:39<10:47:42, 19.95s/it] +2025-05-11 04:01:52 - ERROR - stderr - 48%|████▊ | 1794/3741 [10:35:58<10:43:07, 19.82s/it] +2025-05-11 04:01:52 - ERROR - stderr - +2025-05-11 04:01:52 - ERROR - stderr - +2025-05-11 04:01:52 - INFO - stdout - {'loss': 0.7395, 'grad_norm': 0.6721707582473755, 'learning_rate': 1.1149142120382443e-05, 'epoch': 1.44} +2025-05-11 04:01:52 - ERROR - stderr - 48%|████▊ | 1794/3741 [10:35:58<10:43:07, 19.82s/it] +2025-05-11 04:02:12 - ERROR - stderr - 48%|████▊ | 1795/3741 [10:36:18<10:44:58, 19.89s/it] +2025-05-11 04:02:12 - ERROR - stderr - +2025-05-11 04:02:12 - ERROR - stderr - +2025-05-11 04:02:12 - INFO - stdout - {'loss': 0.7273, 'grad_norm': 0.6581818461418152, 'learning_rate': 1.1140539758651372e-05, 'epoch': 1.44} +2025-05-11 04:02:12 - ERROR - stderr - 48%|████▊ | 1795/3741 [10:36:18<10:44:58, 19.89s/it] +2025-05-11 04:02:32 - ERROR - stderr - 48%|████▊ | 1796/3741 [10:36:38<10:43:43, 19.86s/it] +2025-05-11 04:02:32 - ERROR - stderr - +2025-05-11 04:02:32 - ERROR - stderr - +2025-05-11 04:02:32 - INFO - stdout - {'loss': 0.7649, 'grad_norm': 0.6775161027908325, 'learning_rate': 1.1131936541704749e-05, 'epoch': 1.44} +2025-05-11 04:02:32 - ERROR - stderr - 48%|████▊ | 1796/3741 [10:36:38<10:43:43, 19.86s/it] +2025-05-11 04:02:52 - ERROR - stderr - 48%|████▊ | 1797/3741 [10:36:59<10:51:46, 20.12s/it] +2025-05-11 04:02:52 - ERROR - stderr - +2025-05-11 04:02:52 - ERROR - stderr - +2025-05-11 04:02:52 - INFO - stdout - {'loss': 0.766, 'grad_norm': 0.6994383931159973, 'learning_rate': 1.112333247599356e-05, 'epoch': 1.44} +2025-05-11 04:02:52 - ERROR - stderr - 48%|████▊ | 1797/3741 [10:36:59<10:51:46, 20.12s/it] +2025-05-11 04:03:12 - ERROR - stderr - 48%|████▊ | 1798/3741 [10:37:18<10:45:57, 19.95s/it] +2025-05-11 04:03:12 - ERROR - stderr - +2025-05-11 04:03:12 - ERROR - stderr - +2025-05-11 04:03:12 - INFO - stdout - {'loss': 0.7642, 'grad_norm': 0.6743654012680054, 'learning_rate': 1.1114727567969423e-05, 'epoch': 1.44} +2025-05-11 04:03:12 - ERROR - stderr - 48%|████▊ | 1798/3741 [10:37:18<10:45:57, 19.95s/it] +2025-05-11 04:03:33 - ERROR - stderr - 48%|████▊ | 1799/3741 [10:37:40<10:58:41, 20.35s/it] +2025-05-11 04:03:33 - ERROR - stderr - +2025-05-11 04:03:33 - ERROR - stderr - +2025-05-11 04:03:33 - INFO - stdout - {'loss': 0.758, 'grad_norm': 0.6504762172698975, 'learning_rate': 1.1106121824084593e-05, 'epoch': 1.44} +2025-05-11 04:03:33 - ERROR - stderr - 48%|████▊ | 1799/3741 [10:37:40<10:58:41, 20.35s/it] +2025-05-11 04:03:53 - ERROR - stderr - 48%|████▊ | 1800/3741 [10:37:59<10:50:36, 20.11s/it] +2025-05-11 04:03:53 - ERROR - stderr - +2025-05-11 04:03:53 - ERROR - stderr - +2025-05-11 04:03:53 - INFO - stdout - {'loss': 0.7632, 'grad_norm': 0.6630826592445374, 'learning_rate': 1.1097515250791945e-05, 'epoch': 1.44} +2025-05-11 04:03:53 - ERROR - stderr - 48%|████▊ | 1800/3741 [10:37:59<10:50:36, 20.11s/it] +2025-05-11 04:04:15 - ERROR - stderr - 48%|████▊ | 1801/3741 [10:38:21<11:05:47, 20.59s/it] +2025-05-11 04:04:15 - ERROR - stderr - +2025-05-11 04:04:15 - ERROR - stderr - +2025-05-11 04:04:15 - INFO - stdout - {'loss': 0.7214, 'grad_norm': 0.6329621076583862, 'learning_rate': 1.1088907854544985e-05, 'epoch': 1.44} +2025-05-11 04:04:15 - ERROR - stderr - 48%|████▊ | 1801/3741 [10:38:21<11:05:47, 20.59s/it] +2025-05-11 04:04:34 - ERROR - stderr - 48%|████▊ | 1802/3741 [10:38:41<10:58:42, 20.38s/it] +2025-05-11 04:04:34 - ERROR - stderr - +2025-05-11 04:04:34 - ERROR - stderr - +2025-05-11 04:04:34 - INFO - stdout - {'loss': 0.732, 'grad_norm': 0.6646215319633484, 'learning_rate': 1.1080299641797837e-05, 'epoch': 1.45} +2025-05-11 04:04:34 - ERROR - stderr - 48%|████▊ | 1802/3741 [10:38:41<10:58:42, 20.38s/it] +2025-05-11 04:04:56 - ERROR - stderr - 48%|████▊ | 1803/3741 [10:39:02<11:06:33, 20.64s/it] +2025-05-11 04:04:56 - ERROR - stderr - +2025-05-11 04:04:56 - ERROR - stderr - +2025-05-11 04:04:56 - INFO - stdout - {'loss': 0.7722, 'grad_norm': 0.6855160593986511, 'learning_rate': 1.1071690619005224e-05, 'epoch': 1.45} +2025-05-11 04:04:56 - ERROR - stderr - 48%|████▊ | 1803/3741 [10:39:02<11:06:33, 20.64s/it] +2025-05-11 04:05:15 - ERROR - stderr - 48%|████▊ | 1804/3741 [10:39:22<10:56:05, 20.32s/it] +2025-05-11 04:05:15 - ERROR - stderr - +2025-05-11 04:05:15 - ERROR - stderr - +2025-05-11 04:05:15 - INFO - stdout - {'loss': 0.7716, 'grad_norm': 0.7373548746109009, 'learning_rate': 1.1063080792622484e-05, 'epoch': 1.45} +2025-05-11 04:05:15 - ERROR - stderr - 48%|████▊ | 1804/3741 [10:39:22<10:56:05, 20.32s/it] +2025-05-11 04:05:36 - ERROR - stderr - 48%|████▊ | 1805/3741 [10:39:43<11:04:28, 20.59s/it] +2025-05-11 04:05:37 - ERROR - stderr - +2025-05-11 04:05:37 - ERROR - stderr - +2025-05-11 04:05:37 - INFO - stdout - {'loss': 0.7635, 'grad_norm': 0.6910948753356934, 'learning_rate': 1.1054470169105564e-05, 'epoch': 1.45} +2025-05-11 04:05:37 - ERROR - stderr - 48%|████▊ | 1805/3741 [10:39:43<11:04:28, 20.59s/it] +2025-05-11 04:05:56 - ERROR - stderr - 48%|████▊ | 1806/3741 [10:40:02<10:52:40, 20.24s/it] +2025-05-11 04:05:56 - ERROR - stderr - +2025-05-11 04:05:56 - ERROR - stderr - +2025-05-11 04:05:56 - INFO - stdout - {'loss': 0.7724, 'grad_norm': 0.6741712093353271, 'learning_rate': 1.1045858754911001e-05, 'epoch': 1.45} +2025-05-11 04:05:56 - ERROR - stderr - 48%|████▊ | 1806/3741 [10:40:02<10:52:40, 20.24s/it] +2025-05-11 04:06:16 - ERROR - stderr - 48%|████▊ | 1807/3741 [10:40:22<10:48:50, 20.13s/it] +2025-05-11 04:06:16 - ERROR - stderr - +2025-05-11 04:06:16 - ERROR - stderr - +2025-05-11 04:06:16 - INFO - stdout - {'loss': 0.7664, 'grad_norm': 0.6909212470054626, 'learning_rate': 1.1037246556495922e-05, 'epoch': 1.45} +2025-05-11 04:06:16 - ERROR - stderr - 48%|████▊ | 1807/3741 [10:40:22<10:48:50, 20.13s/it] +2025-05-11 04:06:35 - ERROR - stderr - 48%|████▊ | 1808/3741 [10:40:42<10:43:17, 19.97s/it] +2025-05-11 04:06:35 - ERROR - stderr - +2025-05-11 04:06:35 - ERROR - stderr - +2025-05-11 04:06:35 - INFO - stdout - {'loss': 0.7787, 'grad_norm': 0.7179321050643921, 'learning_rate': 1.1028633580318056e-05, 'epoch': 1.45} +2025-05-11 04:06:35 - ERROR - stderr - 48%|████▊ | 1808/3741 [10:40:42<10:43:17, 19.97s/it] +2025-05-11 04:06:55 - ERROR - stderr - 48%|████▊ | 1809/3741 [10:41:01<10:38:43, 19.84s/it] +2025-05-11 04:06:55 - ERROR - stderr - +2025-05-11 04:06:55 - ERROR - stderr - +2025-05-11 04:06:55 - INFO - stdout - {'loss': 0.7634, 'grad_norm': 0.6822714805603027, 'learning_rate': 1.1020019832835694e-05, 'epoch': 1.45} +2025-05-11 04:06:55 - ERROR - stderr - 48%|████▊ | 1809/3741 [10:41:01<10:38:43, 19.84s/it] +2025-05-11 04:07:14 - ERROR - stderr - 48%|████▊ | 1810/3741 [10:41:21<10:34:57, 19.73s/it] +2025-05-11 04:07:14 - ERROR - stderr - +2025-05-11 04:07:14 - ERROR - stderr - +2025-05-11 04:07:14 - INFO - stdout - {'loss': 0.7614, 'grad_norm': 0.680316686630249, 'learning_rate': 1.1011405320507726e-05, 'epoch': 1.45} +2025-05-11 04:07:14 - ERROR - stderr - 48%|████▊ | 1810/3741 [10:41:21<10:34:57, 19.73s/it] +2025-05-11 04:07:34 - ERROR - stderr - 48%|████▊ | 1811/3741 [10:41:40<10:32:14, 19.65s/it] +2025-05-11 04:07:34 - ERROR - stderr - +2025-05-11 04:07:34 - ERROR - stderr - +2025-05-11 04:07:34 - INFO - stdout - {'loss': 0.6952, 'grad_norm': 0.6558269262313843, 'learning_rate': 1.1002790049793604e-05, 'epoch': 1.45} +2025-05-11 04:07:34 - ERROR - stderr - 48%|████▊ | 1811/3741 [10:41:40<10:32:14, 19.65s/it] +2025-05-11 04:07:55 - ERROR - stderr - 48%|████▊ | 1812/3741 [10:42:01<10:41:40, 19.96s/it] +2025-05-11 04:07:55 - ERROR - stderr - +2025-05-11 04:07:55 - ERROR - stderr - +2025-05-11 04:07:55 - INFO - stdout - {'loss': 0.7696, 'grad_norm': 0.6913748979568481, 'learning_rate': 1.099417402715335e-05, 'epoch': 1.45} +2025-05-11 04:07:55 - ERROR - stderr - 48%|████▊ | 1812/3741 [10:42:01<10:41:40, 19.96s/it] +2025-05-11 04:08:14 - ERROR - stderr - 48%|████▊ | 1813/3741 [10:42:20<10:37:57, 19.85s/it] +2025-05-11 04:08:14 - ERROR - stderr - +2025-05-11 04:08:14 - ERROR - stderr - +2025-05-11 04:08:14 - INFO - stdout - {'loss': 0.7428, 'grad_norm': 0.6790192723274231, 'learning_rate': 1.0985557259047557e-05, 'epoch': 1.45} +2025-05-11 04:08:14 - ERROR - stderr - 48%|████▊ | 1813/3741 [10:42:20<10:37:57, 19.85s/it] +2025-05-11 04:08:35 - ERROR - stderr - 48%|████▊ | 1814/3741 [10:42:42<10:49:59, 20.24s/it] +2025-05-11 04:08:35 - ERROR - stderr - +2025-05-11 04:08:35 - ERROR - stderr - +2025-05-11 04:08:35 - INFO - stdout - {'loss': 0.7443, 'grad_norm': 0.6557827591896057, 'learning_rate': 1.0976939751937361e-05, 'epoch': 1.45} +2025-05-11 04:08:35 - ERROR - stderr - 48%|████▊ | 1814/3741 [10:42:42<10:49:59, 20.24s/it] +2025-05-11 04:08:55 - ERROR - stderr - 49%|████▊ | 1815/3741 [10:43:01<10:44:31, 20.08s/it] +2025-05-11 04:08:55 - ERROR - stderr - +2025-05-11 04:08:55 - ERROR - stderr - +2025-05-11 04:08:55 - INFO - stdout - {'loss': 0.7227, 'grad_norm': 0.6664519309997559, 'learning_rate': 1.0968321512284472e-05, 'epoch': 1.46} +2025-05-11 04:08:55 - ERROR - stderr - 49%|████▊ | 1815/3741 [10:43:01<10:44:31, 20.08s/it] +2025-05-11 04:09:17 - ERROR - stderr - 49%|████▊ | 1816/3741 [10:43:24<11:06:56, 20.79s/it] +2025-05-11 04:09:17 - ERROR - stderr - +2025-05-11 04:09:17 - ERROR - stderr - +2025-05-11 04:09:17 - INFO - stdout - {'loss': 0.7558, 'grad_norm': 0.6873872876167297, 'learning_rate': 1.0959702546551135e-05, 'epoch': 1.46} +2025-05-11 04:09:17 - ERROR - stderr - 49%|████▊ | 1816/3741 [10:43:24<11:06:56, 20.79s/it] +2025-05-11 04:09:37 - ERROR - stderr - 49%|████▊ | 1817/3741 [10:43:43<10:56:07, 20.46s/it] +2025-05-11 04:09:37 - ERROR - stderr - +2025-05-11 04:09:37 - ERROR - stderr - +2025-05-11 04:09:37 - INFO - stdout - {'loss': 0.7435, 'grad_norm': 0.6596401333808899, 'learning_rate': 1.0951082861200142e-05, 'epoch': 1.46} +2025-05-11 04:09:37 - ERROR - stderr - 49%|████▊ | 1817/3741 [10:43:43<10:56:07, 20.46s/it] +2025-05-11 04:09:59 - ERROR - stderr - 49%|████▊ | 1818/3741 [10:44:05<11:08:40, 20.86s/it] +2025-05-11 04:09:59 - ERROR - stderr - +2025-05-11 04:09:59 - ERROR - stderr - +2025-05-11 04:09:59 - INFO - stdout - {'loss': 0.8009, 'grad_norm': 0.6788073182106018, 'learning_rate': 1.0942462462694834e-05, 'epoch': 1.46} +2025-05-11 04:09:59 - ERROR - stderr - 49%|████▊ | 1818/3741 [10:44:05<11:08:40, 20.86s/it] +2025-05-11 04:10:19 - ERROR - stderr - 49%|████▊ | 1819/3741 [10:44:25<10:57:19, 20.52s/it] +2025-05-11 04:10:19 - ERROR - stderr - +2025-05-11 04:10:19 - ERROR - stderr - +2025-05-11 04:10:19 - INFO - stdout - {'loss': 0.7332, 'grad_norm': 0.7057682871818542, 'learning_rate': 1.0933841357499074e-05, 'epoch': 1.46} +2025-05-11 04:10:19 - ERROR - stderr - 49%|████▊ | 1819/3741 [10:44:25<10:57:19, 20.52s/it] +2025-05-11 04:10:41 - ERROR - stderr - 49%|████▊ | 1820/3741 [10:44:47<11:15:59, 21.11s/it] +2025-05-11 04:10:41 - ERROR - stderr - +2025-05-11 04:10:41 - ERROR - stderr - +2025-05-11 04:10:41 - INFO - stdout - {'loss': 0.7549, 'grad_norm': 0.6896910071372986, 'learning_rate': 1.0925219552077258e-05, 'epoch': 1.46} +2025-05-11 04:10:41 - ERROR - stderr - 49%|████▊ | 1820/3741 [10:44:47<11:15:59, 21.11s/it] +2025-05-11 04:11:01 - ERROR - stderr - 49%|████▊ | 1821/3741 [10:45:07<11:02:58, 20.72s/it] +2025-05-11 04:11:01 - ERROR - stderr - +2025-05-11 04:11:01 - ERROR - stderr - +2025-05-11 04:11:01 - INFO - stdout - {'loss': 0.7439, 'grad_norm': 0.6752651333808899, 'learning_rate': 1.091659705289431e-05, 'epoch': 1.46} +2025-05-11 04:11:01 - ERROR - stderr - 49%|████▊ | 1821/3741 [10:45:07<11:02:58, 20.72s/it] +2025-05-11 04:11:24 - ERROR - stderr - 49%|████▊ | 1822/3741 [10:45:30<11:25:28, 21.43s/it] +2025-05-11 04:11:24 - ERROR - stderr - +2025-05-11 04:11:24 - ERROR - stderr - +2025-05-11 04:11:24 - INFO - stdout - {'loss': 0.7304, 'grad_norm': 0.7121813297271729, 'learning_rate': 1.090797386641568e-05, 'epoch': 1.46} +2025-05-11 04:11:24 - ERROR - stderr - 49%|████▊ | 1822/3741 [10:45:30<11:25:28, 21.43s/it] +2025-05-11 04:11:44 - ERROR - stderr - 49%|████▊ | 1823/3741 [10:45:50<11:07:02, 20.87s/it] +2025-05-11 04:11:44 - ERROR - stderr - +2025-05-11 04:11:44 - ERROR - stderr - +2025-05-11 04:11:44 - INFO - stdout - {'loss': 0.7529, 'grad_norm': 0.7077644467353821, 'learning_rate': 1.0899349999107325e-05, 'epoch': 1.46} +2025-05-11 04:11:44 - ERROR - stderr - 49%|████▊ | 1823/3741 [10:45:50<11:07:02, 20.87s/it] +2025-05-11 04:12:06 - ERROR - stderr - 49%|████▉ | 1824/3741 [10:46:12<11:20:09, 21.29s/it] +2025-05-11 04:12:06 - ERROR - stderr - +2025-05-11 04:12:06 - ERROR - stderr - +2025-05-11 04:12:06 - INFO - stdout - {'loss': 0.7544, 'grad_norm': 0.6643130779266357, 'learning_rate': 1.089072545743571e-05, 'epoch': 1.46} +2025-05-11 04:12:06 - ERROR - stderr - 49%|████▉ | 1824/3741 [10:46:12<11:20:09, 21.29s/it] +2025-05-11 04:12:25 - ERROR - stderr - 49%|████▉ | 1825/3741 [10:46:32<11:02:04, 20.73s/it] +2025-05-11 04:12:25 - ERROR - stderr - +2025-05-11 04:12:25 - ERROR - stderr - +2025-05-11 04:12:25 - INFO - stdout - {'loss': 0.76, 'grad_norm': 0.6911596655845642, 'learning_rate': 1.088210024786781e-05, 'epoch': 1.46} +2025-05-11 04:12:25 - ERROR - stderr - 49%|████▉ | 1825/3741 [10:46:32<11:02:04, 20.73s/it] +2025-05-11 04:12:48 - ERROR - stderr - 49%|████▉ | 1826/3741 [10:46:54<11:18:25, 21.26s/it] +2025-05-11 04:12:48 - ERROR - stderr - +2025-05-11 04:12:48 - ERROR - stderr - +2025-05-11 04:12:48 - INFO - stdout - {'loss': 0.7368, 'grad_norm': 0.6701585054397583, 'learning_rate': 1.0873474376871105e-05, 'epoch': 1.46} +2025-05-11 04:12:48 - ERROR - stderr - 49%|████▉ | 1826/3741 [10:46:54<11:18:25, 21.26s/it] +2025-05-11 04:13:07 - ERROR - stderr - 49%|████▉ | 1827/3741 [10:47:14<11:02:11, 20.76s/it] +2025-05-11 04:13:07 - ERROR - stderr - +2025-05-11 04:13:07 - ERROR - stderr - +2025-05-11 04:13:07 - INFO - stdout - {'loss': 0.7703, 'grad_norm': 0.7087423801422119, 'learning_rate': 1.0864847850913568e-05, 'epoch': 1.47} +2025-05-11 04:13:07 - ERROR - stderr - 49%|████▉ | 1827/3741 [10:47:14<11:02:11, 20.76s/it] +2025-05-11 04:13:27 - ERROR - stderr - 49%|████▉ | 1828/3741 [10:47:34<10:55:48, 20.57s/it] +2025-05-11 04:13:28 - ERROR - stderr - +2025-05-11 04:13:28 - ERROR - stderr - +2025-05-11 04:13:28 - INFO - stdout - {'loss': 0.812, 'grad_norm': 0.7176198363304138, 'learning_rate': 1.0856220676463654e-05, 'epoch': 1.47} +2025-05-11 04:13:28 - ERROR - stderr - 49%|████▉ | 1828/3741 [10:47:34<10:55:48, 20.57s/it] +2025-05-11 04:13:47 - ERROR - stderr - 49%|████▉ | 1829/3741 [10:47:53<10:43:49, 20.20s/it] +2025-05-11 04:13:47 - ERROR - stderr - +2025-05-11 04:13:47 - ERROR - stderr - +2025-05-11 04:13:47 - INFO - stdout - {'loss': 0.792, 'grad_norm': 0.6819745302200317, 'learning_rate': 1.084759285999032e-05, 'epoch': 1.47} +2025-05-11 04:13:47 - ERROR - stderr - 49%|████▉ | 1829/3741 [10:47:53<10:43:49, 20.20s/it] +2025-05-11 04:14:07 - ERROR - stderr - 49%|████▉ | 1830/3741 [10:48:13<10:43:43, 20.21s/it] +2025-05-11 04:14:07 - ERROR - stderr - +2025-05-11 04:14:07 - ERROR - stderr - +2025-05-11 04:14:07 - INFO - stdout - {'loss': 0.7629, 'grad_norm': 0.7085966467857361, 'learning_rate': 1.0838964407962993e-05, 'epoch': 1.47} +2025-05-11 04:14:07 - ERROR - stderr - 49%|████▉ | 1830/3741 [10:48:13<10:43:43, 20.21s/it] +2025-05-11 04:14:27 - ERROR - stderr - 49%|████▉ | 1831/3741 [10:48:33<10:36:49, 20.00s/it] +2025-05-11 04:14:27 - ERROR - stderr - +2025-05-11 04:14:27 - ERROR - stderr - +2025-05-11 04:14:27 - INFO - stdout - {'loss': 0.7573, 'grad_norm': 0.6867752075195312, 'learning_rate': 1.0830335326851577e-05, 'epoch': 1.47} +2025-05-11 04:14:27 - ERROR - stderr - 49%|████▉ | 1831/3741 [10:48:33<10:36:49, 20.00s/it] +2025-05-11 04:14:46 - ERROR - stderr - 49%|████▉ | 1832/3741 [10:48:53<10:33:23, 19.91s/it] +2025-05-11 04:14:46 - ERROR - stderr - +2025-05-11 04:14:46 - ERROR - stderr - +2025-05-11 04:14:46 - INFO - stdout - {'loss': 0.7561, 'grad_norm': 0.665777325630188, 'learning_rate': 1.0821705623126461e-05, 'epoch': 1.47} +2025-05-11 04:14:46 - ERROR - stderr - 49%|████▉ | 1832/3741 [10:48:53<10:33:23, 19.91s/it] +2025-05-11 04:15:06 - ERROR - stderr - 49%|████▉ | 1833/3741 [10:49:12<10:28:56, 19.78s/it] +2025-05-11 04:15:06 - ERROR - stderr - +2025-05-11 04:15:06 - ERROR - stderr - +2025-05-11 04:15:06 - INFO - stdout - {'loss': 0.7441, 'grad_norm': 0.7135064601898193, 'learning_rate': 1.0813075303258483e-05, 'epoch': 1.47} +2025-05-11 04:15:06 - ERROR - stderr - 49%|████▉ | 1833/3741 [10:49:12<10:28:56, 19.78s/it] +2025-05-11 04:15:25 - ERROR - stderr - 49%|████▉ | 1834/3741 [10:49:31<10:23:52, 19.63s/it] +2025-05-11 04:15:25 - ERROR - stderr - +2025-05-11 04:15:25 - ERROR - stderr - +2025-05-11 04:15:25 - INFO - stdout - {'loss': 0.7223, 'grad_norm': 0.6736664772033691, 'learning_rate': 1.0804444373718952e-05, 'epoch': 1.47} +2025-05-11 04:15:25 - ERROR - stderr - 49%|████▉ | 1834/3741 [10:49:31<10:23:52, 19.63s/it] +2025-05-11 04:15:45 - ERROR - stderr - 49%|████▉ | 1835/3741 [10:49:51<10:25:51, 19.70s/it] +2025-05-11 04:15:45 - ERROR - stderr - +2025-05-11 04:15:45 - ERROR - stderr - +2025-05-11 04:15:45 - INFO - stdout - {'loss': 0.728, 'grad_norm': 0.6878182888031006, 'learning_rate': 1.0795812840979632e-05, 'epoch': 1.47} +2025-05-11 04:15:45 - ERROR - stderr - 49%|████▉ | 1835/3741 [10:49:51<10:25:51, 19.70s/it] +2025-05-11 04:16:04 - ERROR - stderr - 49%|████▉ | 1836/3741 [10:50:11<10:22:34, 19.61s/it] +2025-05-11 04:16:04 - ERROR - stderr - +2025-05-11 04:16:04 - ERROR - stderr - +2025-05-11 04:16:04 - INFO - stdout - {'loss': 0.7475, 'grad_norm': 0.6910241842269897, 'learning_rate': 1.0787180711512744e-05, 'epoch': 1.47} +2025-05-11 04:16:04 - ERROR - stderr - 49%|████▉ | 1836/3741 [10:50:11<10:22:34, 19.61s/it] +2025-05-11 04:16:25 - ERROR - stderr - 49%|████▉ | 1837/3741 [10:50:32<10:35:26, 20.02s/it] +2025-05-11 04:16:25 - ERROR - stderr - +2025-05-11 04:16:25 - ERROR - stderr - +2025-05-11 04:16:25 - INFO - stdout - {'loss': 0.7601, 'grad_norm': 0.739133894443512, 'learning_rate': 1.0778547991790946e-05, 'epoch': 1.47} +2025-05-11 04:16:25 - ERROR - stderr - 49%|████▉ | 1837/3741 [10:50:32<10:35:26, 20.02s/it] +2025-05-11 04:16:45 - ERROR - stderr - 49%|████▉ | 1838/3741 [10:50:51<10:27:30, 19.78s/it] +2025-05-11 04:16:45 - ERROR - stderr - +2025-05-11 04:16:45 - ERROR - stderr - +2025-05-11 04:16:45 - INFO - stdout - {'loss': 0.7574, 'grad_norm': 0.7009455561637878, 'learning_rate': 1.076991468828735e-05, 'epoch': 1.47} +2025-05-11 04:16:45 - ERROR - stderr - 49%|████▉ | 1838/3741 [10:50:51<10:27:30, 19.78s/it] +2025-05-11 04:17:06 - ERROR - stderr - 49%|████▉ | 1839/3741 [10:51:12<10:44:12, 20.32s/it] +2025-05-11 04:17:06 - ERROR - stderr - +2025-05-11 04:17:06 - ERROR - stderr - +2025-05-11 04:17:06 - INFO - stdout - {'loss': 0.7775, 'grad_norm': 0.7256219983100891, 'learning_rate': 1.0761280807475504e-05, 'epoch': 1.47} +2025-05-11 04:17:06 - ERROR - stderr - 49%|████▉ | 1839/3741 [10:51:12<10:44:12, 20.32s/it] +2025-05-11 04:17:26 - ERROR - stderr - 49%|████▉ | 1840/3741 [10:51:32<10:35:58, 20.07s/it] +2025-05-11 04:17:26 - ERROR - stderr - +2025-05-11 04:17:26 - ERROR - stderr - +2025-05-11 04:17:26 - INFO - stdout - {'loss': 0.7355, 'grad_norm': 0.7107866406440735, 'learning_rate': 1.0752646355829382e-05, 'epoch': 1.48} +2025-05-11 04:17:26 - ERROR - stderr - 49%|████▉ | 1840/3741 [10:51:32<10:35:58, 20.07s/it] +2025-05-11 04:17:47 - ERROR - stderr - 49%|████▉ | 1841/3741 [10:51:54<10:51:55, 20.59s/it] +2025-05-11 04:17:47 - ERROR - stderr - +2025-05-11 04:17:47 - ERROR - stderr - +2025-05-11 04:17:47 - INFO - stdout - {'loss': 0.7747, 'grad_norm': 0.7084487676620483, 'learning_rate': 1.0744011339823389e-05, 'epoch': 1.48} +2025-05-11 04:17:47 - ERROR - stderr - 49%|████▉ | 1841/3741 [10:51:54<10:51:55, 20.59s/it] +2025-05-11 04:18:07 - ERROR - stderr - 49%|████▉ | 1842/3741 [10:52:13<10:43:01, 20.32s/it] +2025-05-11 04:18:07 - ERROR - stderr - +2025-05-11 04:18:07 - ERROR - stderr - +2025-05-11 04:18:07 - INFO - stdout - {'loss': 0.7539, 'grad_norm': 0.6767612099647522, 'learning_rate': 1.0735375765932352e-05, 'epoch': 1.48} +2025-05-11 04:18:07 - ERROR - stderr - 49%|████▉ | 1842/3741 [10:52:13<10:43:01, 20.32s/it] +2025-05-11 04:18:28 - ERROR - stderr - 49%|████▉ | 1843/3741 [10:52:35<10:51:40, 20.60s/it] +2025-05-11 04:18:28 - ERROR - stderr - +2025-05-11 04:18:28 - ERROR - stderr - +2025-05-11 04:18:28 - INFO - stdout - {'loss': 0.788, 'grad_norm': 0.7070626616477966, 'learning_rate': 1.0726739640631523e-05, 'epoch': 1.48} +2025-05-11 04:18:28 - ERROR - stderr - 49%|████▉ | 1843/3741 [10:52:35<10:51:40, 20.60s/it] +2025-05-11 04:18:48 - ERROR - stderr - 49%|████▉ | 1844/3741 [10:52:54<10:41:24, 20.29s/it] +2025-05-11 04:18:48 - ERROR - stderr - +2025-05-11 04:18:48 - ERROR - stderr - +2025-05-11 04:18:48 - INFO - stdout - {'loss': 0.7404, 'grad_norm': 0.6804197430610657, 'learning_rate': 1.0718102970396564e-05, 'epoch': 1.48} +2025-05-11 04:18:48 - ERROR - stderr - 49%|████▉ | 1844/3741 [10:52:54<10:41:24, 20.29s/it] +2025-05-11 04:19:09 - ERROR - stderr - 49%|████▉ | 1845/3741 [10:53:16<10:50:58, 20.60s/it] +2025-05-11 04:19:09 - ERROR - stderr - +2025-05-11 04:19:09 - ERROR - stderr - +2025-05-11 04:19:09 - INFO - stdout - {'loss': 0.7441, 'grad_norm': 0.6973698139190674, 'learning_rate': 1.0709465761703542e-05, 'epoch': 1.48} +2025-05-11 04:19:09 - ERROR - stderr - 49%|████▉ | 1845/3741 [10:53:16<10:50:58, 20.60s/it] +2025-05-11 04:19:29 - ERROR - stderr - 49%|████▉ | 1846/3741 [10:53:35<10:42:03, 20.33s/it] +2025-05-11 04:19:29 - ERROR - stderr - +2025-05-11 04:19:29 - ERROR - stderr - +2025-05-11 04:19:29 - INFO - stdout - {'loss': 0.7265, 'grad_norm': 0.6551907658576965, 'learning_rate': 1.0700828021028929e-05, 'epoch': 1.48} +2025-05-11 04:19:29 - ERROR - stderr - 49%|████▉ | 1846/3741 [10:53:35<10:42:03, 20.33s/it] +2025-05-11 04:19:50 - ERROR - stderr - 49%|████▉ | 1847/3741 [10:53:56<10:45:37, 20.45s/it] +2025-05-11 04:19:50 - ERROR - stderr - +2025-05-11 04:19:50 - ERROR - stderr - +2025-05-11 04:19:50 - INFO - stdout - {'loss': 0.736, 'grad_norm': 0.6486260890960693, 'learning_rate': 1.0692189754849595e-05, 'epoch': 1.48} +2025-05-11 04:19:50 - ERROR - stderr - 49%|████▉ | 1847/3741 [10:53:56<10:45:37, 20.45s/it] +2025-05-11 04:20:09 - ERROR - stderr - 49%|████▉ | 1848/3741 [10:54:15<10:35:48, 20.15s/it] +2025-05-11 04:20:09 - ERROR - stderr - +2025-05-11 04:20:09 - ERROR - stderr - +2025-05-11 04:20:09 - INFO - stdout - {'loss': 0.6997, 'grad_norm': 0.601466715335846, 'learning_rate': 1.0683550969642813e-05, 'epoch': 1.48} +2025-05-11 04:20:09 - ERROR - stderr - 49%|████▉ | 1848/3741 [10:54:15<10:35:48, 20.15s/it] +2025-05-11 04:20:29 - ERROR - stderr - 49%|████▉ | 1849/3741 [10:54:35<10:30:07, 19.98s/it] +2025-05-11 04:20:29 - ERROR - stderr - +2025-05-11 04:20:29 - ERROR - stderr - +2025-05-11 04:20:29 - INFO - stdout - {'loss': 0.7438, 'grad_norm': 0.6700170636177063, 'learning_rate': 1.0674911671886236e-05, 'epoch': 1.48} +2025-05-11 04:20:29 - ERROR - stderr - 49%|████▉ | 1849/3741 [10:54:35<10:30:07, 19.98s/it] +2025-05-11 04:20:48 - ERROR - stderr - 49%|████▉ | 1850/3741 [10:54:55<10:26:21, 19.87s/it] +2025-05-11 04:20:48 - ERROR - stderr - +2025-05-11 04:20:48 - ERROR - stderr - +2025-05-11 04:20:48 - INFO - stdout - {'loss': 0.7422, 'grad_norm': 0.6908175349235535, 'learning_rate': 1.06662718680579e-05, 'epoch': 1.48} +2025-05-11 04:20:48 - ERROR - stderr - 49%|████▉ | 1850/3741 [10:54:55<10:26:21, 19.87s/it] +2025-05-11 04:21:08 - ERROR - stderr - 49%|████▉ | 1851/3741 [10:55:15<10:27:22, 19.92s/it] +2025-05-11 04:21:08 - ERROR - stderr - +2025-05-11 04:21:08 - ERROR - stderr - +2025-05-11 04:21:08 - INFO - stdout - {'loss': 0.8039, 'grad_norm': 0.7098249197006226, 'learning_rate': 1.0657631564636226e-05, 'epoch': 1.48} +2025-05-11 04:21:08 - ERROR - stderr - 49%|████▉ | 1851/3741 [10:55:15<10:27:22, 19.92s/it] +2025-05-11 04:21:28 - ERROR - stderr - 50%|████▉ | 1852/3741 [10:55:34<10:23:13, 19.80s/it] +2025-05-11 04:21:28 - ERROR - stderr - +2025-05-11 04:21:28 - ERROR - stderr - +2025-05-11 04:21:28 - INFO - stdout - {'loss': 0.7891, 'grad_norm': 0.7431460022926331, 'learning_rate': 1.0648990768100009e-05, 'epoch': 1.49} +2025-05-11 04:21:28 - ERROR - stderr - 50%|████▉ | 1852/3741 [10:55:34<10:23:13, 19.80s/it] +2025-05-11 04:21:47 - ERROR - stderr - 50%|████▉ | 1853/3741 [10:55:54<10:21:06, 19.74s/it] +2025-05-11 04:21:47 - ERROR - stderr - +2025-05-11 04:21:47 - ERROR - stderr - +2025-05-11 04:21:47 - INFO - stdout - {'loss': 0.742, 'grad_norm': 0.6714998483657837, 'learning_rate': 1.0640349484928413e-05, 'epoch': 1.49} +2025-05-11 04:21:47 - ERROR - stderr - 50%|████▉ | 1853/3741 [10:55:54<10:21:06, 19.74s/it] +2025-05-11 04:22:08 - ERROR - stderr - 50%|████▉ | 1854/3741 [10:56:15<10:31:48, 20.09s/it] +2025-05-11 04:22:08 - ERROR - stderr - +2025-05-11 04:22:08 - ERROR - stderr - +2025-05-11 04:22:08 - INFO - stdout - {'loss': 0.7708, 'grad_norm': 0.6916069984436035, 'learning_rate': 1.0631707721600965e-05, 'epoch': 1.49} +2025-05-11 04:22:08 - ERROR - stderr - 50%|████▉ | 1854/3741 [10:56:15<10:31:48, 20.09s/it] +2025-05-11 04:22:28 - ERROR - stderr - 50%|████▉ | 1855/3741 [10:56:34<10:25:46, 19.91s/it] +2025-05-11 04:22:28 - ERROR - stderr - +2025-05-11 04:22:28 - ERROR - stderr - +2025-05-11 04:22:28 - INFO - stdout - {'loss': 0.7498, 'grad_norm': 0.7198412418365479, 'learning_rate': 1.0623065484597555e-05, 'epoch': 1.49} +2025-05-11 04:22:28 - ERROR - stderr - 50%|████▉ | 1855/3741 [10:56:34<10:25:46, 19.91s/it] +2025-05-11 04:22:48 - ERROR - stderr - 50%|████▉ | 1856/3741 [10:56:55<10:30:00, 20.05s/it] +2025-05-11 04:22:48 - ERROR - stderr - +2025-05-11 04:22:48 - ERROR - stderr - +2025-05-11 04:22:48 - INFO - stdout - {'loss': 0.7526, 'grad_norm': 0.6497362852096558, 'learning_rate': 1.0614422780398422e-05, 'epoch': 1.49} +2025-05-11 04:22:48 - ERROR - stderr - 50%|████▉ | 1856/3741 [10:56:55<10:30:00, 20.05s/it] +2025-05-11 04:23:08 - ERROR - stderr - 50%|████▉ | 1857/3741 [10:57:14<10:26:39, 19.96s/it] +2025-05-11 04:23:08 - ERROR - stderr - +2025-05-11 04:23:08 - ERROR - stderr - +2025-05-11 04:23:08 - INFO - stdout - {'loss': 0.737, 'grad_norm': 0.6391859650611877, 'learning_rate': 1.0605779615484167e-05, 'epoch': 1.49} +2025-05-11 04:23:08 - ERROR - stderr - 50%|████▉ | 1857/3741 [10:57:14<10:26:39, 19.96s/it] +2025-05-11 04:23:29 - ERROR - stderr - 50%|████▉ | 1858/3741 [10:57:35<10:33:05, 20.17s/it] +2025-05-11 04:23:29 - ERROR - stderr - +2025-05-11 04:23:29 - ERROR - stderr - +2025-05-11 04:23:29 - INFO - stdout - {'loss': 0.7439, 'grad_norm': 0.6709389090538025, 'learning_rate': 1.0597135996335723e-05, 'epoch': 1.49} +2025-05-11 04:23:29 - ERROR - stderr - 50%|████▉ | 1858/3741 [10:57:35<10:33:05, 20.17s/it] +2025-05-11 04:23:48 - ERROR - stderr - 50%|████▉ | 1859/3741 [10:57:54<10:25:34, 19.94s/it] +2025-05-11 04:23:48 - ERROR - stderr - +2025-05-11 04:23:48 - ERROR - stderr - +2025-05-11 04:23:48 - INFO - stdout - {'loss': 0.7798, 'grad_norm': 0.7073272466659546, 'learning_rate': 1.0588491929434375e-05, 'epoch': 1.49} +2025-05-11 04:23:48 - ERROR - stderr - 50%|████▉ | 1859/3741 [10:57:54<10:25:34, 19.94s/it] +2025-05-11 04:24:08 - ERROR - stderr - 50%|████▉ | 1860/3741 [10:58:14<10:26:30, 19.98s/it] +2025-05-11 04:24:08 - ERROR - stderr - +2025-05-11 04:24:08 - ERROR - stderr - +2025-05-11 04:24:08 - INFO - stdout - {'loss': 0.7422, 'grad_norm': 0.6603201031684875, 'learning_rate': 1.0579847421261733e-05, 'epoch': 1.49} +2025-05-11 04:24:08 - ERROR - stderr - 50%|████▉ | 1860/3741 [10:58:14<10:26:30, 19.98s/it] +2025-05-11 04:24:28 - ERROR - stderr - 50%|████▉ | 1861/3741 [10:58:34<10:21:32, 19.84s/it] +2025-05-11 04:24:28 - ERROR - stderr - +2025-05-11 04:24:28 - ERROR - stderr - +2025-05-11 04:24:28 - INFO - stdout - {'loss': 0.7667, 'grad_norm': 0.6485406160354614, 'learning_rate': 1.057120247829975e-05, 'epoch': 1.49} +2025-05-11 04:24:28 - ERROR - stderr - 50%|████▉ | 1861/3741 [10:58:34<10:21:32, 19.84s/it] +2025-05-11 04:24:47 - ERROR - stderr - 50%|████▉ | 1862/3741 [10:58:54<10:20:15, 19.81s/it] +2025-05-11 04:24:47 - ERROR - stderr - +2025-05-11 04:24:47 - ERROR - stderr - +2025-05-11 04:24:47 - INFO - stdout - {'loss': 0.7603, 'grad_norm': 0.648036777973175, 'learning_rate': 1.0562557107030695e-05, 'epoch': 1.49} +2025-05-11 04:24:47 - ERROR - stderr - 50%|████▉ | 1862/3741 [10:58:54<10:20:15, 19.81s/it] +2025-05-11 04:25:07 - ERROR - stderr - 50%|████▉ | 1863/3741 [10:59:13<10:17:13, 19.72s/it] +2025-05-11 04:25:07 - ERROR - stderr - +2025-05-11 04:25:07 - ERROR - stderr - +2025-05-11 04:25:07 - INFO - stdout - {'loss': 0.7725, 'grad_norm': 0.7099272012710571, 'learning_rate': 1.0553911313937162e-05, 'epoch': 1.49} +2025-05-11 04:25:07 - ERROR - stderr - 50%|████▉ | 1863/3741 [10:59:13<10:17:13, 19.72s/it] +2025-05-11 04:25:27 - ERROR - stderr - 50%|████▉ | 1864/3741 [10:59:33<10:18:53, 19.78s/it] +2025-05-11 04:25:27 - ERROR - stderr - +2025-05-11 04:25:27 - ERROR - stderr - +2025-05-11 04:25:27 - INFO - stdout - {'loss': 0.7704, 'grad_norm': 0.7114027142524719, 'learning_rate': 1.0545265105502065e-05, 'epoch': 1.49} +2025-05-11 04:25:27 - ERROR - stderr - 50%|████▉ | 1864/3741 [10:59:33<10:18:53, 19.78s/it] +2025-05-11 04:25:47 - ERROR - stderr - 50%|████▉ | 1865/3741 [10:59:53<10:22:51, 19.92s/it] +2025-05-11 04:25:47 - ERROR - stderr - +2025-05-11 04:25:47 - ERROR - stderr - +2025-05-11 04:25:47 - INFO - stdout - {'loss': 0.733, 'grad_norm': 0.6591110825538635, 'learning_rate': 1.053661848820862e-05, 'epoch': 1.5} +2025-05-11 04:25:47 - ERROR - stderr - 50%|████▉ | 1865/3741 [10:59:53<10:22:51, 19.92s/it] +2025-05-11 04:26:06 - ERROR - stderr - 50%|████▉ | 1866/3741 [11:00:13<10:17:09, 19.75s/it] +2025-05-11 04:26:06 - ERROR - stderr - +2025-05-11 04:26:06 - ERROR - stderr - +2025-05-11 04:26:06 - INFO - stdout - {'loss': 0.7702, 'grad_norm': 0.7078248262405396, 'learning_rate': 1.0527971468540356e-05, 'epoch': 1.5} +2025-05-11 04:26:06 - ERROR - stderr - 50%|████▉ | 1866/3741 [11:00:13<10:17:09, 19.75s/it] +2025-05-11 04:26:28 - ERROR - stderr - 50%|████▉ | 1867/3741 [11:00:34<10:31:49, 20.23s/it] +2025-05-11 04:26:28 - ERROR - stderr - +2025-05-11 04:26:28 - ERROR - stderr - +2025-05-11 04:26:28 - INFO - stdout - {'loss': 0.7377, 'grad_norm': 0.6885595917701721, 'learning_rate': 1.0519324052981103e-05, 'epoch': 1.5} +2025-05-11 04:26:28 - ERROR - stderr - 50%|████▉ | 1867/3741 [11:00:34<10:31:49, 20.23s/it] +2025-05-11 04:26:47 - ERROR - stderr - 50%|████▉ | 1868/3741 [11:00:54<10:24:53, 20.02s/it] +2025-05-11 04:26:47 - ERROR - stderr - +2025-05-11 04:26:47 - ERROR - stderr - +2025-05-11 04:26:47 - INFO - stdout - {'loss': 0.7427, 'grad_norm': 0.681225061416626, 'learning_rate': 1.0510676248014991e-05, 'epoch': 1.5} +2025-05-11 04:26:47 - ERROR - stderr - 50%|████▉ | 1868/3741 [11:00:54<10:24:53, 20.02s/it] +2025-05-11 04:27:09 - ERROR - stderr - 50%|████▉ | 1869/3741 [11:01:15<10:41:22, 20.56s/it] +2025-05-11 04:27:09 - ERROR - stderr - +2025-05-11 04:27:09 - ERROR - stderr - +2025-05-11 04:27:09 - INFO - stdout - {'loss': 0.7611, 'grad_norm': 0.6760066747665405, 'learning_rate': 1.050202806012644e-05, 'epoch': 1.5} +2025-05-11 04:27:09 - ERROR - stderr - 50%|████▉ | 1869/3741 [11:01:15<10:41:22, 20.56s/it] +2025-05-11 04:27:28 - ERROR - stderr - 50%|████▉ | 1870/3741 [11:01:35<10:29:41, 20.19s/it] +2025-05-11 04:27:28 - ERROR - stderr - +2025-05-11 04:27:28 - ERROR - stderr - +2025-05-11 04:27:28 - INFO - stdout - {'loss': 0.7486, 'grad_norm': 0.6731633543968201, 'learning_rate': 1.0493379495800149e-05, 'epoch': 1.5} +2025-05-11 04:27:28 - ERROR - stderr - 50%|████▉ | 1870/3741 [11:01:35<10:29:41, 20.19s/it] +2025-05-11 04:27:49 - ERROR - stderr - 50%|█████ | 1871/3741 [11:01:55<10:31:57, 20.28s/it] +2025-05-11 04:27:49 - ERROR - stderr - +2025-05-11 04:27:49 - ERROR - stderr - +2025-05-11 04:27:49 - INFO - stdout - {'loss': 0.7713, 'grad_norm': 0.6760236620903015, 'learning_rate': 1.0484730561521107e-05, 'epoch': 1.5} +2025-05-11 04:27:49 - ERROR - stderr - 50%|█████ | 1871/3741 [11:01:55<10:31:57, 20.28s/it] +2025-05-11 04:28:08 - ERROR - stderr - 50%|█████ | 1872/3741 [11:02:15<10:24:50, 20.06s/it] +2025-05-11 04:28:08 - ERROR - stderr - +2025-05-11 04:28:08 - ERROR - stderr - +2025-05-11 04:28:08 - INFO - stdout - {'loss': 0.7235, 'grad_norm': 0.6664961576461792, 'learning_rate': 1.0476081263774585e-05, 'epoch': 1.5} +2025-05-11 04:28:08 - ERROR - stderr - 50%|█████ | 1872/3741 [11:02:15<10:24:50, 20.06s/it] +2025-05-11 04:28:28 - ERROR - stderr - 50%|█████ | 1873/3741 [11:02:34<10:19:33, 19.90s/it] +2025-05-11 04:28:28 - ERROR - stderr - +2025-05-11 04:28:28 - ERROR - stderr - +2025-05-11 04:28:28 - INFO - stdout - {'loss': 0.7431, 'grad_norm': 0.6721197366714478, 'learning_rate': 1.0467431609046116e-05, 'epoch': 1.5} +2025-05-11 04:28:28 - ERROR - stderr - 50%|█████ | 1873/3741 [11:02:34<10:19:33, 19.90s/it] +2025-05-11 04:28:48 - ERROR - stderr - 50%|█████ | 1874/3741 [11:02:54<10:16:46, 19.82s/it] +2025-05-11 04:28:48 - ERROR - stderr - +2025-05-11 04:28:48 - ERROR - stderr - +2025-05-11 04:28:48 - INFO - stdout - {'loss': 0.7797, 'grad_norm': 0.6789388060569763, 'learning_rate': 1.0458781603821508e-05, 'epoch': 1.5} +2025-05-11 04:28:48 - ERROR - stderr - 50%|█████ | 1874/3741 [11:02:54<10:16:46, 19.82s/it] +2025-05-11 04:29:07 - ERROR - stderr - 50%|█████ | 1875/3741 [11:03:14<10:14:16, 19.75s/it] +2025-05-11 04:29:07 - ERROR - stderr - +2025-05-11 04:29:07 - ERROR - stderr - +2025-05-11 04:29:07 - INFO - stdout - {'loss': 0.7856, 'grad_norm': 0.7579322457313538, 'learning_rate': 1.045013125458683e-05, 'epoch': 1.5} +2025-05-11 04:29:07 - ERROR - stderr - 50%|█████ | 1875/3741 [11:03:14<10:14:16, 19.75s/it] +2025-05-11 04:29:27 - ERROR - stderr - 50%|█████ | 1876/3741 [11:03:33<10:12:41, 19.71s/it] +2025-05-11 04:29:27 - ERROR - stderr - +2025-05-11 04:29:27 - ERROR - stderr - +2025-05-11 04:29:27 - INFO - stdout - {'loss': 0.7576, 'grad_norm': 0.6685303449630737, 'learning_rate': 1.0441480567828408e-05, 'epoch': 1.5} +2025-05-11 04:29:27 - ERROR - stderr - 50%|█████ | 1876/3741 [11:03:33<10:12:41, 19.71s/it] +2025-05-11 04:29:46 - ERROR - stderr - 50%|█████ | 1877/3741 [11:03:53<10:11:03, 19.67s/it] +2025-05-11 04:29:46 - ERROR - stderr - +2025-05-11 04:29:46 - ERROR - stderr - +2025-05-11 04:29:46 - INFO - stdout - {'loss': 0.7651, 'grad_norm': 0.6478756666183472, 'learning_rate': 1.0432829550032818e-05, 'epoch': 1.51} +2025-05-11 04:29:46 - ERROR - stderr - 50%|█████ | 1877/3741 [11:03:53<10:11:03, 19.67s/it] +2025-05-11 04:30:06 - ERROR - stderr - 50%|█��███ | 1878/3741 [11:04:12<10:08:01, 19.58s/it] +2025-05-11 04:30:06 - ERROR - stderr - +2025-05-11 04:30:06 - ERROR - stderr - +2025-05-11 04:30:06 - INFO - stdout - {'loss': 0.7516, 'grad_norm': 0.6975502967834473, 'learning_rate': 1.0424178207686894e-05, 'epoch': 1.51} +2025-05-11 04:30:06 - ERROR - stderr - 50%|█████ | 1878/3741 [11:04:12<10:08:01, 19.58s/it] +2025-05-11 04:30:25 - ERROR - stderr - 50%|█████ | 1879/3741 [11:04:32<10:08:02, 19.59s/it] +2025-05-11 04:30:25 - ERROR - stderr - +2025-05-11 04:30:25 - ERROR - stderr - +2025-05-11 04:30:25 - INFO - stdout - {'loss': 0.767, 'grad_norm': 0.6927207708358765, 'learning_rate': 1.0415526547277706e-05, 'epoch': 1.51} +2025-05-11 04:30:25 - ERROR - stderr - 50%|█████ | 1879/3741 [11:04:32<10:08:02, 19.59s/it] +2025-05-11 04:30:45 - ERROR - stderr - 50%|█████ | 1880/3741 [11:04:52<10:10:51, 19.69s/it] +2025-05-11 04:30:45 - ERROR - stderr - +2025-05-11 04:30:45 - ERROR - stderr - +2025-05-11 04:30:45 - INFO - stdout - {'loss': 0.7695, 'grad_norm': 0.7247032523155212, 'learning_rate': 1.0406874575292558e-05, 'epoch': 1.51} +2025-05-11 04:30:45 - ERROR - stderr - 50%|█████ | 1880/3741 [11:04:52<10:10:51, 19.69s/it] +2025-05-11 04:31:05 - ERROR - stderr - 50%|█████ | 1881/3741 [11:05:11<10:11:13, 19.72s/it] +2025-05-11 04:31:05 - ERROR - stderr - +2025-05-11 04:31:05 - ERROR - stderr - +2025-05-11 04:31:05 - INFO - stdout - {'loss': 0.7968, 'grad_norm': 0.6950458288192749, 'learning_rate': 1.0398222298218996e-05, 'epoch': 1.51} +2025-05-11 04:31:05 - ERROR - stderr - 50%|█████ | 1881/3741 [11:05:11<10:11:13, 19.72s/it] +2025-05-11 04:31:25 - ERROR - stderr - 50%|█████ | 1882/3741 [11:05:31<10:09:05, 19.66s/it] +2025-05-11 04:31:25 - ERROR - stderr - +2025-05-11 04:31:25 - ERROR - stderr - +2025-05-11 04:31:25 - INFO - stdout - {'loss': 0.7443, 'grad_norm': 0.660862386226654, 'learning_rate': 1.0389569722544794e-05, 'epoch': 1.51} +2025-05-11 04:31:25 - ERROR - stderr - 50%|█████ | 1882/3741 [11:05:31<10:09:05, 19.66s/it] +2025-05-11 04:31:44 - ERROR - stderr - 50%|█████ | 1883/3741 [11:05:50<10:06:48, 19.60s/it] +2025-05-11 04:31:44 - ERROR - stderr - +2025-05-11 04:31:44 - ERROR - stderr - +2025-05-11 04:31:44 - INFO - stdout - {'loss': 0.7537, 'grad_norm': 0.6698028445243835, 'learning_rate': 1.0380916854757948e-05, 'epoch': 1.51} +2025-05-11 04:31:44 - ERROR - stderr - 50%|█████ | 1883/3741 [11:05:50<10:06:48, 19.60s/it] +2025-05-11 04:32:05 - ERROR - stderr - 50%|█████ | 1884/3741 [11:06:11<10:17:36, 19.96s/it] +2025-05-11 04:32:05 - ERROR - stderr - +2025-05-11 04:32:05 - ERROR - stderr - +2025-05-11 04:32:05 - INFO - stdout - {'loss': 0.7432, 'grad_norm': 0.6830818057060242, 'learning_rate': 1.0372263701346671e-05, 'epoch': 1.51} +2025-05-11 04:32:05 - ERROR - stderr - 50%|█████ | 1884/3741 [11:06:11<10:17:36, 19.96s/it] +2025-05-11 04:32:24 - ERROR - stderr - 50%|█████ | 1885/3741 [11:06:31<10:13:20, 19.83s/it] +2025-05-11 04:32:24 - ERROR - stderr - +2025-05-11 04:32:24 - ERROR - stderr - +2025-05-11 04:32:24 - INFO - stdout - {'loss': 0.747, 'grad_norm': 0.7310096025466919, 'learning_rate': 1.0363610268799393e-05, 'epoch': 1.51} +2025-05-11 04:32:24 - ERROR - stderr - 50%|█████ | 1885/3741 [11:06:31<10:13:20, 19.83s/it] +2025-05-11 04:32:46 - ERROR - stderr - 50%|█████ | 1886/3741 [11:06:52<10:29:49, 20.37s/it] +2025-05-11 04:32:46 - ERROR - stderr - +2025-05-11 04:32:46 - ERROR - stderr - +2025-05-11 04:32:46 - INFO - stdout - {'loss': 0.798, 'grad_norm': 0.7210774421691895, 'learning_rate': 1.035495656360475e-05, 'epoch': 1.51} +2025-05-11 04:32:46 - ERROR - stderr - 50%|█████ | 1886/3741 [11:06:52<10:29:49, 20.37s/it] +2025-05-11 04:33:06 - ERROR - stderr - 50%|█████ | 1887/3741 [11:07:12<10:24:31, 20.21s/it] +2025-05-11 04:33:06 - ERROR - stderr - +2025-05-11 04:33:06 - ERROR - stderr - +2025-05-11 04:33:06 - INFO - stdout - {'loss': 0.7494, 'grad_norm': 0.6990832686424255, 'learning_rate': 1.0346302592251591e-05, 'epoch': 1.51} +2025-05-11 04:33:06 - ERROR - stderr - 50%|█████ | 1887/3741 [11:07:12<10:24:31, 20.21s/it] +2025-05-11 04:33:27 - ERROR - stderr - 50%|█████ | 1888/3741 [11:07:33<10:34:23, 20.54s/it] +2025-05-11 04:33:27 - ERROR - stderr - +2025-05-11 04:33:27 - ERROR - stderr - +2025-05-11 04:33:27 - INFO - stdout - {'loss': 0.7489, 'grad_norm': 0.6791195273399353, 'learning_rate': 1.033764836122895e-05, 'epoch': 1.51} +2025-05-11 04:33:27 - ERROR - stderr - 50%|█████ | 1888/3741 [11:07:34<10:34:23, 20.54s/it] +2025-05-11 04:33:47 - ERROR - stderr - 50%|█████ | 1889/3741 [11:07:53<10:27:43, 20.34s/it] +2025-05-11 04:33:47 - ERROR - stderr - +2025-05-11 04:33:47 - ERROR - stderr - +2025-05-11 04:33:47 - INFO - stdout - {'loss': 0.7514, 'grad_norm': 0.6856716275215149, 'learning_rate': 1.0328993877026075e-05, 'epoch': 1.51} +2025-05-11 04:33:47 - ERROR - stderr - 50%|█████ | 1889/3741 [11:07:53<10:27:43, 20.34s/it] +2025-05-11 04:34:07 - ERROR - stderr - 51%|█████ | 1890/3741 [11:08:13<10:22:45, 20.19s/it] +2025-05-11 04:34:07 - ERROR - stderr - +2025-05-11 04:34:07 - ERROR - stderr - +2025-05-11 04:34:07 - INFO - stdout - {'loss': 0.7234, 'grad_norm': 0.6292150020599365, 'learning_rate': 1.032033914613238e-05, 'epoch': 1.52} +2025-05-11 04:34:07 - ERROR - stderr - 51%|█████ | 1890/3741 [11:08:13<10:22:45, 20.19s/it] +2025-05-11 04:34:26 - ERROR - stderr - 51%|█████ | 1891/3741 [11:08:33<10:14:38, 19.93s/it] +2025-05-11 04:34:26 - ERROR - stderr - +2025-05-11 04:34:26 - ERROR - stderr - +2025-05-11 04:34:26 - INFO - stdout - {'loss': 0.7079, 'grad_norm': 0.6631137132644653, 'learning_rate': 1.0311684175037488e-05, 'epoch': 1.52} +2025-05-11 04:34:26 - ERROR - stderr - 51%|█████ | 1891/3741 [11:08:33<10:14:38, 19.93s/it] +2025-05-11 04:34:46 - ERROR - stderr - 51%|█████ | 1892/3741 [11:08:52<10:08:44, 19.75s/it] +2025-05-11 04:34:46 - ERROR - stderr - +2025-05-11 04:34:46 - ERROR - stderr - +2025-05-11 04:34:46 - INFO - stdout - {'loss': 0.7566, 'grad_norm': 0.6873642206192017, 'learning_rate': 1.0303028970231185e-05, 'epoch': 1.52} +2025-05-11 04:34:46 - ERROR - stderr - 51%|█████ | 1892/3741 [11:08:52<10:08:44, 19.75s/it] +2025-05-11 04:35:06 - ERROR - stderr - 51%|█████ | 1893/3741 [11:09:13<10:18:55, 20.10s/it] +2025-05-11 04:35:06 - ERROR - stderr - +2025-05-11 04:35:06 - ERROR - stderr - +2025-05-11 04:35:06 - INFO - stdout - {'loss': 0.7369, 'grad_norm': 0.6789519190788269, 'learning_rate': 1.0294373538203439e-05, 'epoch': 1.52} +2025-05-11 04:35:06 - ERROR - stderr - 51%|█████ | 1893/3741 [11:09:13<10:18:55, 20.10s/it] +2025-05-11 04:35:26 - ERROR - stderr - 51%|█████ | 1894/3741 [11:09:32<10:12:03, 19.88s/it] +2025-05-11 04:35:26 - ERROR - stderr - +2025-05-11 04:35:26 - ERROR - stderr - +2025-05-11 04:35:26 - INFO - stdout - {'loss': 0.7133, 'grad_norm': 0.716335117816925, 'learning_rate': 1.028571788544439e-05, 'epoch': 1.52} +2025-05-11 04:35:26 - ERROR - stderr - 51%|█████ | 1894/3741 [11:09:32<10:12:03, 19.88s/it] +2025-05-11 04:35:46 - ERROR - stderr - 51%|█████ | 1895/3741 [11:09:53<10:18:44, 20.11s/it] +2025-05-11 04:35:46 - ERROR - stderr - +2025-05-11 04:35:46 - ERROR - stderr - +2025-05-11 04:35:46 - INFO - stdout - {'loss': 0.7786, 'grad_norm': 0.7093126177787781, 'learning_rate': 1.0277062018444342e-05, 'epoch': 1.52} +2025-05-11 04:35:46 - ERROR - stderr - 51%|█████ | 1895/3741 [11:09:53<10:18:44, 20.11s/it] +2025-05-11 04:36:06 - ERROR - stderr - 51%|█████ | 1896/3741 [11:10:12<10:12:01, 19.90s/it] +2025-05-11 04:36:06 - ERROR - stderr - +2025-05-11 04:36:06 - ERROR - stderr - +2025-05-11 04:36:06 - INFO - stdout - {'loss': 0.78, 'grad_norm': 0.6854731440544128, 'learning_rate': 1.0268405943693757e-05, 'epoch': 1.52} +2025-05-11 04:36:06 - ERROR - stderr - 51%|█████ | 1896/3741 [11:10:12<10:12:01, 19.90s/it] +2025-05-11 04:36:27 - ERROR - stderr - 51%|█████ | 1897/3741 [11:10:33<10:23:26, 20.29s/it] +2025-05-11 04:36:27 - ERROR - stderr - +2025-05-11 04:36:27 - ERROR - stderr - +2025-05-11 04:36:27 - INFO - stdout - {'loss': 0.7239, 'grad_norm': 0.6630930304527283, 'learning_rate': 1.0259749667683252e-05, 'epoch': 1.52} +2025-05-11 04:36:27 - ERROR - stderr - 51%|█████ | 1897/3741 [11:10:33<10:23:26, 20.29s/it] +2025-05-11 04:36:47 - ERROR - stderr - 51%|█████ | 1898/3741 [11:10:53<10:17:37, 20.11s/it] +2025-05-11 04:36:47 - ERROR - stderr - +2025-05-11 04:36:47 - ERROR - stderr - +2025-05-11 04:36:47 - INFO - stdout - {'loss': 0.7385, 'grad_norm': 0.6397947669029236, 'learning_rate': 1.0251093196903601e-05, 'epoch': 1.52} +2025-05-11 04:36:47 - ERROR - stderr - 51%|█████ | 1898/3741 [11:10:53<10:17:37, 20.11s/it] +2025-05-11 04:37:09 - ERROR - stderr - 51%|█████ | 1899/3741 [11:11:15<10:38:03, 20.78s/it] +2025-05-11 04:37:09 - ERROR - stderr - +2025-05-11 04:37:09 - ERROR - stderr - +2025-05-11 04:37:09 - INFO - stdout - {'loss': 0.7366, 'grad_norm': 0.6733710169792175, 'learning_rate': 1.0242436537845719e-05, 'epoch': 1.52} +2025-05-11 04:37:09 - ERROR - stderr - 51%|█████ | 1899/3741 [11:11:15<10:38:03, 20.78s/it] +2025-05-11 04:37:10 - INFO - stdout - WARNING: tokenization mismatch: 3181 vs. 3198. (ignored) +2025-05-11 04:37:29 - ERROR - stderr - 51%|█████ | 1900/3741 [11:11:35<10:26:38, 20.42s/it] +2025-05-11 04:37:29 - ERROR - stderr - +2025-05-11 04:37:29 - ERROR - stderr - +2025-05-11 04:37:29 - INFO - stdout - {'loss': 0.7702, 'grad_norm': 0.7009027600288391, 'learning_rate': 1.0233779697000667e-05, 'epoch': 1.52} +2025-05-11 04:37:29 - ERROR - stderr - 51%|█████ | 1900/3741 [11:11:35<10:26:38, 20.42s/it] +2025-05-11 04:37:51 - ERROR - stderr - 51%|█████ | 1901/3741 [11:11:57<10:44:56, 21.03s/it] +2025-05-11 04:37:51 - ERROR - stderr - +2025-05-11 04:37:51 - ERROR - stderr - +2025-05-11 04:37:51 - INFO - stdout - {'loss': 0.7649, 'grad_norm': 0.7578801512718201, 'learning_rate': 1.0225122680859633e-05, 'epoch': 1.52} +2025-05-11 04:37:51 - ERROR - stderr - 51%|█████ | 1901/3741 [11:11:57<10:44:56, 21.03s/it] +2025-05-11 04:38:11 - ERROR - stderr - 51%|█████ | 1902/3741 [11:12:17<10:32:31, 20.64s/it] +2025-05-11 04:38:11 - ERROR - stderr - +2025-05-11 04:38:11 - ERROR - stderr - +2025-05-11 04:38:11 - INFO - stdout - {'loss': 0.7646, 'grad_norm': 0.7465493083000183, 'learning_rate': 1.0216465495913947e-05, 'epoch': 1.53} +2025-05-11 04:38:11 - ERROR - stderr - 51%|█████ | 1902/3741 [11:12:17<10:32:31, 20.64s/it] +2025-05-11 04:38:31 - ERROR - stderr - 51%|█████ | 1903/3741 [11:12:37<10:27:58, 20.50s/it] +2025-05-11 04:38:31 - ERROR - stderr - +2025-05-11 04:38:31 - ERROR - stderr - +2025-05-11 04:38:31 - INFO - stdout - {'loss': 0.7389, 'grad_norm': 0.6907299160957336, 'learning_rate': 1.020780814865506e-05, 'epoch': 1.53} +2025-05-11 04:38:31 - ERROR - stderr - 51%|█████ | 1903/3741 [11:12:37<10:27:58, 20.50s/it] +2025-05-11 04:38:51 - ERROR - stderr - 51%|█████ | 1904/3741 [11:12:57<10:19:54, 20.25s/it] +2025-05-11 04:38:51 - ERROR - stderr - +2025-05-11 04:38:51 - ERROR - stderr - +2025-05-11 04:38:51 - INFO - stdout - {'loss': 0.7454, 'grad_norm': 0.682547926902771, 'learning_rate': 1.0199150645574548e-05, 'epoch': 1.53} +2025-05-11 04:38:51 - ERROR - stderr - 51%|█████ | 1904/3741 [11:12:57<10:19:54, 20.25s/it] +2025-05-11 04:39:11 - ERROR - stderr - 51%|█████ | 1905/3741 [11:13:17<10:19:46, 20.25s/it] +2025-05-11 04:39:11 - ERROR - stderr - +2025-05-11 04:39:11 - ERROR - stderr - +2025-05-11 04:39:11 - INFO - stdout - {'loss': 0.7432, 'grad_norm': 0.6859135031700134, 'learning_rate': 1.0190492993164101e-05, 'epoch': 1.53} +2025-05-11 04:39:11 - ERROR - stderr - 51%|█████ | 1905/3741 [11:13:17<10:19:46, 20.25s/it] +2025-05-11 04:39:31 - ERROR - stderr - 51%|█████ | 1906/3741 [11:13:37<10:14:53, 20.11s/it] +2025-05-11 04:39:31 - ERROR - stderr - +2025-05-11 04:39:31 - ERROR - stderr - +2025-05-11 04:39:31 - INFO - stdout - {'loss': 0.7214, 'grad_norm': 0.6617407202720642, 'learning_rate': 1.0181835197915515e-05, 'epoch': 1.53} +2025-05-11 04:39:31 - ERROR - stderr - 51%|█████ | 1906/3741 [11:13:37<10:14:53, 20.11s/it] +2025-05-11 04:39:50 - ERROR - stderr - 51%|█████ | 1907/3741 [11:13:57<10:10:35, 19.98s/it] +2025-05-11 04:39:50 - ERROR - stderr - +2025-05-11 04:39:50 - ERROR - stderr - +2025-05-11 04:39:50 - INFO - stdout - {'loss': 0.7437, 'grad_norm': 0.6514879465103149, 'learning_rate': 1.0173177266320706e-05, 'epoch': 1.53} +2025-05-11 04:39:50 - ERROR - stderr - 51%|█████ | 1907/3741 [11:13:57<10:10:35, 19.98s/it] +2025-05-11 04:40:10 - ERROR - stderr - 51%|█████ | 1908/3741 [11:14:17<10:09:14, 19.94s/it] +2025-05-11 04:40:10 - ERROR - stderr - +2025-05-11 04:40:10 - ERROR - stderr - +2025-05-11 04:40:10 - INFO - stdout - {'loss': 0.7648, 'grad_norm': 0.6830449104309082, 'learning_rate': 1.016451920487169e-05, 'epoch': 1.53} +2025-05-11 04:40:10 - ERROR - stderr - 51%|█████ | 1908/3741 [11:14:17<10:09:14, 19.94s/it] +2025-05-11 04:40:30 - ERROR - stderr - 51%|█████ | 1909/3741 [11:14:36<10:05:38, 19.84s/it] +2025-05-11 04:40:30 - ERROR - stderr - +2025-05-11 04:40:30 - ERROR - stderr - +2025-05-11 04:40:30 - INFO - stdout - {'loss': 0.7236, 'grad_norm': 0.6907112002372742, 'learning_rate': 1.0155861020060566e-05, 'epoch': 1.53} +2025-05-11 04:40:30 - ERROR - stderr - 51%|█████ | 1909/3741 [11:14:36<10:05:38, 19.84s/it] +2025-05-11 04:40:49 - ERROR - stderr - 51%|█████ | 1910/3741 [11:14:56<10:02:17, 19.74s/it] +2025-05-11 04:40:49 - ERROR - stderr - +2025-05-11 04:40:49 - ERROR - stderr - +2025-05-11 04:40:49 - INFO - stdout - {'loss': 0.7153, 'grad_norm': 0.6831691861152649, 'learning_rate': 1.0147202718379544e-05, 'epoch': 1.53} +2025-05-11 04:40:49 - ERROR - stderr - 51%|█████ | 1910/3741 [11:14:56<10:02:17, 19.74s/it] +2025-05-11 04:41:09 - ERROR - stderr - 51%|█████ | 1911/3741 [11:15:16<10:02:52, 19.77s/it] +2025-05-11 04:41:09 - ERROR - stderr - +2025-05-11 04:41:09 - ERROR - stderr - +2025-05-11 04:41:09 - INFO - stdout - {'loss': 0.7333, 'grad_norm': 0.6687254905700684, 'learning_rate': 1.013854430632091e-05, 'epoch': 1.53} +2025-05-11 04:41:09 - ERROR - stderr - 51%|█████ | 1911/3741 [11:15:16<10:02:52, 19.77s/it] +2025-05-11 04:41:32 - ERROR - stderr - 51%|█████ | 1912/3741 [11:15:38<10:30:52, 20.70s/it] +2025-05-11 04:41:32 - ERROR - stderr - +2025-05-11 04:41:32 - ERROR - stderr - +2025-05-11 04:41:32 - INFO - stdout - {'loss': 0.7489, 'grad_norm': 0.6817905902862549, 'learning_rate': 1.0129885790377034e-05, 'epoch': 1.53} +2025-05-11 04:41:32 - ERROR - stderr - 51%|█████ | 1912/3741 [11:15:38<10:30:52, 20.70s/it] +2025-05-11 04:41:52 - ERROR - stderr - 51%|█████ | 1913/3741 [11:15:58<10:23:44, 20.47s/it] +2025-05-11 04:41:52 - ERROR - stderr - +2025-05-11 04:41:52 - ERROR - stderr - +2025-05-11 04:41:52 - INFO - stdout - {'loss': 0.7337, 'grad_norm': 0.6689939498901367, 'learning_rate': 1.0121227177040373e-05, 'epoch': 1.53} +2025-05-11 04:41:52 - ERROR - stderr - 51%|█████ | 1913/3741 [11:15:58<10:23:44, 20.47s/it] +2025-05-11 04:42:13 - ERROR - stderr - 51%|█████ | 1914/3741 [11:16:19<10:27:27, 20.61s/it] +2025-05-11 04:42:13 - ERROR - stderr - +2025-05-11 04:42:13 - ERROR - stderr - +2025-05-11 04:42:13 - INFO - stdout - {'loss': 0.7522, 'grad_norm': 0.6985632181167603, 'learning_rate': 1.0112568472803443e-05, 'epoch': 1.53} +2025-05-11 04:42:13 - ERROR - stderr - 51%|█████ | 1914/3741 [11:16:19<10:27:27, 20.61s/it] +2025-05-11 04:42:33 - ERROR - stderr - 51%|█████ | 1915/3741 [11:16:39<10:20:57, 20.40s/it] +2025-05-11 04:42:33 - ERROR - stderr - +2025-05-11 04:42:33 - ERROR - stderr - +2025-05-11 04:42:33 - INFO - stdout - {'loss': 0.776, 'grad_norm': 0.7158617973327637, 'learning_rate': 1.0103909684158841e-05, 'epoch': 1.54} +2025-05-11 04:42:33 - ERROR - stderr - 51%|█████ | 1915/3741 [11:16:39<10:20:57, 20.40s/it] +2025-05-11 04:42:54 - ERROR - stderr - 51%|█████ | 1916/3741 [11:17:01<10:29:27, 20.69s/it] +2025-05-11 04:42:54 - ERROR - stderr - +2025-05-11 04:42:54 - ERROR - stderr - +2025-05-11 04:42:54 - INFO - stdout - {'loss': 0.7396, 'grad_norm': 0.671501874923706, 'learning_rate': 1.0095250817599218e-05, 'epoch': 1.54} +2025-05-11 04:42:54 - ERROR - stderr - 51%|█████ | 1916/3741 [11:17:01<10:29:27, 20.69s/it] +2025-05-11 04:43:14 - ERROR - stderr - 51%|█████ | 1917/3741 [11:17:20<10:19:31, 20.38s/it] +2025-05-11 04:43:14 - ERROR - stderr - +2025-05-11 04:43:14 - ERROR - stderr - +2025-05-11 04:43:14 - INFO - stdout - {'loss': 0.6984, 'grad_norm': 0.6825124025344849, 'learning_rate': 1.008659187961729e-05, 'epoch': 1.54} +2025-05-11 04:43:14 - ERROR - stderr - 51%|█████ | 1917/3741 [11:17:20<10:19:31, 20.38s/it] +2025-05-11 04:43:36 - ERROR - stderr - 51%|█████▏ | 1918/3741 [11:17:42<10:35:02, 20.90s/it] +2025-05-11 04:43:36 - ERROR - stderr - +2025-05-11 04:43:36 - ERROR - stderr - +2025-05-11 04:43:36 - INFO - stdout - {'loss': 0.7488, 'grad_norm': 0.6682149171829224, 'learning_rate': 1.0077932876705819e-05, 'epoch': 1.54} +2025-05-11 04:43:36 - ERROR - stderr - 51%|█████▏ | 1918/3741 [11:17:42<10:35:02, 20.90s/it] +2025-05-11 04:43:56 - ERROR - stderr - 51%|█████▏ | 1919/3741 [11:18:02<10:24:25, 20.56s/it] +2025-05-11 04:43:56 - ERROR - stderr - +2025-05-11 04:43:56 - ERROR - stderr - +2025-05-11 04:43:56 - INFO - stdout - {'loss': 0.7576, 'grad_norm': 0.6929824948310852, 'learning_rate': 1.0069273815357621e-05, 'epoch': 1.54} +2025-05-11 04:43:56 - ERROR - stderr - 51%|█████▏ | 1919/3741 [11:18:02<10:24:25, 20.56s/it] +2025-05-11 04:44:18 - ERROR - stderr - 51%|█████▏ | 1920/3741 [11:18:24<10:34:58, 20.92s/it] +2025-05-11 04:44:18 - ERROR - stderr - +2025-05-11 04:44:18 - ERROR - stderr - +2025-05-11 04:44:18 - INFO - stdout - {'loss': 0.7617, 'grad_norm': 0.6691644191741943, 'learning_rate': 1.006061470206556e-05, 'epoch': 1.54} +2025-05-11 04:44:18 - ERROR - stderr - 51%|█████▏ | 1920/3741 [11:18:24<10:34:58, 20.92s/it] +2025-05-11 04:44:37 - ERROR - stderr - 51%|█████▏ | 1921/3741 [11:18:43<10:22:02, 20.51s/it] +2025-05-11 04:44:37 - ERROR - stderr - +2025-05-11 04:44:37 - ERROR - stderr - +2025-05-11 04:44:37 - INFO - stdout - {'loss': 0.7602, 'grad_norm': 0.6622249484062195, 'learning_rate': 1.0051955543322533e-05, 'epoch': 1.54} +2025-05-11 04:44:37 - ERROR - stderr - 51%|█████▏ | 1921/3741 [11:18:43<10:22:02, 20.51s/it] +2025-05-11 04:44:57 - ERROR - stderr - 51%|█████▏ | 1922/3741 [11:19:03<10:14:26, 20.27s/it] +2025-05-11 04:44:57 - ERROR - stderr - +2025-05-11 04:44:57 - ERROR - stderr - +2025-05-11 04:44:57 - INFO - stdout - {'loss': 0.7423, 'grad_norm': 0.6343883275985718, 'learning_rate': 1.0043296345621467e-05, 'epoch': 1.54} +2025-05-11 04:44:57 - ERROR - stderr - 51%|█████▏ | 1922/3741 [11:19:03<10:14:26, 20.27s/it] +2025-05-11 04:45:16 - ERROR - stderr - 51%|█████▏ | 1923/3741 [11:19:23<10:08:03, 20.07s/it] +2025-05-11 04:45:16 - ERROR - stderr - +2025-05-11 04:45:16 - ERROR - stderr - +2025-05-11 04:45:16 - INFO - stdout - {'loss': 0.7551, 'grad_norm': 0.668969452381134, 'learning_rate': 1.0034637115455327e-05, 'epoch': 1.54} +2025-05-11 04:45:16 - ERROR - stderr - 51%|█████▏ | 1923/3741 [11:19:23<10:08:03, 20.07s/it] +2025-05-11 04:45:36 - ERROR - stderr - 51%|█████▏ | 1924/3741 [11:19:42<10:04:47, 19.97s/it] +2025-05-11 04:45:36 - ERROR - stderr - +2025-05-11 04:45:36 - ERROR - stderr - +2025-05-11 04:45:36 - INFO - stdout - {'loss': 0.716, 'grad_norm': 0.7144033908843994, 'learning_rate': 1.0025977859317097e-05, 'epoch': 1.54} +2025-05-11 04:45:36 - ERROR - stderr - 51%|█████▏ | 1924/3741 [11:19:42<10:04:47, 19.97s/it] +2025-05-11 04:45:56 - ERROR - stderr - 51%|█████▏ | 1925/3741 [11:20:02<9:59:18, 19.80s/it] +2025-05-11 04:45:56 - ERROR - stderr - +2025-05-11 04:45:56 - ERROR - stderr - +2025-05-11 04:45:56 - INFO - stdout - {'loss': 0.7375, 'grad_norm': 0.6621528267860413, 'learning_rate': 1.0017318583699786e-05, 'epoch': 1.54} +2025-05-11 04:45:56 - ERROR - stderr - 51%|█████▏ | 1925/3741 [11:20:02<9:59:18, 19.80s/it] +2025-05-11 04:46:15 - ERROR - stderr - 51%|█████▏ | 1926/3741 [11:20:21<9:56:39, 19.72s/it] +2025-05-11 04:46:15 - ERROR - stderr - +2025-05-11 04:46:15 - ERROR - stderr - +2025-05-11 04:46:15 - INFO - stdout - {'loss': 0.8017, 'grad_norm': 0.7118502259254456, 'learning_rate': 1.0008659295096412e-05, 'epoch': 1.54} +2025-05-11 04:46:15 - ERROR - stderr - 51%|█████▏ | 1926/3741 [11:20:21<9:56:39, 19.72s/it] +2025-05-11 04:46:36 - ERROR - stderr - 52%|█████▏ | 1927/3741 [11:20:42<10:04:56, 20.01s/it] +2025-05-11 04:46:36 - ERROR - stderr - +2025-05-11 04:46:36 - ERROR - stderr - +2025-05-11 04:46:36 - INFO - stdout - {'loss': 0.7292, 'grad_norm': 0.6664971113204956, 'learning_rate': 1e-05, 'epoch': 1.55} +2025-05-11 04:46:36 - ERROR - stderr - 52%|█████▏ | 1927/3741 [11:20:42<10:04:56, 20.01s/it] +2025-05-11 04:46:55 - ERROR - stderr - 52%|█████▏ | 1928/3741 [11:21:02<10:00:42, 19.88s/it] +2025-05-11 04:46:55 - ERROR - stderr - +2025-05-11 04:46:55 - ERROR - stderr - +2025-05-11 04:46:55 - INFO - stdout - {'loss': 0.7586, 'grad_norm': 0.7016831636428833, 'learning_rate': 9.991340704903593e-06, 'epoch': 1.55} +2025-05-11 04:46:55 - ERROR - stderr - 52%|█████▏ | 1928/3741 [11:21:02<10:00:42, 19.88s/it] +2025-05-11 04:47:17 - ERROR - stderr - 52%|█████▏ | 1929/3741 [11:21:23<10:13:54, 20.33s/it] +2025-05-11 04:47:17 - ERROR - stderr - +2025-05-11 04:47:17 - ERROR - stderr - +2025-05-11 04:47:17 - INFO - stdout - {'loss': 0.7695, 'grad_norm': 0.7355296015739441, 'learning_rate': 9.982681416300217e-06, 'epoch': 1.55} +2025-05-11 04:47:17 - ERROR - stderr - 52%|█████▏ | 1929/3741 [11:21:23<10:13:54, 20.33s/it] +2025-05-11 04:47:36 - ERROR - stderr - 52%|█████▏ | 1930/3741 [11:21:42<10:04:27, 20.03s/it] +2025-05-11 04:47:36 - ERROR - stderr - +2025-05-11 04:47:36 - ERROR - stderr - +2025-05-11 04:47:36 - INFO - stdout - {'loss': 0.7447, 'grad_norm': 0.7109652757644653, 'learning_rate': 9.974022140682906e-06, 'epoch': 1.55} +2025-05-11 04:47:36 - ERROR - stderr - 52%|█████▏ | 1930/3741 [11:21:42<10:04:27, 20.03s/it] +2025-05-11 04:47:58 - ERROR - stderr - 52%|█████▏ | 1931/3741 [11:22:04<10:20:32, 20.57s/it] +2025-05-11 04:47:58 - ERROR - stderr - +2025-05-11 04:47:58 - ERROR - stderr - +2025-05-11 04:47:58 - INFO - stdout - {'loss': 0.7483, 'grad_norm': 0.8703607320785522, 'learning_rate': 9.965362884544674e-06, 'epoch': 1.55} +2025-05-11 04:47:58 - ERROR - stderr - 52%|█████▏ | 1931/3741 [11:22:04<10:20:32, 20.57s/it] +2025-05-11 04:48:18 - ERROR - stderr - 52%|█████▏ | 1932/3741 [11:22:24<10:16:24, 20.44s/it] +2025-05-11 04:48:18 - ERROR - stderr - +2025-05-11 04:48:18 - ERROR - stderr - +2025-05-11 04:48:18 - INFO - stdout - {'loss': 0.7431, 'grad_norm': 0.6655393242835999, 'learning_rate': 9.956703654378536e-06, 'epoch': 1.55} +2025-05-11 04:48:18 - ERROR - stderr - 52%|█████▏ | 1932/3741 [11:22:24<10:16:24, 20.44s/it] +2025-05-11 04:48:40 - ERROR - stderr - 52%|█████▏ | 1933/3741 [11:22:46<10:26:03, 20.78s/it] +2025-05-11 04:48:40 - ERROR - stderr - +2025-05-11 04:48:40 - ERROR - stderr - +2025-05-11 04:48:40 - INFO - stdout - {'loss': 0.6951, 'grad_norm': 0.6632983088493347, 'learning_rate': 9.948044456677472e-06, 'epoch': 1.55} +2025-05-11 04:48:40 - ERROR - stderr - 52%|█████▏ | 1933/3741 [11:22:46<10:26:03, 20.78s/it] +2025-05-11 04:48:59 - ERROR - stderr - 52%|█████▏ | 1934/3741 [11:23:06<10:15:56, 20.45s/it] +2025-05-11 04:48:59 - ERROR - stderr - +2025-05-11 04:48:59 - ERROR - stderr - +2025-05-11 04:48:59 - INFO - stdout - {'loss': 0.7628, 'grad_norm': 0.7212697267532349, 'learning_rate': 9.939385297934441e-06, 'epoch': 1.55} +2025-05-11 04:48:59 - ERROR - stderr - 52%|█████▏ | 1934/3741 [11:23:06<10:15:56, 20.45s/it] +2025-05-11 04:49:21 - ERROR - stderr - 52%|█████▏ | 1935/3741 [11:23:28<10:31:37, 20.98s/it] +2025-05-11 04:49:22 - ERROR - stderr - +2025-05-11 04:49:22 - ERROR - stderr - +2025-05-11 04:49:22 - INFO - stdout - {'loss': 0.7403, 'grad_norm': 0.6794565916061401, 'learning_rate': 9.930726184642382e-06, 'epoch': 1.55} +2025-05-11 04:49:22 - ERROR - stderr - 52%|█████▏ | 1935/3741 [11:23:28<10:31:37, 20.98s/it] +2025-05-11 04:49:22 - INFO - stdout - WARNING: tokenization mismatch: 3183 vs. 3209. (ignored) +2025-05-11 04:49:41 - ERROR - stderr - 52%|█████▏ | 1936/3741 [11:23:48<10:21:44, 20.67s/it] +2025-05-11 04:49:41 - ERROR - stderr - +2025-05-11 04:49:41 - ERROR - stderr - +2025-05-11 04:49:41 - INFO - stdout - {'loss': 0.7689, 'grad_norm': 0.7048789262771606, 'learning_rate': 9.922067123294183e-06, 'epoch': 1.55} +2025-05-11 04:49:41 - ERROR - stderr - 52%|█████▏ | 1936/3741 [11:23:48<10:21:44, 20.67s/it] +2025-05-11 04:50:05 - ERROR - stderr - 52%|█████▏ | 1937/3741 [11:24:11<10:43:26, 21.40s/it] +2025-05-11 04:50:05 - ERROR - stderr - +2025-05-11 04:50:05 - ERROR - stderr - +2025-05-11 04:50:05 - INFO - stdout - {'loss': 0.7855, 'grad_norm': 0.7070860862731934, 'learning_rate': 9.913408120382714e-06, 'epoch': 1.55} +2025-05-11 04:50:05 - ERROR - stderr - 52%|█████▏ | 1937/3741 [11:24:11<10:43:26, 21.40s/it] +2025-05-11 04:50:24 - ERROR - stderr - 52%|█████▏ | 1938/3741 [11:24:30<10:25:26, 20.81s/it] +2025-05-11 04:50:24 - ERROR - stderr - +2025-05-11 04:50:24 - ERROR - stderr - +2025-05-11 04:50:24 - INFO - stdout - {'loss': 0.7341, 'grad_norm': 0.681030809879303, 'learning_rate': 9.904749182400786e-06, 'epoch': 1.55} +2025-05-11 04:50:24 - ERROR - stderr - 52%|█████▏ | 1938/3741 [11:24:30<10:25:26, 20.81s/it] +2025-05-11 04:50:46 - ERROR - stderr - 52%|█████▏ | 1939/3741 [11:24:53<10:40:02, 21.31s/it] +2025-05-11 04:50:46 - ERROR - stderr - +2025-05-11 04:50:46 - ERROR - stderr - +2025-05-11 04:50:46 - INFO - stdout - {'loss': 0.7725, 'grad_norm': 0.6738923788070679, 'learning_rate': 9.896090315841162e-06, 'epoch': 1.55} +2025-05-11 04:50:46 - ERROR - stderr - 52%|█████▏ | 1939/3741 [11:24:53<10:40:02, 21.31s/it] +2025-05-11 04:51:06 - ERROR - stderr - 52%|█████▏ | 1940/3741 [11:25:12<10:23:24, 20.77s/it] +2025-05-11 04:51:06 - ERROR - stderr - +2025-05-11 04:51:06 - ERROR - stderr - +2025-05-11 04:51:06 - INFO - stdout - {'loss': 0.7083, 'grad_norm': 0.6607416868209839, 'learning_rate': 9.88743152719656e-06, 'epoch': 1.56} +2025-05-11 04:51:06 - ERROR - stderr - 52%|█████▏ | 1940/3741 [11:25:12<10:23:24, 20.77s/it] +2025-05-11 04:51:26 - ERROR - stderr - 52%|█████▏ | 1941/3741 [11:25:32<10:14:47, 20.49s/it] +2025-05-11 04:51:26 - ERROR - stderr - +2025-05-11 04:51:26 - ERROR - stderr - +2025-05-11 04:51:26 - INFO - stdout - {'loss': 0.7544, 'grad_norm': 0.6659730672836304, 'learning_rate': 9.878772822959628e-06, 'epoch': 1.56} +2025-05-11 04:51:26 - ERROR - stderr - 52%|█████▏ | 1941/3741 [11:25:32<10:14:47, 20.49s/it] +2025-05-11 04:51:45 - ERROR - stderr - 52%|█████▏ | 1942/3741 [11:25:52<10:07:05, 20.25s/it] +2025-05-11 04:51:45 - ERROR - stderr - +2025-05-11 04:51:45 - ERROR - stderr - +2025-05-11 04:51:45 - INFO - stdout - {'loss': 0.741, 'grad_norm': 0.669576108455658, 'learning_rate': 9.870114209622969e-06, 'epoch': 1.56} +2025-05-11 04:51:45 - ERROR - stderr - 52%|█████▏ | 1942/3741 [11:25:52<10:07:05, 20.25s/it] +2025-05-11 04:52:05 - ERROR - stderr - 52%|█████▏ | 1943/3741 [11:26:12<10:01:59, 20.09s/it] +2025-05-11 04:52:05 - ERROR - stderr - +2025-05-11 04:52:05 - ERROR - stderr - +2025-05-11 04:52:05 - INFO - stdout - {'loss': 0.7228, 'grad_norm': 0.6290959715843201, 'learning_rate': 9.861455693679096e-06, 'epoch': 1.56} +2025-05-11 04:52:05 - ERROR - stderr - 52%|█████▏ | 1943/3741 [11:26:12<10:01:59, 20.09s/it] +2025-05-11 04:52:26 - ERROR - stderr - 52%|█████▏ | 1944/3741 [11:26:32<10:04:48, 20.19s/it] +2025-05-11 04:52:26 - ERROR - stderr - +2025-05-11 04:52:26 - ERROR - stderr - +2025-05-11 04:52:26 - INFO - stdout - {'loss': 0.716, 'grad_norm': 0.6587105393409729, 'learning_rate': 9.852797281620459e-06, 'epoch': 1.56} +2025-05-11 04:52:26 - ERROR - stderr - 52%|█████▏ | 1944/3741 [11:26:32<10:04:48, 20.19s/it] +2025-05-11 04:52:45 - ERROR - stderr - 52%|█████▏ | 1945/3741 [11:26:51<9:58:20, 19.99s/it] +2025-05-11 04:52:45 - ERROR - stderr - +2025-05-11 04:52:45 - ERROR - stderr - +2025-05-11 04:52:45 - INFO - stdout - {'loss': 0.7684, 'grad_norm': 0.7020542621612549, 'learning_rate': 9.844138979939437e-06, 'epoch': 1.56} +2025-05-11 04:52:45 - ERROR - stderr - 52%|█████▏ | 1945/3741 [11:26:51<9:58:20, 19.99s/it] +2025-05-11 04:53:06 - ERROR - stderr - 52%|█████▏ | 1946/3741 [11:27:12<10:07:19, 20.30s/it] +2025-05-11 04:53:06 - ERROR - stderr - +2025-05-11 04:53:06 - ERROR - stderr - +2025-05-11 04:53:06 - INFO - stdout - {'loss': 0.7699, 'grad_norm': 0.6784189343452454, 'learning_rate': 9.835480795128314e-06, 'epoch': 1.56} +2025-05-11 04:53:06 - ERROR - stderr - 52%|█████▏ | 1946/3741 [11:27:13<10:07:19, 20.30s/it] +2025-05-11 04:53:26 - ERROR - stderr - 52%|█████▏ | 1947/3741 [11:27:32<10:00:47, 20.09s/it] +2025-05-11 04:53:26 - ERROR - stderr - +2025-05-11 04:53:26 - ERROR - stderr - +2025-05-11 04:53:26 - INFO - stdout - {'loss': 0.7578, 'grad_norm': 0.6841933727264404, 'learning_rate': 9.826822733679296e-06, 'epoch': 1.56} +2025-05-11 04:53:26 - ERROR - stderr - 52%|█████▏ | 1947/3741 [11:27:32<10:00:47, 20.09s/it] +2025-05-11 04:53:48 - ERROR - stderr - 52%|█████▏ | 1948/3741 [11:27:54<10:15:27, 20.60s/it] +2025-05-11 04:53:48 - ERROR - stderr - +2025-05-11 04:53:48 - ERROR - stderr - +2025-05-11 04:53:48 - INFO - stdout - {'loss': 0.7228, 'grad_norm': 0.7108730673789978, 'learning_rate': 9.81816480208449e-06, 'epoch': 1.56} +2025-05-11 04:53:48 - ERROR - stderr - 52%|█████▏ | 1948/3741 [11:27:54<10:15:27, 20.60s/it] +2025-05-11 04:54:07 - ERROR - stderr - 52%|█████▏ | 1949/3741 [11:28:14<10:08:55, 20.39s/it] +2025-05-11 04:54:07 - ERROR - stderr - +2025-05-11 04:54:07 - ERROR - stderr - +2025-05-11 04:54:07 - INFO - stdout - {'loss': 0.7476, 'grad_norm': 0.6719072461128235, 'learning_rate': 9.809507006835904e-06, 'epoch': 1.56} +2025-05-11 04:54:07 - ERROR - stderr - 52%|█████▏ | 1949/3741 [11:28:14<10:08:55, 20.39s/it] +2025-05-11 04:54:30 - ERROR - stderr - 52%|█████▏ | 1950/3741 [11:28:36<10:28:30, 21.06s/it] +2025-05-11 04:54:30 - ERROR - stderr - +2025-05-11 04:54:30 - ERROR - stderr - +2025-05-11 04:54:30 - INFO - stdout - {'loss': 0.7254, 'grad_norm': 0.6494175791740417, 'learning_rate': 9.800849354425455e-06, 'epoch': 1.56} +2025-05-11 04:54:30 - ERROR - stderr - 52%|█████▏ | 1950/3741 [11:28:36<10:28:30, 21.06s/it] +2025-05-11 04:54:50 - ERROR - stderr - 52%|█████▏ | 1951/3741 [11:28:56<10:15:55, 20.65s/it] +2025-05-11 04:54:50 - ERROR - stderr - +2025-05-11 04:54:50 - ERROR - stderr - +2025-05-11 04:54:50 - INFO - stdout - {'loss': 0.7566, 'grad_norm': 0.6985930800437927, 'learning_rate': 9.79219185134494e-06, 'epoch': 1.56} +2025-05-11 04:54:50 - ERROR - stderr - 52%|█████▏ | 1951/3741 [11:28:56<10:15:55, 20.65s/it] +2025-05-11 04:55:10 - ERROR - stderr - 52%|█████▏ | 1952/3741 [11:29:17<10:15:02, 20.63s/it] +2025-05-11 04:55:10 - ERROR - stderr - +2025-05-11 04:55:10 - ERROR - stderr - +2025-05-11 04:55:10 - INFO - stdout - {'loss': 0.7368, 'grad_norm': 0.6902778744697571, 'learning_rate': 9.783534504086055e-06, 'epoch': 1.57} +2025-05-11 04:55:10 - ERROR - stderr - 52%|█████▏ | 1952/3741 [11:29:17<10:15:02, 20.63s/it] +2025-05-11 04:55:30 - ERROR - stderr - 52%|█████▏ | 1953/3741 [11:29:36<10:04:39, 20.29s/it] +2025-05-11 04:55:30 - ERROR - stderr - +2025-05-11 04:55:30 - ERROR - stderr - +2025-05-11 04:55:30 - INFO - stdout - {'loss': 0.7191, 'grad_norm': 0.6868149042129517, 'learning_rate': 9.774877319140372e-06, 'epoch': 1.57} +2025-05-11 04:55:30 - ERROR - stderr - 52%|█████▏ | 1953/3741 [11:29:36<10:04:39, 20.29s/it] +2025-05-11 04:55:51 - ERROR - stderr - 52%|█████▏ | 1954/3741 [11:29:57<10:10:14, 20.49s/it] +2025-05-11 04:55:51 - ERROR - stderr - +2025-05-11 04:55:51 - ERROR - stderr - +2025-05-11 04:55:51 - INFO - stdout - {'loss': 0.7588, 'grad_norm': 0.6847664713859558, 'learning_rate': 9.766220302999336e-06, 'epoch': 1.57} +2025-05-11 04:55:51 - ERROR - stderr - 52%|█████▏ | 1954/3741 [11:29:57<10:10:14, 20.49s/it] +2025-05-11 04:56:10 - ERROR - stderr - 52%|█████▏ | 1955/3741 [11:30:17<10:01:52, 20.22s/it] +2025-05-11 04:56:10 - ERROR - stderr - +2025-05-11 04:56:10 - ERROR - stderr - +2025-05-11 04:56:10 - INFO - stdout - {'loss': 0.7608, 'grad_norm': 0.6886143088340759, 'learning_rate': 9.757563462154283e-06, 'epoch': 1.57} +2025-05-11 04:56:10 - ERROR - stderr - 52%|█████▏ | 1955/3741 [11:30:17<10:01:52, 20.22s/it] +2025-05-11 04:56:33 - ERROR - stderr - 52%|█████▏ | 1956/3741 [11:30:39<10:20:20, 20.85s/it] +2025-05-11 04:56:33 - ERROR - stderr - +2025-05-11 04:56:33 - ERROR - stderr - +2025-05-11 04:56:33 - INFO - stdout - {'loss': 0.7273, 'grad_norm': 0.7180765271186829, 'learning_rate': 9.7489068030964e-06, 'epoch': 1.57} +2025-05-11 04:56:33 - ERROR - stderr - 52%|█████▏ | 1956/3741 [11:30:39<10:20:20, 20.85s/it] +2025-05-11 04:56:52 - ERROR - stderr - 52%|█████▏ | 1957/3741 [11:30:59<10:09:52, 20.51s/it] +2025-05-11 04:56:52 - ERROR - stderr - +2025-05-11 04:56:52 - ERROR - stderr - +2025-05-11 04:56:52 - INFO - stdout - {'loss': 0.7549, 'grad_norm': 0.6763858795166016, 'learning_rate': 9.74025033231675e-06, 'epoch': 1.57} +2025-05-11 04:56:52 - ERROR - stderr - 52%|█████▏ | 1957/3741 [11:30:59<10:09:52, 20.51s/it] +2025-05-11 04:57:15 - ERROR - stderr - 52%|█████▏ | 1958/3741 [11:31:22<10:31:54, 21.26s/it] +2025-05-11 04:57:15 - ERROR - stderr - +2025-05-11 04:57:15 - ERROR - stderr - +2025-05-11 04:57:15 - INFO - stdout - {'loss': 0.7512, 'grad_norm': 0.6915479898452759, 'learning_rate': 9.731594056306248e-06, 'epoch': 1.57} +2025-05-11 04:57:15 - ERROR - stderr - 52%|█████▏ | 1958/3741 [11:31:22<10:31:54, 21.26s/it] +2025-05-11 04:57:35 - ERROR - stderr - 52%|█████▏ | 1959/3741 [11:31:41<10:17:00, 20.77s/it] +2025-05-11 04:57:35 - ERROR - stderr - +2025-05-11 04:57:35 - ERROR - stderr - +2025-05-11 04:57:35 - INFO - stdout - {'loss': 0.7536, 'grad_norm': 0.6777629852294922, 'learning_rate': 9.72293798155566e-06, 'epoch': 1.57} +2025-05-11 04:57:35 - ERROR - stderr - 52%|█████▏ | 1959/3741 [11:31:41<10:17:00, 20.77s/it] +2025-05-11 04:57:55 - ERROR - stderr - 52%|█████▏ | 1960/3741 [11:32:01<10:08:36, 20.50s/it] +2025-05-11 04:57:55 - ERROR - stderr - +2025-05-11 04:57:55 - ERROR - stderr - +2025-05-11 04:57:55 - INFO - stdout - {'loss': 0.7347, 'grad_norm': 0.6455657482147217, 'learning_rate': 9.714282114555613e-06, 'epoch': 1.57} +2025-05-11 04:57:55 - ERROR - stderr - 52%|█████▏ | 1960/3741 [11:32:01<10:08:36, 20.50s/it] +2025-05-11 04:58:15 - ERROR - stderr - 52%|█████▏ | 1961/3741 [11:32:21<10:01:07, 20.26s/it] +2025-05-11 04:58:15 - ERROR - stderr - +2025-05-11 04:58:15 - ERROR - stderr - +2025-05-11 04:58:15 - INFO - stdout - {'loss': 0.7709, 'grad_norm': 0.700589656829834, 'learning_rate': 9.70562646179656e-06, 'epoch': 1.57} +2025-05-11 04:58:15 - ERROR - stderr - 52%|█████▏ | 1961/3741 [11:32:21<10:01:07, 20.26s/it] +2025-05-11 04:58:34 - ERROR - stderr - 52%|█████▏ | 1962/3741 [11:32:41<9:55:23, 20.08s/it] +2025-05-11 04:58:34 - ERROR - stderr - +2025-05-11 04:58:34 - ERROR - stderr - +2025-05-11 04:58:34 - INFO - stdout - {'loss': 0.7816, 'grad_norm': 0.6736430525779724, 'learning_rate': 9.696971029768817e-06, 'epoch': 1.57} +2025-05-11 04:58:34 - ERROR - stderr - 52%|█████▏ | 1962/3741 [11:32:41<9:55:23, 20.08s/it] +2025-05-11 04:58:54 - ERROR - stderr - 52%|█████▏ | 1963/3741 [11:33:01<9:53:19, 20.02s/it] +2025-05-11 04:58:54 - ERROR - stderr - +2025-05-11 04:58:54 - ERROR - stderr - +2025-05-11 04:58:54 - INFO - stdout - {'loss': 0.7248, 'grad_norm': 0.6946107149124146, 'learning_rate': 9.688315824962516e-06, 'epoch': 1.57} +2025-05-11 04:58:54 - ERROR - stderr - 52%|█████▏ | 1963/3741 [11:33:01<9:53:19, 20.02s/it] +2025-05-11 04:59:14 - ERROR - stderr - 52%|█████▏ | 1964/3741 [11:33:20<9:48:01, 19.85s/it] +2025-05-11 04:59:14 - ERROR - stderr - +2025-05-11 04:59:14 - ERROR - stderr - +2025-05-11 04:59:14 - INFO - stdout - {'loss': 0.7486, 'grad_norm': 0.6668774485588074, 'learning_rate': 9.679660853867621e-06, 'epoch': 1.57} +2025-05-11 04:59:14 - ERROR - stderr - 52%|█████▏ | 1964/3741 [11:33:20<9:48:01, 19.85s/it] +2025-05-11 04:59:33 - ERROR - stderr - 53%|█████▎ | 1965/3741 [11:33:40<9:44:51, 19.76s/it] +2025-05-11 04:59:33 - ERROR - stderr - +2025-05-11 04:59:33 - ERROR - stderr - +2025-05-11 04:59:33 - INFO - stdout - {'loss': 0.7377, 'grad_norm': 0.7241086363792419, 'learning_rate': 9.67100612297393e-06, 'epoch': 1.58} +2025-05-11 04:59:33 - ERROR - stderr - 53%|█████▎ | 1965/3741 [11:33:40<9:44:51, 19.76s/it] +2025-05-11 04:59:53 - ERROR - stderr - 53%|█████▎ | 1966/3741 [11:33:59<9:43:26, 19.72s/it] +2025-05-11 04:59:53 - ERROR - stderr - +2025-05-11 04:59:53 - ERROR - stderr - +2025-05-11 04:59:53 - INFO - stdout - {'loss': 0.7387, 'grad_norm': 0.6383355259895325, 'learning_rate': 9.662351638771049e-06, 'epoch': 1.58} +2025-05-11 04:59:53 - ERROR - stderr - 53%|█████▎ | 1966/3741 [11:33:59<9:43:26, 19.72s/it] +2025-05-11 05:00:15 - ERROR - stderr - 53%|█████▎ | 1967/3741 [11:34:21<10:00:40, 20.32s/it] +2025-05-11 05:00:15 - ERROR - stderr - +2025-05-11 05:00:15 - ERROR - stderr - +2025-05-11 05:00:15 - INFO - stdout - {'loss': 0.7487, 'grad_norm': 0.6955791115760803, 'learning_rate': 9.653697407748412e-06, 'epoch': 1.58} +2025-05-11 05:00:15 - ERROR - stderr - 53%|█████▎ | 1967/3741 [11:34:21<10:00:40, 20.32s/it] +2025-05-11 05:00:34 - ERROR - stderr - 53%|█████▎ | 1968/3741 [11:34:40<9:52:38, 20.06s/it] +2025-05-11 05:00:34 - ERROR - stderr - +2025-05-11 05:00:34 - ERROR - stderr - +2025-05-11 05:00:34 - INFO - stdout - {'loss': 0.7984, 'grad_norm': 0.6960842609405518, 'learning_rate': 9.645043436395253e-06, 'epoch': 1.58} +2025-05-11 05:00:34 - ERROR - stderr - 53%|█████▎ | 1968/3741 [11:34:40<9:52:38, 20.06s/it] +2025-05-11 05:00:56 - ERROR - stderr - 53%|█████▎ | 1969/3741 [11:35:02<10:09:04, 20.62s/it] +2025-05-11 05:00:56 - ERROR - stderr - +2025-05-11 05:00:56 - ERROR - stderr - +2025-05-11 05:00:56 - INFO - stdout - {'loss': 0.7261, 'grad_norm': 0.7226347923278809, 'learning_rate': 9.63638973120061e-06, 'epoch': 1.58} +2025-05-11 05:00:56 - ERROR - stderr - 53%|█████▎ | 1969/3741 [11:35:02<10:09:04, 20.62s/it] +2025-05-11 05:01:16 - ERROR - stderr - 53%|█████▎ | 1970/3741 [11:35:22<10:04:40, 20.49s/it] +2025-05-11 05:01:16 - ERROR - stderr - +2025-05-11 05:01:16 - ERROR - stderr - +2025-05-11 05:01:16 - INFO - stdout - {'loss': 0.732, 'grad_norm': 0.6601559519767761, 'learning_rate': 9.627736298653332e-06, 'epoch': 1.58} +2025-05-11 05:01:16 - ERROR - stderr - 53%|█████▎ | 1970/3741 [11:35:22<10:04:40, 20.49s/it] +2025-05-11 05:01:38 - ERROR - stderr - 53%|█████▎ | 1971/3741 [11:35:45<10:20:21, 21.03s/it] +2025-05-11 05:01:38 - ERROR - stderr - +2025-05-11 05:01:38 - ERROR - stderr - +2025-05-11 05:01:38 - INFO - stdout - {'loss': 0.7392, 'grad_norm': 0.6827449798583984, 'learning_rate': 9.619083145242053e-06, 'epoch': 1.58} +2025-05-11 05:01:38 - ERROR - stderr - 53%|█████▎ | 1971/3741 [11:35:45<10:20:21, 21.03s/it] +2025-05-11 05:01:59 - ERROR - stderr - 53%|█████▎ | 1972/3741 [11:36:05<10:13:06, 20.80s/it] +2025-05-11 05:01:59 - ERROR - stderr - +2025-05-11 05:01:59 - ERROR - stderr - +2025-05-11 05:01:59 - INFO - stdout - {'loss': 0.7435, 'grad_norm': 0.6553324460983276, 'learning_rate': 9.610430277455209e-06, 'epoch': 1.58} +2025-05-11 05:01:59 - ERROR - stderr - 53%|█████▎ | 1972/3741 [11:36:05<10:13:06, 20.80s/it] +2025-05-11 05:02:21 - ERROR - stderr - 53%|█████▎ | 1973/3741 [11:36:27<10:24:26, 21.19s/it] +2025-05-11 05:02:21 - ERROR - stderr - +2025-05-11 05:02:21 - ERROR - stderr - +2025-05-11 05:02:21 - INFO - stdout - {'loss': 0.7591, 'grad_norm': 0.6525527238845825, 'learning_rate': 9.601777701781009e-06, 'epoch': 1.58} +2025-05-11 05:02:21 - ERROR - stderr - 53%|█████▎ | 1973/3741 [11:36:27<10:24:26, 21.19s/it] +2025-05-11 05:02:40 - ERROR - stderr - 53%|█████▎ | 1974/3741 [11:36:47<10:09:30, 20.70s/it] +2025-05-11 05:02:40 - ERROR - stderr - +2025-05-11 05:02:40 - ERROR - stderr - +2025-05-11 05:02:40 - INFO - stdout - {'loss': 0.7414, 'grad_norm': 0.6529831290245056, 'learning_rate': 9.593125424707446e-06, 'epoch': 1.58} +2025-05-11 05:02:40 - ERROR - stderr - 53%|█████▎ | 1974/3741 [11:36:47<10:09:30, 20.70s/it] +2025-05-11 05:03:03 - ERROR - stderr - 53%|█████▎ | 1975/3741 [11:37:09<10:26:11, 21.27s/it] +2025-05-11 05:03:03 - ERROR - stderr - +2025-05-11 05:03:03 - ERROR - stderr - +2025-05-11 05:03:03 - INFO - stdout - {'loss': 0.7597, 'grad_norm': 0.6787192821502686, 'learning_rate': 9.584473452722299e-06, 'epoch': 1.58} +2025-05-11 05:03:03 - ERROR - stderr - 53%|█████▎ | 1975/3741 [11:37:09<10:26:11, 21.27s/it] +2025-05-11 05:03:22 - ERROR - stderr - 53%|█████▎ | 1976/3741 [11:37:29<10:09:39, 20.73s/it] +2025-05-11 05:03:22 - ERROR - stderr - +2025-05-11 05:03:22 - ERROR - stderr - +2025-05-11 05:03:22 - INFO - stdout - {'loss': 0.7418, 'grad_norm': 0.6774669289588928, 'learning_rate': 9.575821792313108e-06, 'epoch': 1.58} +2025-05-11 05:03:22 - ERROR - stderr - 53%|█████▎ | 1976/3741 [11:37:29<10:09:39, 20.73s/it] +2025-05-11 05:03:45 - ERROR - stderr - 53%|█████▎ | 1977/3741 [11:37:52<10:27:49, 21.35s/it] +2025-05-11 05:03:45 - ERROR - stderr - +2025-05-11 05:03:45 - ERROR - stderr - +2025-05-11 05:03:45 - INFO - stdout - {'loss': 0.6952, 'grad_norm': 0.6893562078475952, 'learning_rate': 9.567170449967183e-06, 'epoch': 1.59} +2025-05-11 05:03:45 - ERROR - stderr - 53%|█████▎ | 1977/3741 [11:37:52<10:27:49, 21.35s/it] +2025-05-11 05:04:05 - ERROR - stderr - 53%|█████▎ | 1978/3741 [11:38:11<10:11:47, 20.82s/it] +2025-05-11 05:04:05 - ERROR - stderr - +2025-05-11 05:04:05 - ERROR - stderr - +2025-05-11 05:04:05 - INFO - stdout - {'loss': 0.7763, 'grad_norm': 0.6523553133010864, 'learning_rate': 9.558519432171597e-06, 'epoch': 1.59} +2025-05-11 05:04:05 - ERROR - stderr - 53%|█████▎ | 1978/3741 [11:38:11<10:11:47, 20.82s/it] +2025-05-11 05:04:27 - ERROR - stderr - 53%|█████▎ | 1979/3741 [11:38:33<10:23:22, 21.23s/it] +2025-05-11 05:04:27 - ERROR - stderr - +2025-05-11 05:04:27 - ERROR - stderr - +2025-05-11 05:04:27 - INFO - stdout - {'loss': 0.7025, 'grad_norm': 0.668114960193634, 'learning_rate': 9.549868745413172e-06, 'epoch': 1.59} +2025-05-11 05:04:27 - ERROR - stderr - 53%|█████▎ | 1979/3741 [11:38:33<10:23:22, 21.23s/it] +2025-05-11 05:04:46 - ERROR - stderr - 53%|█████▎ | 1980/3741 [11:38:53<10:08:18, 20.73s/it] +2025-05-11 05:04:47 - ERROR - stderr - +2025-05-11 05:04:47 - ERROR - stderr - +2025-05-11 05:04:47 - INFO - stdout - {'loss': 0.7311, 'grad_norm': 0.6470924019813538, 'learning_rate': 9.541218396178494e-06, 'epoch': 1.59} +2025-05-11 05:04:47 - ERROR - stderr - 53%|█████▎ | 1980/3741 [11:38:53<10:08:18, 20.73s/it] +2025-05-11 05:05:07 - ERROR - stderr - 53%|█████▎ | 1981/3741 [11:39:13<10:05:52, 20.66s/it] +2025-05-11 05:05:07 - ERROR - stderr - +2025-05-11 05:05:07 - ERROR - stderr - +2025-05-11 05:05:07 - INFO - stdout - {'loss': 0.8128, 'grad_norm': 0.7354634404182434, 'learning_rate': 9.532568390953886e-06, 'epoch': 1.59} +2025-05-11 05:05:07 - ERROR - stderr - 53%|█████▎ | 1981/3741 [11:39:13<10:05:52, 20.66s/it] +2025-05-11 05:05:27 - ERROR - stderr - 53%|█████▎ | 1982/3741 [11:39:33<9:56:48, 20.36s/it] +2025-05-11 05:05:27 - ERROR - stderr - +2025-05-11 05:05:27 - ERROR - stderr - +2025-05-11 05:05:27 - INFO - stdout - {'loss': 0.7422, 'grad_norm': 0.6740539073944092, 'learning_rate': 9.52391873622542e-06, 'epoch': 1.59} +2025-05-11 05:05:27 - ERROR - stderr - 53%|█████▎ | 1982/3741 [11:39:33<9:56:48, 20.36s/it] +2025-05-11 05:05:47 - ERROR - stderr - 53%|█████▎ | 1983/3741 [11:39:53<9:55:06, 20.31s/it] +2025-05-11 05:05:47 - ERROR - stderr - +2025-05-11 05:05:47 - ERROR - stderr - +2025-05-11 05:05:47 - INFO - stdout - {'loss': 0.7493, 'grad_norm': 0.6699314117431641, 'learning_rate': 9.515269438478898e-06, 'epoch': 1.59} +2025-05-11 05:05:47 - ERROR - stderr - 53%|█████▎ | 1983/3741 [11:39:53<9:55:06, 20.31s/it] +2025-05-11 05:06:06 - ERROR - stderr - 53%|█████▎ | 1984/3741 [11:40:13<9:47:28, 20.06s/it] +2025-05-11 05:06:06 - ERROR - stderr - +2025-05-11 05:06:06 - ERROR - stderr - +2025-05-11 05:06:06 - INFO - stdout - {'loss': 0.749, 'grad_norm': 0.6977644562721252, 'learning_rate': 9.506620504199854e-06, 'epoch': 1.59} +2025-05-11 05:06:06 - ERROR - stderr - 53%|█████▎ | 1984/3741 [11:40:13<9:47:28, 20.06s/it] +2025-05-11 05:06:26 - ERROR - stderr - 53%|█████▎ | 1985/3741 [11:40:32<9:44:21, 19.97s/it] +2025-05-11 05:06:26 - ERROR - stderr - +2025-05-11 05:06:26 - ERROR - stderr - +2025-05-11 05:06:26 - INFO - stdout - {'loss': 0.7679, 'grad_norm': 0.6674862504005432, 'learning_rate': 9.497971939873567e-06, 'epoch': 1.59} +2025-05-11 05:06:26 - ERROR - stderr - 53%|█████▎ | 1985/3741 [11:40:32<9:44:21, 19.97s/it] +2025-05-11 05:06:47 - ERROR - stderr - 53%|█████▎ | 1986/3741 [11:40:53<9:51:10, 20.21s/it] +2025-05-11 05:06:47 - ERROR - stderr - +2025-05-11 05:06:47 - ERROR - stderr - +2025-05-11 05:06:47 - INFO - stdout - {'loss': 0.7485, 'grad_norm': 0.6880958676338196, 'learning_rate': 9.489323751985009e-06, 'epoch': 1.59} +2025-05-11 05:06:47 - ERROR - stderr - 53%|█████▎ | 1986/3741 [11:40:53<9:51:10, 20.21s/it] +2025-05-11 05:07:06 - ERROR - stderr - 53%|█████▎ | 1987/3741 [11:41:13<9:45:30, 20.03s/it] +2025-05-11 05:07:06 - ERROR - stderr - +2025-05-11 05:07:06 - ERROR - stderr - +2025-05-11 05:07:06 - INFO - stdout - {'loss': 0.7573, 'grad_norm': 0.6663671731948853, 'learning_rate': 9.480675947018899e-06, 'epoch': 1.59} +2025-05-11 05:07:06 - ERROR - stderr - 53%|█████▎ | 1987/3741 [11:41:13<9:45:30, 20.03s/it] +2025-05-11 05:07:28 - ERROR - stderr - 53%|█████▎ | 1988/3741 [11:41:35<10:02:39, 20.63s/it] +2025-05-11 05:07:28 - ERROR - stderr - +2025-05-11 05:07:28 - ERROR - stderr - +2025-05-11 05:07:29 - INFO - stdout - {'loss': 0.7605, 'grad_norm': 0.706123948097229, 'learning_rate': 9.472028531459649e-06, 'epoch': 1.59} +2025-05-11 05:07:29 - ERROR - stderr - 53%|█████▎ | 1988/3741 [11:41:35<10:02:39, 20.63s/it] +2025-05-11 05:07:48 - ERROR - stderr - 53%|█████▎ | 1989/3741 [11:41:55<9:54:59, 20.38s/it] +2025-05-11 05:07:48 - ERROR - stderr - +2025-05-11 05:07:48 - ERROR - stderr - +2025-05-11 05:07:48 - INFO - stdout - {'loss': 0.7809, 'grad_norm': 0.7029390931129456, 'learning_rate': 9.463381511791386e-06, 'epoch': 1.6} +2025-05-11 05:07:48 - ERROR - stderr - 53%|█████▎ | 1989/3741 [11:41:55<9:54:59, 20.38s/it] +2025-05-11 05:08:11 - ERROR - stderr - 53%|█████▎ | 1990/3741 [11:42:17<10:14:28, 21.06s/it] +2025-05-11 05:08:11 - ERROR - stderr - +2025-05-11 05:08:11 - ERROR - stderr - +2025-05-11 05:08:11 - INFO - stdout - {'loss': 0.7103, 'grad_norm': 0.633929431438446, 'learning_rate': 9.454734894497942e-06, 'epoch': 1.6} +2025-05-11 05:08:11 - ERROR - stderr - 53%|█████▎ | 1990/3741 [11:42:17<10:14:28, 21.06s/it] +2025-05-11 05:08:31 - ERROR - stderr - 53%|█████▎ | 1991/3741 [11:42:37<10:04:43, 20.73s/it] +2025-05-11 05:08:31 - ERROR - stderr - +2025-05-11 05:08:31 - ERROR - stderr - +2025-05-11 05:08:31 - INFO - stdout - {'loss': 0.7599, 'grad_norm': 0.6639130115509033, 'learning_rate': 9.446088686062838e-06, 'epoch': 1.6} +2025-05-11 05:08:31 - ERROR - stderr - 53%|█████▎ | 1991/3741 [11:42:37<10:04:43, 20.73s/it] +2025-05-11 05:08:54 - ERROR - stderr - 53%|█████▎ | 1992/3741 [11:43:00<10:21:05, 21.31s/it] +2025-05-11 05:08:54 - ERROR - stderr - +2025-05-11 05:08:54 - ERROR - stderr - +2025-05-11 05:08:54 - INFO - stdout - {'loss': 0.7605, 'grad_norm': 0.6766201853752136, 'learning_rate': 9.437442892969308e-06, 'epoch': 1.6} +2025-05-11 05:08:54 - ERROR - stderr - 53%|█████▎ | 1992/3741 [11:43:00<10:21:05, 21.31s/it] +2025-05-11 05:09:13 - ERROR - stderr - 53%|█████▎ | 1993/3741 [11:43:20<10:07:35, 20.86s/it] +2025-05-11 05:09:13 - ERROR - stderr - +2025-05-11 05:09:13 - ERROR - stderr - +2025-05-11 05:09:13 - INFO - stdout - {'loss': 0.7316, 'grad_norm': 0.6676896810531616, 'learning_rate': 9.428797521700254e-06, 'epoch': 1.6} +2025-05-11 05:09:13 - ERROR - stderr - 53%|█████▎ | 1993/3741 [11:43:20<10:07:35, 20.86s/it] +2025-05-11 05:09:37 - ERROR - stderr - 53%|█████▎ | 1994/3741 [11:43:43<10:29:16, 21.61s/it] +2025-05-11 05:09:37 - ERROR - stderr - +2025-05-11 05:09:37 - ERROR - stderr - +2025-05-11 05:09:37 - INFO - stdout - {'loss': 0.7832, 'grad_norm': 0.7439046502113342, 'learning_rate': 9.420152578738269e-06, 'epoch': 1.6} +2025-05-11 05:09:37 - ERROR - stderr - 53%|█████▎ | 1994/3741 [11:43:43<10:29:16, 21.61s/it] +2025-05-11 05:09:56 - ERROR - stderr - 53%|█████▎ | 1995/3741 [11:44:03<10:10:12, 20.97s/it] +2025-05-11 05:09:56 - ERROR - stderr - +2025-05-11 05:09:56 - ERROR - stderr - +2025-05-11 05:09:56 - INFO - stdout - {'loss': 0.7817, 'grad_norm': 0.8324495553970337, 'learning_rate': 9.41150807056563e-06, 'epoch': 1.6} +2025-05-11 05:09:56 - ERROR - stderr - 53%|█████▎ | 1995/3741 [11:44:03<10:10:12, 20.97s/it] +2025-05-11 05:10:20 - ERROR - stderr - 53%|█████▎ | 1996/3741 [11:44:27<10:37:34, 21.92s/it] +2025-05-11 05:10:20 - ERROR - stderr - +2025-05-11 05:10:20 - ERROR - stderr - +2025-05-11 05:10:20 - INFO - stdout - {'loss': 0.7429, 'grad_norm': 0.6500052213668823, 'learning_rate': 9.402864003664279e-06, 'epoch': 1.6} +2025-05-11 05:10:20 - ERROR - stderr - 53%|█████▎ | 1996/3741 [11:44:27<10:37:34, 21.92s/it] +2025-05-11 05:10:40 - ERROR - stderr - 53%|█████▎ | 1997/3741 [11:44:46<10:18:24, 21.28s/it] +2025-05-11 05:10:40 - ERROR - stderr - +2025-05-11 05:10:40 - ERROR - stderr - +2025-05-11 05:10:40 - INFO - stdout - {'loss': 0.7663, 'grad_norm': 0.6784413456916809, 'learning_rate': 9.394220384515836e-06, 'epoch': 1.6} +2025-05-11 05:10:40 - ERROR - stderr - 53%|█████▎ | 1997/3741 [11:44:46<10:18:24, 21.28s/it] +2025-05-11 05:11:00 - ERROR - stderr - 53%|█████▎ | 1998/3741 [11:45:06<10:02:09, 20.73s/it] +2025-05-11 05:11:00 - ERROR - stderr - +2025-05-11 05:11:00 - ERROR - stderr - +2025-05-11 05:11:00 - INFO - stdout - {'loss': 0.7391, 'grad_norm': 0.6711524724960327, 'learning_rate': 9.38557721960158e-06, 'epoch': 1.6} +2025-05-11 05:11:00 - ERROR - stderr - 53%|█████▎ | 1998/3741 [11:45:06<10:02:09, 20.73s/it] +2025-05-11 05:11:19 - ERROR - stderr - 53%|█████▎ | 1999/3741 [11:45:25<9:51:26, 20.37s/it] +2025-05-11 05:11:19 - ERROR - stderr - +2025-05-11 05:11:19 - ERROR - stderr - +2025-05-11 05:11:19 - INFO - stdout - {'loss': 0.7391, 'grad_norm': 0.6689935326576233, 'learning_rate': 9.37693451540245e-06, 'epoch': 1.6} +2025-05-11 05:11:19 - ERROR - stderr - 53%|█████▎ | 1999/3741 [11:45:25<9:51:26, 20.37s/it] +2025-05-11 05:11:39 - ERROR - stderr - 53%|█████▎ | 2000/3741 [11:45:45<9:48:37, 20.29s/it] +2025-05-11 05:11:39 - ERROR - stderr - +2025-05-11 05:11:39 - ERROR - stderr - +2025-05-11 05:11:39 - INFO - stdout - {'loss': 0.755, 'grad_norm': 0.7078515887260437, 'learning_rate': 9.368292278399038e-06, 'epoch': 1.6} +2025-05-11 05:11:39 - ERROR - stderr - 53%|█████▎ | 2000/3741 [11:45:46<9:48:37, 20.29s/it] +2025-05-11 05:12:01 - ERROR - stderr - 53%|█████▎ | 2001/3741 [11:46:07<10:03:04, 20.80s/it] +2025-05-11 05:12:01 - ERROR - stderr - +2025-05-11 05:12:01 - ERROR - stderr - +2025-05-11 05:12:01 - INFO - stdout - {'loss': 0.7511, 'grad_norm': 0.6978031396865845, 'learning_rate': 9.35965051507159e-06, 'epoch': 1.6} +2025-05-11 05:12:01 - ERROR - stderr - 53%|█████▎ | 2001/3741 [11:46:07<10:03:04, 20.80s/it] +2025-05-11 05:12:21 - ERROR - stderr - 54%|█████▎ | 2002/3741 [11:46:27<9:53:42, 20.48s/it] +2025-05-11 05:12:21 - ERROR - stderr - +2025-05-11 05:12:21 - ERROR - stderr - +2025-05-11 05:12:21 - INFO - stdout - {'loss': 0.7673, 'grad_norm': 0.6822302937507629, 'learning_rate': 9.351009231899995e-06, 'epoch': 1.61} +2025-05-11 05:12:21 - ERROR - stderr - 54%|█████▎ | 2002/3741 [11:46:27<9:53:42, 20.48s/it] +2025-05-11 05:12:43 - ERROR - stderr - 54%|█████▎ | 2003/3741 [11:46:49<10:07:49, 20.98s/it] +2025-05-11 05:12:43 - ERROR - stderr - +2025-05-11 05:12:43 - ERROR - stderr - +2025-05-11 05:12:43 - INFO - stdout - {'loss': 0.7611, 'grad_norm': 0.7071417570114136, 'learning_rate': 9.342368435363774e-06, 'epoch': 1.61} +2025-05-11 05:12:43 - ERROR - stderr - 54%|█████▎ | 2003/3741 [11:46:49<10:07:49, 20.98s/it] +2025-05-11 05:13:03 - ERROR - stderr - 54%|█████▎ | 2004/3741 [11:47:09<9:54:29, 20.54s/it] +2025-05-11 05:13:03 - ERROR - stderr - +2025-05-11 05:13:03 - ERROR - stderr - +2025-05-11 05:13:03 - INFO - stdout - {'loss': 0.7593, 'grad_norm': 0.7437505722045898, 'learning_rate': 9.333728131942104e-06, 'epoch': 1.61} +2025-05-11 05:13:03 - ERROR - stderr - 54%|█████▎ | 2004/3741 [11:47:09<9:54:29, 20.54s/it] +2025-05-11 05:13:25 - ERROR - stderr - 54%|█████▎ | 2005/3741 [11:47:32<10:12:46, 21.18s/it] +2025-05-11 05:13:25 - ERROR - stderr - +2025-05-11 05:13:25 - ERROR - stderr - +2025-05-11 05:13:25 - INFO - stdout - {'loss': 0.743, 'grad_norm': 0.6476147770881653, 'learning_rate': 9.325088328113769e-06, 'epoch': 1.61} +2025-05-11 05:13:25 - ERROR - stderr - 54%|█████▎ | 2005/3741 [11:47:32<10:12:46, 21.18s/it] +2025-05-11 05:13:45 - ERROR - stderr - 54%|█████▎ | 2006/3741 [11:47:51<9:59:09, 20.72s/it] +2025-05-11 05:13:45 - ERROR - stderr - +2025-05-11 05:13:45 - ERROR - stderr - +2025-05-11 05:13:45 - INFO - stdout - {'loss': 0.7613, 'grad_norm': 0.6686375737190247, 'learning_rate': 9.316449030357188e-06, 'epoch': 1.61} +2025-05-11 05:13:45 - ERROR - stderr - 54%|█████▎ | 2006/3741 [11:47:51<9:59:09, 20.72s/it] +2025-05-11 05:14:07 - ERROR - stderr - 54%|█████▎ | 2007/3741 [11:48:14<10:12:31, 21.19s/it] +2025-05-11 05:14:07 - ERROR - stderr - +2025-05-11 05:14:07 - ERROR - stderr - +2025-05-11 05:14:07 - INFO - stdout - {'loss': 0.7171, 'grad_norm': 0.6458247303962708, 'learning_rate': 9.307810245150408e-06, 'epoch': 1.61} +2025-05-11 05:14:07 - ERROR - stderr - 54%|█████▎ | 2007/3741 [11:48:14<10:12:31, 21.19s/it] +2025-05-11 05:14:27 - ERROR - stderr - 54%|█████▎ | 2008/3741 [11:48:33<9:59:18, 20.75s/it] +2025-05-11 05:14:27 - ERROR - stderr - +2025-05-11 05:14:27 - ERROR - stderr - +2025-05-11 05:14:27 - INFO - stdout - {'loss': 0.7636, 'grad_norm': 1.242529273033142, 'learning_rate': 9.299171978971073e-06, 'epoch': 1.61} +2025-05-11 05:14:27 - ERROR - stderr - 54%|█████▎ | 2008/3741 [11:48:33<9:59:18, 20.75s/it] +2025-05-11 05:14:50 - ERROR - stderr - 54%|█████▎ | 2009/3741 [11:48:57<10:21:43, 21.54s/it] +2025-05-11 05:14:50 - ERROR - stderr - +2025-05-11 05:14:50 - ERROR - stderr - +2025-05-11 05:14:50 - INFO - stdout - {'loss': 0.7621, 'grad_norm': 0.7031643390655518, 'learning_rate': 9.290534238296462e-06, 'epoch': 1.61} +2025-05-11 05:14:50 - ERROR - stderr - 54%|█████▎ | 2009/3741 [11:48:57<10:21:43, 21.54s/it] +2025-05-11 05:15:10 - ERROR - stderr - 54%|█████▎ | 2010/3741 [11:49:17<10:07:34, 21.06s/it] +2025-05-11 05:15:10 - ERROR - stderr - +2025-05-11 05:15:10 - ERROR - stderr - +2025-05-11 05:15:10 - INFO - stdout - {'loss': 0.6897, 'grad_norm': 0.6745364665985107, 'learning_rate': 9.281897029603439e-06, 'epoch': 1.61} +2025-05-11 05:15:10 - ERROR - stderr - 54%|█████▎ | 2010/3741 [11:49:17<10:07:34, 21.06s/it] +2025-05-11 05:15:34 - ERROR - stderr - 54%|█████▍ | 2011/3741 [11:49:40<10:26:32, 21.73s/it] +2025-05-11 05:15:34 - ERROR - stderr - +2025-05-11 05:15:34 - ERROR - stderr - +2025-05-11 05:15:34 - INFO - stdout - {'loss': 0.7431, 'grad_norm': 0.8014926314353943, 'learning_rate': 9.273260359368478e-06, 'epoch': 1.61} +2025-05-11 05:15:34 - ERROR - stderr - 54%|█████▍ | 2011/3741 [11:49:40<10:26:32, 21.73s/it] +2025-05-11 05:15:53 - ERROR - stderr - 54%|█████▍ | 2012/3741 [11:49:59<10:07:33, 21.08s/it] +2025-05-11 05:15:53 - ERROR - stderr - +2025-05-11 05:15:53 - ERROR - stderr - +2025-05-11 05:15:53 - INFO - stdout - {'loss': 0.7679, 'grad_norm': 0.6873915791511536, 'learning_rate': 9.264624234067651e-06, 'epoch': 1.61} +2025-05-11 05:15:53 - ERROR - stderr - 54%|█████▍ | 2012/3741 [11:49:59<10:07:33, 21.08s/it] +2025-05-11 05:16:13 - ERROR - stderr - 54%|█████▍ | 2013/3741 [11:50:19<9:54:28, 20.64s/it] +2025-05-11 05:16:13 - ERROR - stderr - +2025-05-11 05:16:13 - ERROR - stderr - +2025-05-11 05:16:13 - INFO - stdout - {'loss': 0.7457, 'grad_norm': 0.6493097543716431, 'learning_rate': 9.255988660176613e-06, 'epoch': 1.61} +2025-05-11 05:16:13 - ERROR - stderr - 54%|█████▍ | 2013/3741 [11:50:19<9:54:28, 20.64s/it] +2025-05-11 05:16:32 - ERROR - stderr - 54%|█████▍ | 2014/3741 [11:50:38<9:43:49, 20.28s/it] +2025-05-11 05:16:32 - ERROR - stderr - +2025-05-11 05:16:32 - ERROR - stderr - +2025-05-11 05:16:32 - INFO - stdout - {'loss': 0.7683, 'grad_norm': 0.7045702934265137, 'learning_rate': 9.247353644170622e-06, 'epoch': 1.62} +2025-05-11 05:16:32 - ERROR - stderr - 54%|█████▍ | 2014/3741 [11:50:38<9:43:49, 20.28s/it] +2025-05-11 05:16:52 - ERROR - stderr - 54%|█████▍ | 2015/3741 [11:50:58<9:39:28, 20.14s/it] +2025-05-11 05:16:52 - ERROR - stderr - +2025-05-11 05:16:52 - ERROR - stderr - +2025-05-11 05:16:52 - INFO - stdout - {'loss': 0.7129, 'grad_norm': 0.6594098210334778, 'learning_rate': 9.238719192524501e-06, 'epoch': 1.62} +2025-05-11 05:16:52 - ERROR - stderr - 54%|█████▍ | 2015/3741 [11:50:58<9:39:28, 20.14s/it] +2025-05-11 05:17:15 - ERROR - stderr - 54%|█████▍ | 2016/3741 [11:51:21<10:00:10, 20.88s/it] +2025-05-11 05:17:15 - ERROR - stderr - +2025-05-11 05:17:15 - ERROR - stderr - +2025-05-11 05:17:15 - INFO - stdout - {'loss': 0.7376, 'grad_norm': 0.7022315859794617, 'learning_rate': 9.23008531171265e-06, 'epoch': 1.62} +2025-05-11 05:17:15 - ERROR - stderr - 54%|█████▍ | 2016/3741 [11:51:21<10:00:10, 20.88s/it] +2025-05-11 05:17:34 - ERROR - stderr - 54%|█████▍ | 2017/3741 [11:51:40<9:48:29, 20.48s/it] +2025-05-11 05:17:34 - ERROR - stderr - +2025-05-11 05:17:34 - ERROR - stderr - +2025-05-11 05:17:34 - INFO - stdout - {'loss': 0.7507, 'grad_norm': 0.6779872179031372, 'learning_rate': 9.221452008209057e-06, 'epoch': 1.62} +2025-05-11 05:17:34 - ERROR - stderr - 54%|█████▍ | 2017/3741 [11:51:40<9:48:29, 20.48s/it] +2025-05-11 05:17:56 - ERROR - stderr - 54%|█████▍ | 2018/3741 [11:52:03<10:03:24, 21.01s/it] +2025-05-11 05:17:56 - ERROR - stderr - +2025-05-11 05:17:56 - ERROR - stderr - +2025-05-11 05:17:56 - INFO - stdout - {'loss': 0.7652, 'grad_norm': 0.7060291767120361, 'learning_rate': 9.21281928848726e-06, 'epoch': 1.62} +2025-05-11 05:17:56 - ERROR - stderr - 54%|█████▍ | 2018/3741 [11:52:03<10:03:24, 21.01s/it] +2025-05-11 05:18:16 - ERROR - stderr - 54%|█████▍ | 2019/3741 [11:52:22<9:50:27, 20.57s/it] +2025-05-11 05:18:16 - ERROR - stderr - +2025-05-11 05:18:16 - ERROR - stderr - +2025-05-11 05:18:16 - INFO - stdout - {'loss': 0.7142, 'grad_norm': 0.6302214860916138, 'learning_rate': 9.204187159020372e-06, 'epoch': 1.62} +2025-05-11 05:18:16 - ERROR - stderr - 54%|█████▍ | 2019/3741 [11:52:22<9:50:27, 20.57s/it] +2025-05-11 05:18:39 - ERROR - stderr - 54%|█████▍ | 2020/3741 [11:52:46<10:13:36, 21.39s/it] +2025-05-11 05:18:39 - ERROR - stderr - +2025-05-11 05:18:39 - ERROR - stderr - +2025-05-11 05:18:39 - INFO - stdout - {'loss': 0.7594, 'grad_norm': 0.7084274291992188, 'learning_rate': 9.195555626281053e-06, 'epoch': 1.62} +2025-05-11 05:18:39 - ERROR - stderr - 54%|█████▍ | 2020/3741 [11:52:46<10:13:36, 21.39s/it] +2025-05-11 05:18:59 - ERROR - stderr - 54%|█████▍ | 2021/3741 [11:53:05<9:56:21, 20.80s/it] +2025-05-11 05:18:59 - ERROR - stderr - +2025-05-11 05:18:59 - ERROR - stderr - +2025-05-11 05:18:59 - INFO - stdout - {'loss': 0.7467, 'grad_norm': 0.678536593914032, 'learning_rate': 9.186924696741519e-06, 'epoch': 1.62} +2025-05-11 05:18:59 - ERROR - stderr - 54%|█████▍ | 2021/3741 [11:53:05<9:56:21, 20.80s/it] +2025-05-11 05:19:22 - ERROR - stderr - 54%|█████▍ | 2022/3741 [11:53:28<10:17:56, 21.57s/it] +2025-05-11 05:19:22 - ERROR - stderr - +2025-05-11 05:19:22 - ERROR - stderr - +2025-05-11 05:19:22 - INFO - stdout - {'loss': 0.7463, 'grad_norm': 0.670260488986969, 'learning_rate': 9.17829437687354e-06, 'epoch': 1.62} +2025-05-11 05:19:22 - ERROR - stderr - 54%|█████▍ | 2022/3741 [11:53:28<10:17:56, 21.57s/it] +2025-05-11 05:19:42 - ERROR - stderr - 54%|█████▍ | 2023/3741 [11:53:48<10:01:34, 21.01s/it] +2025-05-11 05:19:42 - ERROR - stderr - +2025-05-11 05:19:42 - ERROR - stderr - +2025-05-11 05:19:42 - INFO - stdout - {'loss': 0.7582, 'grad_norm': 0.702285885810852, 'learning_rate': 9.169664673148421e-06, 'epoch': 1.62} +2025-05-11 05:19:42 - ERROR - stderr - 54%|█████▍ | 2023/3741 [11:53:48<10:01:34, 21.01s/it] +2025-05-11 05:20:05 - ERROR - stderr - 54%|█████▍ | 2024/3741 [11:54:11<10:22:10, 21.74s/it] +2025-05-11 05:20:05 - ERROR - stderr - +2025-05-11 05:20:05 - ERROR - stderr - +2025-05-11 05:20:05 - INFO - stdout - {'loss': 0.706, 'grad_norm': 0.7096335291862488, 'learning_rate': 9.16103559203701e-06, 'epoch': 1.62} +2025-05-11 05:20:05 - ERROR - stderr - 54%|█████▍ | 2024/3741 [11:54:11<10:22:10, 21.74s/it] +2025-05-11 05:20:25 - ERROR - stderr - 54%|█████▍ | 2025/3741 [11:54:31<10:03:19, 21.10s/it] +2025-05-11 05:20:25 - ERROR - stderr - +2025-05-11 05:20:25 - ERROR - stderr - +2025-05-11 05:20:25 - INFO - stdout - {'loss': 0.721, 'grad_norm': 0.6567702293395996, 'learning_rate': 9.152407140009684e-06, 'epoch': 1.62} +2025-05-11 05:20:25 - ERROR - stderr - 54%|█████▍ | 2025/3741 [11:54:31<10:03:19, 21.10s/it] +2025-05-11 05:20:45 - ERROR - stderr - 54%|█████▍ | 2026/3741 [11:54:51<9:53:57, 20.78s/it] +2025-05-11 05:20:45 - ERROR - stderr - +2025-05-11 05:20:45 - ERROR - stderr - +2025-05-11 05:20:45 - INFO - stdout - {'loss': 0.7575, 'grad_norm': 0.6644206047058105, 'learning_rate': 9.143779323536346e-06, 'epoch': 1.62} +2025-05-11 05:20:45 - ERROR - stderr - 54%|█████▍ | 2026/3741 [11:54:51<9:53:57, 20.78s/it] +2025-05-11 05:21:04 - ERROR - stderr - 54%|█████▍ | 2027/3741 [11:55:11<9:42:21, 20.39s/it] +2025-05-11 05:21:04 - ERROR - stderr - +2025-05-11 05:21:04 - ERROR - stderr - +2025-05-11 05:21:04 - INFO - stdout - {'loss': 0.7174, 'grad_norm': 0.6624019145965576, 'learning_rate': 9.135152149086436e-06, 'epoch': 1.63} +2025-05-11 05:21:04 - ERROR - stderr - 54%|█████▍ | 2027/3741 [11:55:11<9:42:21, 20.39s/it] +2025-05-11 05:21:24 - ERROR - stderr - 54%|█████▍ | 2028/3741 [11:55:30<9:34:56, 20.14s/it] +2025-05-11 05:21:24 - ERROR - stderr - +2025-05-11 05:21:24 - ERROR - stderr - +2025-05-11 05:21:24 - INFO - stdout - {'loss': 0.7682, 'grad_norm': 0.723945677280426, 'learning_rate': 9.126525623128896e-06, 'epoch': 1.63} +2025-05-11 05:21:24 - ERROR - stderr - 54%|█████▍ | 2028/3741 [11:55:30<9:34:56, 20.14s/it] +2025-05-11 05:21:43 - ERROR - stderr - 54%|█████▍ | 2029/3741 [11:55:50<9:30:26, 19.99s/it] +2025-05-11 05:21:43 - ERROR - stderr - +2025-05-11 05:21:43 - ERROR - stderr - +2025-05-11 05:21:43 - INFO - stdout - {'loss': 0.7677, 'grad_norm': 0.684164822101593, 'learning_rate': 9.117899752132193e-06, 'epoch': 1.63} +2025-05-11 05:21:43 - ERROR - stderr - 54%|█████▍ | 2029/3741 [11:55:50<9:30:26, 19.99s/it] +2025-05-11 05:21:44 - INFO - stdout - WARNING: tokenization mismatch: 1 vs. 3133. (ignored) +2025-05-11 05:22:03 - ERROR - stderr - 54%|█████▍ | 2030/3741 [11:56:09<9:27:04, 19.89s/it] +2025-05-11 05:22:03 - ERROR - stderr - +2025-05-11 05:22:03 - ERROR - stderr - +2025-05-11 05:22:03 - INFO - stdout - {'loss': 0.7646, 'grad_norm': 0.6822468042373657, 'learning_rate': 9.109274542564295e-06, 'epoch': 1.63} +2025-05-11 05:22:03 - ERROR - stderr - 54%|█████▍ | 2030/3741 [11:56:09<9:27:04, 19.89s/it] +2025-05-11 05:22:24 - ERROR - stderr - 54%|█████▍ | 2031/3741 [11:56:30<9:35:02, 20.18s/it] +2025-05-11 05:22:24 - ERROR - stderr - +2025-05-11 05:22:24 - ERROR - stderr - +2025-05-11 05:22:24 - INFO - stdout - {'loss': 0.7443, 'grad_norm': 0.707531750202179, 'learning_rate': 9.100650000892679e-06, 'epoch': 1.63} +2025-05-11 05:22:24 - ERROR - stderr - 54%|█████▍ | 2031/3741 [11:56:30<9:35:02, 20.18s/it] +2025-05-11 05:22:43 - ERROR - stderr - 54%|█████▍ | 2032/3741 [11:56:50<9:28:37, 19.96s/it] +2025-05-11 05:22:43 - ERROR - stderr - +2025-05-11 05:22:43 - ERROR - stderr - +2025-05-11 05:22:43 - INFO - stdout - {'loss': 0.7717, 'grad_norm': 0.6925025582313538, 'learning_rate': 9.092026133584322e-06, 'epoch': 1.63} +2025-05-11 05:22:43 - ERROR - stderr - 54%|█████▍ | 2032/3741 [11:56:50<9:28:37, 19.96s/it] +2025-05-11 05:23:05 - ERROR - stderr - 54%|█████▍ | 2033/3741 [11:57:11<9:38:44, 20.33s/it] +2025-05-11 05:23:05 - ERROR - stderr - +2025-05-11 05:23:05 - ERROR - stderr - +2025-05-11 05:23:05 - INFO - stdout - {'loss': 0.7586, 'grad_norm': 0.6611237525939941, 'learning_rate': 9.083402947105688e-06, 'epoch': 1.63} +2025-05-11 05:23:05 - ERROR - stderr - 54%|█████▍ | 2033/3741 [11:57:11<9:38:44, 20.33s/it] +2025-05-11 05:23:24 - ERROR - stderr - 54%|█████▍ | 2034/3741 [11:57:30<9:31:39, 20.09s/it] +2025-05-11 05:23:24 - ERROR - stderr - +2025-05-11 05:23:24 - ERROR - stderr - +2025-05-11 05:23:24 - INFO - stdout - {'loss': 0.7076, 'grad_norm': 0.6879470348358154, 'learning_rate': 9.074780447922746e-06, 'epoch': 1.63} +2025-05-11 05:23:24 - ERROR - stderr - 54%|█████▍ | 2034/3741 [11:57:30<9:31:39, 20.09s/it] +2025-05-11 05:23:46 - ERROR - stderr - 54%|█████▍ | 2035/3741 [11:57:52<9:44:06, 20.54s/it] +2025-05-11 05:23:46 - ERROR - stderr - +2025-05-11 05:23:46 - ERROR - stderr - +2025-05-11 05:23:46 - INFO - stdout - {'loss': 0.7587, 'grad_norm': 0.6737411618232727, 'learning_rate': 9.066158642500933e-06, 'epoch': 1.63} +2025-05-11 05:23:46 - ERROR - stderr - 54%|█████▍ | 2035/3741 [11:57:52<9:44:06, 20.54s/it] +2025-05-11 05:24:05 - ERROR - stderr - 54%|█████▍ | 2036/3741 [11:58:12<9:36:10, 20.28s/it] +2025-05-11 05:24:05 - ERROR - stderr - +2025-05-11 05:24:05 - ERROR - stderr - +2025-05-11 05:24:05 - INFO - stdout - {'loss': 0.7363, 'grad_norm': 0.6747552752494812, 'learning_rate': 9.05753753730517e-06, 'epoch': 1.63} +2025-05-11 05:24:05 - ERROR - stderr - 54%|█████▍ | 2036/3741 [11:58:12<9:36:10, 20.28s/it] +2025-05-11 05:24:27 - ERROR - stderr - 54%|█████▍ | 2037/3741 [11:58:33<9:48:09, 20.71s/it] +2025-05-11 05:24:27 - ERROR - stderr - +2025-05-11 05:24:27 - ERROR - stderr - +2025-05-11 05:24:27 - INFO - stdout - {'loss': 0.7896, 'grad_norm': 0.7112221717834473, 'learning_rate': 9.04891713879986e-06, 'epoch': 1.63} +2025-05-11 05:24:27 - ERROR - stderr - 54%|█████▍ | 2037/3741 [11:58:33<9:48:09, 20.71s/it] +2025-05-11 05:24:47 - ERROR - stderr - 54%|█████▍ | 2038/3741 [11:58:53<9:39:55, 20.43s/it] +2025-05-11 05:24:47 - ERROR - stderr - +2025-05-11 05:24:47 - ERROR - stderr - +2025-05-11 05:24:47 - INFO - stdout - {'loss': 0.7195, 'grad_norm': 0.6935960650444031, 'learning_rate': 9.040297453448867e-06, 'epoch': 1.63} +2025-05-11 05:24:47 - ERROR - stderr - 54%|█████▍ | 2038/3741 [11:58:53<9:39:55, 20.43s/it] +2025-05-11 05:25:09 - ERROR - stderr - 55%|█████▍ | 2039/3741 [11:59:16<9:57:39, 21.07s/it] +2025-05-11 05:25:09 - ERROR - stderr - +2025-05-11 05:25:09 - ERROR - stderr - +2025-05-11 05:25:09 - INFO - stdout - {'loss': 0.7473, 'grad_norm': 0.6707137227058411, 'learning_rate': 9.03167848771553e-06, 'epoch': 1.64} +2025-05-11 05:25:09 - ERROR - stderr - 55%|█████▍ | 2039/3741 [11:59:16<9:57:39, 21.07s/it] +2025-05-11 05:25:29 - ERROR - stderr - 55%|█████▍ | 2040/3741 [11:59:36<9:45:56, 20.67s/it] +2025-05-11 05:25:29 - ERROR - stderr - +2025-05-11 05:25:29 - ERROR - stderr - +2025-05-11 05:25:29 - INFO - stdout - {'loss': 0.7654, 'grad_norm': 0.6829401254653931, 'learning_rate': 9.023060248062642e-06, 'epoch': 1.64} +2025-05-11 05:25:29 - ERROR - stderr - 55%|█████▍ | 2040/3741 [11:59:36<9:45:56, 20.67s/it] +2025-05-11 05:25:53 - ERROR - stderr - 55%|█████▍ | 2041/3741 [11:59:59<10:08:49, 21.49s/it] +2025-05-11 05:25:53 - ERROR - stderr - +2025-05-11 05:25:53 - ERROR - stderr - +2025-05-11 05:25:53 - INFO - stdout - {'loss': 0.7386, 'grad_norm': 0.6423894166946411, 'learning_rate': 9.014442740952446e-06, 'epoch': 1.64} +2025-05-11 05:25:53 - ERROR - stderr - 55%|█████▍ | 2041/3741 [11:59:59<10:08:49, 21.49s/it] +2025-05-11 05:26:13 - ERROR - stderr - 55%|█████▍ | 2042/3741 [12:00:19<9:55:49, 21.04s/it] +2025-05-11 05:26:13 - ERROR - stderr - +2025-05-11 05:26:13 - ERROR - stderr - +2025-05-11 05:26:13 - INFO - stdout - {'loss': 0.7215, 'grad_norm': 0.678142786026001, 'learning_rate': 9.005825972846652e-06, 'epoch': 1.64} +2025-05-11 05:26:13 - ERROR - stderr - 55%|█████▍ | 2042/3741 [12:00:19<9:55:49, 21.04s/it] +2025-05-11 05:26:33 - ERROR - stderr - 55%|█████▍ | 2043/3741 [12:00:39<9:47:32, 20.76s/it] +2025-05-11 05:26:33 - ERROR - stderr - +2025-05-11 05:26:33 - ERROR - stderr - +2025-05-11 05:26:33 - INFO - stdout - {'loss': 0.7578, 'grad_norm': 0.6874783039093018, 'learning_rate': 8.997209950206396e-06, 'epoch': 1.64} +2025-05-11 05:26:33 - ERROR - stderr - 55%|█████▍ | 2043/3741 [12:00:39<9:47:32, 20.76s/it] +2025-05-11 05:26:53 - ERROR - stderr - 55%|█████▍ | 2044/3741 [12:01:00<9:45:45, 20.71s/it] +2025-05-11 05:26:53 - ERROR - stderr - +2025-05-11 05:26:53 - ERROR - stderr - +2025-05-11 05:26:53 - INFO - stdout - {'loss': 0.7656, 'grad_norm': 0.6972293257713318, 'learning_rate': 8.988594679492276e-06, 'epoch': 1.64} +2025-05-11 05:26:53 - ERROR - stderr - 55%|█████▍ | 2044/3741 [12:01:00<9:45:45, 20.71s/it] +2025-05-11 05:26:54 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5949 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 05:26:54 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5949 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 05:27:14 - ERROR - stderr - 55%|█████▍ | 2045/3741 [12:01:20<9:42:33, 20.61s/it] +2025-05-11 05:27:14 - ERROR - stderr - +2025-05-11 05:27:14 - ERROR - stderr - +2025-05-11 05:27:14 - INFO - stdout - {'loss': 0.7223, 'grad_norm': 0.6379215121269226, 'learning_rate': 8.979980167164311e-06, 'epoch': 1.64} +2025-05-11 05:27:14 - ERROR - stderr - 55%|█████▍ | 2045/3741 [12:01:20<9:42:33, 20.61s/it] +2025-05-11 05:27:38 - ERROR - stderr - 55%|█████▍ | 2046/3741 [12:01:45<10:15:33, 21.79s/it] +2025-05-11 05:27:38 - ERROR - stderr - +2025-05-11 05:27:38 - ERROR - stderr - +2025-05-11 05:27:38 - INFO - stdout - {'loss': 0.7007, 'grad_norm': 0.6454169750213623, 'learning_rate': 8.971366419681948e-06, 'epoch': 1.64} +2025-05-11 05:27:38 - ERROR - stderr - 55%|█████▍ | 2046/3741 [12:01:45<10:15:33, 21.79s/it] +2025-05-11 05:27:58 - ERROR - stderr - 55%|█████▍ | 2047/3741 [12:02:05<10:00:11, 21.26s/it] +2025-05-11 05:27:58 - ERROR - stderr - +2025-05-11 05:27:58 - ERROR - stderr - +2025-05-11 05:27:58 - INFO - stdout - {'loss': 0.7664, 'grad_norm': 0.6857996582984924, 'learning_rate': 8.96275344350408e-06, 'epoch': 1.64} +2025-05-11 05:27:58 - ERROR - stderr - 55%|█████▍ | 2047/3741 [12:02:05<10:00:11, 21.26s/it] +2025-05-11 05:28:18 - ERROR - stderr - 55%|█████▍ | 2048/3741 [12:02:25<9:49:22, 20.89s/it] +2025-05-11 05:28:18 - ERROR - stderr - +2025-05-11 05:28:18 - ERROR - stderr - +2025-05-11 05:28:18 - INFO - stdout - {'loss': 0.7698, 'grad_norm': 0.7171909809112549, 'learning_rate': 8.954141245089002e-06, 'epoch': 1.64} +2025-05-11 05:28:18 - ERROR - stderr - 55%|█████▍ | 2048/3741 [12:02:25<9:49:22, 20.89s/it] +2025-05-11 05:28:38 - ERROR - stderr - 55%|█████▍ | 2049/3741 [12:02:44<9:38:33, 20.52s/it] +2025-05-11 05:28:38 - ERROR - stderr - +2025-05-11 05:28:38 - ERROR - stderr - +2025-05-11 05:28:38 - INFO - stdout - {'loss': 0.7481, 'grad_norm': 0.6804106831550598, 'learning_rate': 8.945529830894439e-06, 'epoch': 1.64} +2025-05-11 05:28:38 - ERROR - stderr - 55%|█████▍ | 2049/3741 [12:02:44<9:38:33, 20.52s/it] +2025-05-11 05:29:00 - ERROR - stderr - 55%|█████▍ | 2050/3741 [12:03:06<9:49:03, 20.90s/it] +2025-05-11 05:29:00 - ERROR - stderr - +2025-05-11 05:29:00 - ERROR - stderr - +2025-05-11 05:29:00 - INFO - stdout - {'loss': 0.7417, 'grad_norm': 0.6667369604110718, 'learning_rate': 8.93691920737752e-06, 'epoch': 1.64} +2025-05-11 05:29:00 - ERROR - stderr - 55%|█████▍ | 2050/3741 [12:03:06<9:49:03, 20.90s/it] +2025-05-11 05:29:19 - ERROR - stderr - 55%|█████▍ | 2051/3741 [12:03:26<9:39:01, 20.56s/it] +2025-05-11 05:29:19 - ERROR - stderr - +2025-05-11 05:29:19 - ERROR - stderr - +2025-05-11 05:29:19 - INFO - stdout - {'loss': 0.6958, 'grad_norm': 0.6525418758392334, 'learning_rate': 8.92830938099478e-06, 'epoch': 1.64} +2025-05-11 05:29:19 - ERROR - stderr - 55%|█████▍ | 2051/3741 [12:03:26<9:39:01, 20.56s/it] +2025-05-11 05:29:41 - ERROR - stderr - 55%|█████▍ | 2052/3741 [12:03:48<9:50:18, 20.97s/it] +2025-05-11 05:29:41 - ERROR - stderr - +2025-05-11 05:29:41 - ERROR - stderr - +2025-05-11 05:29:41 - INFO - stdout - {'loss': 0.7469, 'grad_norm': 0.6953479647636414, 'learning_rate': 8.919700358202167e-06, 'epoch': 1.65} +2025-05-11 05:29:41 - ERROR - stderr - 55%|█████▍ | 2052/3741 [12:03:48<9:50:18, 20.97s/it] +2025-05-11 05:30:01 - ERROR - stderr - 55%|█████▍ | 2053/3741 [12:04:07<9:39:09, 20.59s/it] +2025-05-11 05:30:01 - ERROR - stderr - +2025-05-11 05:30:01 - ERROR - stderr - +2025-05-11 05:30:01 - INFO - stdout - {'loss': 0.7828, 'grad_norm': 0.715549886226654, 'learning_rate': 8.911092145455015e-06, 'epoch': 1.65} +2025-05-11 05:30:01 - ERROR - stderr - 55%|█████▍ | 2053/3741 [12:04:07<9:39:09, 20.59s/it] +2025-05-11 05:30:23 - ERROR - stderr - 55%|█████▍ | 2054/3741 [12:04:29<9:49:38, 20.97s/it] +2025-05-11 05:30:23 - ERROR - stderr - +2025-05-11 05:30:23 - ERROR - stderr - +2025-05-11 05:30:23 - INFO - stdout - {'loss': 0.737, 'grad_norm': 0.9194969534873962, 'learning_rate': 8.902484749208058e-06, 'epoch': 1.65} +2025-05-11 05:30:23 - ERROR - stderr - 55%|█████▍ | 2054/3741 [12:04:29<9:49:38, 20.97s/it] +2025-05-11 05:30:43 - ERROR - stderr - 55%|█████▍ | 2055/3741 [12:04:49<9:38:35, 20.59s/it] +2025-05-11 05:30:43 - ERROR - stderr - +2025-05-11 05:30:43 - ERROR - stderr - +2025-05-11 05:30:43 - INFO - stdout - {'loss': 0.718, 'grad_norm': 0.6881150603294373, 'learning_rate': 8.893878175915414e-06, 'epoch': 1.65} +2025-05-11 05:30:43 - ERROR - stderr - 55%|█████▍ | 2055/3741 [12:04:49<9:38:35, 20.59s/it] +2025-05-11 05:31:06 - ERROR - stderr - 55%|█████▍ | 2056/3741 [12:05:12<10:00:35, 21.39s/it] +2025-05-11 05:31:06 - ERROR - stderr - +2025-05-11 05:31:06 - ERROR - stderr - +2025-05-11 05:31:06 - INFO - stdout - {'loss': 0.7638, 'grad_norm': 0.716201663017273, 'learning_rate': 8.885272432030579e-06, 'epoch': 1.65} +2025-05-11 05:31:06 - ERROR - stderr - 55%|█████▍ | 2056/3741 [12:05:12<10:00:35, 21.39s/it] +2025-05-11 05:31:26 - ERROR - stderr - 55%|█████▍ | 2057/3741 [12:05:32<9:48:22, 20.96s/it] +2025-05-11 05:31:26 - ERROR - stderr - +2025-05-11 05:31:26 - ERROR - stderr - +2025-05-11 05:31:26 - INFO - stdout - {'loss': 0.7288, 'grad_norm': 0.6798361539840698, 'learning_rate': 8.876667524006442e-06, 'epoch': 1.65} +2025-05-11 05:31:26 - ERROR - stderr - 55%|█████▍ | 2057/3741 [12:05:32<9:48:22, 20.96s/it] +2025-05-11 05:31:48 - ERROR - stderr - 55%|█████▌ | 2058/3741 [12:05:54<9:59:10, 21.36s/it] +2025-05-11 05:31:48 - ERROR - stderr - +2025-05-11 05:31:48 - ERROR - stderr - +2025-05-11 05:31:48 - INFO - stdout - {'loss': 0.7251, 'grad_norm': 0.6651695370674133, 'learning_rate': 8.868063458295251e-06, 'epoch': 1.65} +2025-05-11 05:31:48 - ERROR - stderr - 55%|█████▌ | 2058/3741 [12:05:54<9:59:10, 21.36s/it] +2025-05-11 05:32:08 - ERROR - stderr - 55%|█████▌ | 2059/3741 [12:06:14<9:43:47, 20.83s/it] +2025-05-11 05:32:08 - ERROR - stderr - +2025-05-11 05:32:08 - ERROR - stderr - +2025-05-11 05:32:08 - INFO - stdout - {'loss': 0.7271, 'grad_norm': 0.6712408065795898, 'learning_rate': 8.85946024134863e-06, 'epoch': 1.65} +2025-05-11 05:32:08 - ERROR - stderr - 55%|█████▌ | 2059/3741 [12:06:14<9:43:47, 20.83s/it] +2025-05-11 05:32:27 - ERROR - stderr - 55%|█████▌ | 2060/3741 [12:06:34<9:33:42, 20.48s/it] +2025-05-11 05:32:27 - ERROR - stderr - +2025-05-11 05:32:27 - ERROR - stderr - +2025-05-11 05:32:27 - INFO - stdout - {'loss': 0.7224, 'grad_norm': 0.6494314670562744, 'learning_rate': 8.850857879617562e-06, 'epoch': 1.65} +2025-05-11 05:32:27 - ERROR - stderr - 55%|█████▌ | 2060/3741 [12:06:34<9:33:42, 20.48s/it] +2025-05-11 05:32:47 - ERROR - stderr - 55%|█████▌ | 2061/3741 [12:06:54<9:28:51, 20.32s/it] +2025-05-11 05:32:47 - ERROR - stderr - +2025-05-11 05:32:47 - ERROR - stderr - +2025-05-11 05:32:47 - INFO - stdout - {'loss': 0.7429, 'grad_norm': 0.6849057674407959, 'learning_rate': 8.84225637955239e-06, 'epoch': 1.65} +2025-05-11 05:32:47 - ERROR - stderr - 55%|█████▌ | 2061/3741 [12:06:54<9:28:51, 20.32s/it] +2025-05-11 05:33:07 - ERROR - stderr - 55%|█████▌ | 2062/3741 [12:07:13<9:23:55, 20.15s/it] +2025-05-11 05:33:07 - ERROR - stderr - +2025-05-11 05:33:07 - ERROR - stderr - +2025-05-11 05:33:07 - INFO - stdout - {'loss': 0.6723, 'grad_norm': 0.6645267009735107, 'learning_rate': 8.833655747602816e-06, 'epoch': 1.65} +2025-05-11 05:33:07 - ERROR - stderr - 55%|█████▌ | 2062/3741 [12:07:13<9:23:55, 20.15s/it] +2025-05-11 05:33:27 - ERROR - stderr - 55%|█████▌ | 2063/3741 [12:07:33<9:18:59, 19.99s/it] +2025-05-11 05:33:27 - ERROR - stderr - +2025-05-11 05:33:27 - ERROR - stderr - +2025-05-11 05:33:27 - INFO - stdout - {'loss': 0.7193, 'grad_norm': 0.6581180691719055, 'learning_rate': 8.825055990217877e-06, 'epoch': 1.65} +2025-05-11 05:33:27 - ERROR - stderr - 55%|█████▌ | 2063/3741 [12:07:33<9:18:59, 19.99s/it] +2025-05-11 05:33:46 - ERROR - stderr - 55%|█████▌ | 2064/3741 [12:07:52<9:14:05, 19.82s/it] +2025-05-11 05:33:46 - ERROR - stderr - +2025-05-11 05:33:46 - ERROR - stderr - +2025-05-11 05:33:46 - INFO - stdout - {'loss': 0.7498, 'grad_norm': 0.6753679513931274, 'learning_rate': 8.816457113845977e-06, 'epoch': 1.66} +2025-05-11 05:33:46 - ERROR - stderr - 55%|█████▌ | 2064/3741 [12:07:52<9:14:05, 19.82s/it] +2025-05-11 05:34:07 - ERROR - stderr - 55%|█████▌ | 2065/3741 [12:08:13<9:20:08, 20.05s/it] +2025-05-11 05:34:07 - ERROR - stderr - +2025-05-11 05:34:07 - ERROR - stderr - +2025-05-11 05:34:07 - INFO - stdout - {'loss': 0.755, 'grad_norm': 0.6934730410575867, 'learning_rate': 8.80785912493484e-06, 'epoch': 1.66} +2025-05-11 05:34:07 - ERROR - stderr - 55%|█████▌ | 2065/3741 [12:08:13<9:20:08, 20.05s/it] +2025-05-11 05:34:26 - ERROR - stderr - 55%|█████▌ | 2066/3741 [12:08:33<9:16:49, 19.95s/it] +2025-05-11 05:34:26 - ERROR - stderr - +2025-05-11 05:34:26 - ERROR - stderr - +2025-05-11 05:34:26 - INFO - stdout - {'loss': 0.7371, 'grad_norm': 0.6492776870727539, 'learning_rate': 8.799262029931527e-06, 'epoch': 1.66} +2025-05-11 05:34:26 - ERROR - stderr - 55%|█████▌ | 2066/3741 [12:08:33<9:16:49, 19.95s/it] +2025-05-11 05:34:48 - ERROR - stderr - 55%|█████▌ | 2067/3741 [12:08:54<9:30:47, 20.46s/it] +2025-05-11 05:34:48 - ERROR - stderr - +2025-05-11 05:34:48 - ERROR - stderr - +2025-05-11 05:34:48 - INFO - stdout - {'loss': 0.7319, 'grad_norm': 0.6823244690895081, 'learning_rate': 8.79066583528244e-06, 'epoch': 1.66} +2025-05-11 05:34:48 - ERROR - stderr - 55%|█████▌ | 2067/3741 [12:08:54<9:30:47, 20.46s/it] +2025-05-11 05:35:08 - ERROR - stderr - 55%|█████▌ | 2068/3741 [12:09:14<9:26:07, 20.30s/it] +2025-05-11 05:35:08 - ERROR - stderr - +2025-05-11 05:35:08 - ERROR - stderr - +2025-05-11 05:35:08 - INFO - stdout - {'loss': 0.6997, 'grad_norm': 0.6459192037582397, 'learning_rate': 8.78207054743329e-06, 'epoch': 1.66} +2025-05-11 05:35:08 - ERROR - stderr - 55%|█████▌ | 2068/3741 [12:09:14<9:26:07, 20.30s/it] +2025-05-11 05:35:31 - ERROR - stderr - 55%|█████▌ | 2069/3741 [12:09:37<9:45:23, 21.01s/it] +2025-05-11 05:35:31 - ERROR - stderr - +2025-05-11 05:35:31 - ERROR - stderr - +2025-05-11 05:35:31 - INFO - stdout - {'loss': 0.7578, 'grad_norm': 0.6940407156944275, 'learning_rate': 8.773476172829127e-06, 'epoch': 1.66} +2025-05-11 05:35:31 - ERROR - stderr - 55%|█████▌ | 2069/3741 [12:09:37<9:45:23, 21.01s/it] +2025-05-11 05:35:51 - ERROR - stderr - 55%|█████▌ | 2070/3741 [12:09:57<9:39:47, 20.82s/it] +2025-05-11 05:35:51 - ERROR - stderr - +2025-05-11 05:35:51 - ERROR - stderr - +2025-05-11 05:35:51 - INFO - stdout - {'loss': 0.7466, 'grad_norm': 0.6635679602622986, 'learning_rate': 8.7648827179143e-06, 'epoch': 1.66} +2025-05-11 05:35:51 - ERROR - stderr - 55%|█████▌ | 2070/3741 [12:09:57<9:39:47, 20.82s/it] +2025-05-11 05:36:14 - ERROR - stderr - 55%|█████▌ | 2071/3741 [12:10:20<9:57:04, 21.45s/it] +2025-05-11 05:36:14 - ERROR - stderr - +2025-05-11 05:36:14 - ERROR - stderr - +2025-05-11 05:36:14 - INFO - stdout - {'loss': 0.7564, 'grad_norm': 0.7041206359863281, 'learning_rate': 8.756290189132473e-06, 'epoch': 1.66} +2025-05-11 05:36:14 - ERROR - stderr - 55%|█████▌ | 2071/3741 [12:10:20<9:57:04, 21.45s/it] +2025-05-11 05:36:34 - ERROR - stderr - 55%|█████▌ | 2072/3741 [12:10:40<9:42:54, 20.96s/it] +2025-05-11 05:36:34 - ERROR - stderr - +2025-05-11 05:36:34 - ERROR - stderr - +2025-05-11 05:36:34 - INFO - stdout - {'loss': 0.723, 'grad_norm': 0.6900807619094849, 'learning_rate': 8.74769859292662e-06, 'epoch': 1.66} +2025-05-11 05:36:34 - ERROR - stderr - 55%|█████▌ | 2072/3741 [12:10:40<9:42:54, 20.96s/it] +2025-05-11 05:36:56 - ERROR - stderr - 55%|█████▌ | 2073/3741 [12:11:02<9:54:12, 21.37s/it] +2025-05-11 05:36:56 - ERROR - stderr - +2025-05-11 05:36:56 - ERROR - stderr - +2025-05-11 05:36:56 - INFO - stdout - {'loss': 0.7512, 'grad_norm': 0.7090346217155457, 'learning_rate': 8.739107935739004e-06, 'epoch': 1.66} +2025-05-11 05:36:56 - ERROR - stderr - 55%|█████▌ | 2073/3741 [12:11:02<9:54:12, 21.37s/it] +2025-05-11 05:37:16 - ERROR - stderr - 55%|█████▌ | 2074/3741 [12:11:22<9:38:11, 20.81s/it] +2025-05-11 05:37:16 - ERROR - stderr - +2025-05-11 05:37:16 - ERROR - stderr - +2025-05-11 05:37:16 - INFO - stdout - {'loss': 0.7404, 'grad_norm': 0.6753419041633606, 'learning_rate': 8.730518224011209e-06, 'epoch': 1.66} +2025-05-11 05:37:16 - ERROR - stderr - 55%|█████▌ | 2074/3741 [12:11:22<9:38:11, 20.81s/it] +2025-05-11 05:37:36 - ERROR - stderr - 55%|█████▌ | 2075/3741 [12:11:43<9:37:17, 20.79s/it] +2025-05-11 05:37:36 - ERROR - stderr - +2025-05-11 05:37:36 - ERROR - stderr - +2025-05-11 05:37:36 - INFO - stdout - {'loss': 0.7529, 'grad_norm': 0.6859150528907776, 'learning_rate': 8.721929464184079e-06, 'epoch': 1.66} +2025-05-11 05:37:36 - ERROR - stderr - 55%|█████▌ | 2075/3741 [12:11:43<9:37:17, 20.79s/it] +2025-05-11 05:37:56 - ERROR - stderr - 55%|█████▌ | 2076/3741 [12:12:02<9:28:04, 20.47s/it] +2025-05-11 05:37:56 - ERROR - stderr - +2025-05-11 05:37:56 - ERROR - stderr - +2025-05-11 05:37:56 - INFO - stdout - {'loss': 0.7569, 'grad_norm': 0.713150143623352, 'learning_rate': 8.71334166269776e-06, 'epoch': 1.66} +2025-05-11 05:37:56 - ERROR - stderr - 55%|█████▌ | 2076/3741 [12:12:02<9:28:04, 20.47s/it] +2025-05-11 05:38:16 - ERROR - stderr - 56%|█████▌ | 2077/3741 [12:12:22<9:20:57, 20.23s/it] +2025-05-11 05:38:16 - ERROR - stderr - +2025-05-11 05:38:16 - ERROR - stderr - +2025-05-11 05:38:16 - INFO - stdout - {'loss': 0.7733, 'grad_norm': 0.6997129321098328, 'learning_rate': 8.704754825991684e-06, 'epoch': 1.67} +2025-05-11 05:38:16 - ERROR - stderr - 56%|█████▌ | 2077/3741 [12:12:22<9:20:57, 20.23s/it] +2025-05-11 05:38:35 - ERROR - stderr - 56%|█████▌ | 2078/3741 [12:12:42<9:16:33, 20.08s/it] +2025-05-11 05:38:36 - ERROR - stderr - +2025-05-11 05:38:36 - ERROR - stderr - +2025-05-11 05:38:36 - INFO - stdout - {'loss': 0.7407, 'grad_norm': 0.6805722117424011, 'learning_rate': 8.69616896050455e-06, 'epoch': 1.67} +2025-05-11 05:38:36 - ERROR - stderr - 56%|█████▌ | 2078/3741 [12:12:42<9:16:33, 20.08s/it] +2025-05-11 05:38:55 - ERROR - stderr - 56%|█████▌ | 2079/3741 [12:13:01<9:10:54, 19.89s/it] +2025-05-11 05:38:55 - ERROR - stderr - +2025-05-11 05:38:55 - ERROR - stderr - +2025-05-11 05:38:55 - INFO - stdout - {'loss': 0.7519, 'grad_norm': 0.6935459971427917, 'learning_rate': 8.687584072674335e-06, 'epoch': 1.67} +2025-05-11 05:38:55 - ERROR - stderr - 56%|█████▌ | 2079/3741 [12:13:01<9:10:54, 19.89s/it] +2025-05-11 05:39:16 - ERROR - stderr - 56%|█████▌ | 2080/3741 [12:13:22<9:19:04, 20.20s/it] +2025-05-11 05:39:16 - ERROR - stderr - +2025-05-11 05:39:16 - ERROR - stderr - +2025-05-11 05:39:16 - INFO - stdout - {'loss': 0.7196, 'grad_norm': 0.6706266403198242, 'learning_rate': 8.679000168938278e-06, 'epoch': 1.67} +2025-05-11 05:39:16 - ERROR - stderr - 56%|█████▌ | 2080/3741 [12:13:22<9:19:04, 20.20s/it] +2025-05-11 05:39:36 - ERROR - stderr - 56%|█████▌ | 2081/3741 [12:13:42<9:15:42, 20.09s/it] +2025-05-11 05:39:36 - ERROR - stderr - +2025-05-11 05:39:36 - ERROR - stderr - +2025-05-11 05:39:36 - INFO - stdout - {'loss': 0.7471, 'grad_norm': 0.6920490264892578, 'learning_rate': 8.670417255732876e-06, 'epoch': 1.67} +2025-05-11 05:39:36 - ERROR - stderr - 56%|█████▌ | 2081/3741 [12:13:42<9:15:42, 20.09s/it] +2025-05-11 05:39:57 - ERROR - stderr - 56%|█████▌ | 2082/3741 [12:14:03<9:22:31, 20.34s/it] +2025-05-11 05:39:57 - ERROR - stderr - +2025-05-11 05:39:57 - ERROR - stderr - +2025-05-11 05:39:57 - INFO - stdout - {'loss': 0.734, 'grad_norm': 0.7178738117218018, 'learning_rate': 8.661835339493903e-06, 'epoch': 1.67} +2025-05-11 05:39:57 - ERROR - stderr - 56%|█████▌ | 2082/3741 [12:14:03<9:22:31, 20.34s/it] +2025-05-11 05:40:16 - ERROR - stderr - 56%|█████▌ | 2083/3741 [12:14:23<9:16:26, 20.14s/it] +2025-05-11 05:40:16 - ERROR - stderr - +2025-05-11 05:40:16 - ERROR - stderr - +2025-05-11 05:40:16 - INFO - stdout - {'loss': 0.7109, 'grad_norm': 0.6797714233398438, 'learning_rate': 8.653254426656364e-06, 'epoch': 1.67} +2025-05-11 05:40:16 - ERROR - stderr - 56%|█████▌ | 2083/3741 [12:14:23<9:16:26, 20.14s/it] +2025-05-11 05:40:39 - ERROR - stderr - 56%|█████▌ | 2084/3741 [12:14:45<9:35:08, 20.83s/it] +2025-05-11 05:40:39 - ERROR - stderr - +2025-05-11 05:40:39 - ERROR - stderr - +2025-05-11 05:40:39 - INFO - stdout - {'loss': 0.7198, 'grad_norm': 0.7086063027381897, 'learning_rate': 8.644674523654522e-06, 'epoch': 1.67} +2025-05-11 05:40:39 - ERROR - stderr - 56%|█████▌ | 2084/3741 [12:14:45<9:35:08, 20.83s/it] +2025-05-11 05:40:58 - ERROR - stderr - 56%|█████▌ | 2085/3741 [12:15:05<9:24:39, 20.46s/it] +2025-05-11 05:40:58 - ERROR - stderr - +2025-05-11 05:40:58 - ERROR - stderr - +2025-05-11 05:40:58 - INFO - stdout - {'loss': 0.7373, 'grad_norm': 0.660064160823822, 'learning_rate': 8.636095636921878e-06, 'epoch': 1.67} +2025-05-11 05:40:58 - ERROR - stderr - 56%|█████▌ | 2085/3741 [12:15:05<9:24:39, 20.46s/it] +2025-05-11 05:41:21 - ERROR - stderr - 56%|█████▌ | 2086/3741 [12:15:27<9:43:40, 21.16s/it] +2025-05-11 05:41:21 - ERROR - stderr - +2025-05-11 05:41:21 - ERROR - stderr - +2025-05-11 05:41:21 - INFO - stdout - {'loss': 0.7312, 'grad_norm': 0.6909743547439575, 'learning_rate': 8.627517772891172e-06, 'epoch': 1.67} +2025-05-11 05:41:21 - ERROR - stderr - 56%|█████▌ | 2086/3741 [12:15:27<9:43:40, 21.16s/it] +2025-05-11 05:41:41 - ERROR - stderr - 56%|█████▌ | 2087/3741 [12:15:47<9:28:52, 20.64s/it] +2025-05-11 05:41:41 - ERROR - stderr - +2025-05-11 05:41:41 - ERROR - stderr - +2025-05-11 05:41:41 - INFO - stdout - {'loss': 0.7629, 'grad_norm': 0.6800159811973572, 'learning_rate': 8.618940937994387e-06, 'epoch': 1.67} +2025-05-11 05:41:41 - ERROR - stderr - 56%|█████▌ | 2087/3741 [12:15:47<9:28:52, 20.64s/it] +2025-05-11 05:42:00 - ERROR - stderr - 56%|█████▌ | 2088/3741 [12:16:06<9:20:05, 20.33s/it] +2025-05-11 05:42:00 - ERROR - stderr - +2025-05-11 05:42:00 - ERROR - stderr - +2025-05-11 05:42:00 - INFO - stdout - {'loss': 0.7388, 'grad_norm': 0.7067490816116333, 'learning_rate': 8.610365138662716e-06, 'epoch': 1.67} +2025-05-11 05:42:00 - ERROR - stderr - 56%|█████▌ | 2088/3741 [12:16:06<9:20:05, 20.33s/it] +2025-05-11 05:42:20 - ERROR - stderr - 56%|█████▌ | 2089/3741 [12:16:26<9:14:09, 20.13s/it] +2025-05-11 05:42:20 - ERROR - stderr - +2025-05-11 05:42:20 - ERROR - stderr - +2025-05-11 05:42:20 - INFO - stdout - {'loss': 0.7714, 'grad_norm': 0.7322930097579956, 'learning_rate': 8.601790381326593e-06, 'epoch': 1.68} +2025-05-11 05:42:20 - ERROR - stderr - 56%|█████▌ | 2089/3741 [12:16:26<9:14:09, 20.13s/it] +2025-05-11 05:42:40 - ERROR - stderr - 56%|█████▌ | 2090/3741 [12:16:46<9:10:27, 20.00s/it] +2025-05-11 05:42:40 - ERROR - stderr - +2025-05-11 05:42:40 - ERROR - stderr - +2025-05-11 05:42:40 - INFO - stdout - {'loss': 0.7291, 'grad_norm': 0.6572024822235107, 'learning_rate': 8.59321667241566e-06, 'epoch': 1.68} +2025-05-11 05:42:40 - ERROR - stderr - 56%|█████▌ | 2090/3741 [12:16:46<9:10:27, 20.00s/it] +2025-05-11 05:42:59 - ERROR - stderr - 56%|█████▌ | 2091/3741 [12:17:05<9:07:07, 19.90s/it] +2025-05-11 05:42:59 - ERROR - stderr - +2025-05-11 05:42:59 - ERROR - stderr - +2025-05-11 05:42:59 - INFO - stdout - {'loss': 0.7107, 'grad_norm': 0.6840153932571411, 'learning_rate': 8.584644018358773e-06, 'epoch': 1.68} +2025-05-11 05:42:59 - ERROR - stderr - 56%|█████▌ | 2091/3741 [12:17:05<9:07:07, 19.90s/it] +2025-05-11 05:43:19 - ERROR - stderr - 56%|█████▌ | 2092/3741 [12:17:25<9:03:16, 19.77s/it] +2025-05-11 05:43:19 - ERROR - stderr - +2025-05-11 05:43:19 - ERROR - stderr - +2025-05-11 05:43:19 - INFO - stdout - {'loss': 0.7084, 'grad_norm': 0.653813898563385, 'learning_rate': 8.576072425584004e-06, 'epoch': 1.68} +2025-05-11 05:43:19 - ERROR - stderr - 56%|█████▌ | 2092/3741 [12:17:25<9:03:16, 19.77s/it] +2025-05-11 05:43:39 - ERROR - stderr - 56%|█████▌ | 2093/3741 [12:17:46<9:10:06, 20.03s/it] +2025-05-11 05:43:39 - ERROR - stderr - +2025-05-11 05:43:39 - ERROR - stderr - +2025-05-11 05:43:39 - INFO - stdout - {'loss': 0.728, 'grad_norm': 0.6750298142433167, 'learning_rate': 8.56750190051862e-06, 'epoch': 1.68} +2025-05-11 05:43:39 - ERROR - stderr - 56%|█████▌ | 2093/3741 [12:17:46<9:10:06, 20.03s/it] +2025-05-11 05:43:59 - ERROR - stderr - 56%|█████▌ | 2094/3741 [12:18:05<9:07:02, 19.93s/it] +2025-05-11 05:43:59 - ERROR - stderr - +2025-05-11 05:43:59 - ERROR - stderr - +2025-05-11 05:43:59 - INFO - stdout - {'loss': 0.7279, 'grad_norm': 0.6693862080574036, 'learning_rate': 8.558932449589103e-06, 'epoch': 1.68} +2025-05-11 05:43:59 - ERROR - stderr - 56%|█████▌ | 2094/3741 [12:18:05<9:07:02, 19.93s/it] +2025-05-11 05:44:21 - ERROR - stderr - 56%|█████▌ | 2095/3741 [12:18:28<9:25:50, 20.63s/it] +2025-05-11 05:44:21 - ERROR - stderr - +2025-05-11 05:44:21 - ERROR - stderr - +2025-05-11 05:44:21 - INFO - stdout - {'loss': 0.7475, 'grad_norm': 0.6705959439277649, 'learning_rate': 8.550364079221111e-06, 'epoch': 1.68} +2025-05-11 05:44:21 - ERROR - stderr - 56%|█████▌ | 2095/3741 [12:18:28<9:25:50, 20.63s/it] +2025-05-11 05:44:41 - ERROR - stderr - 56%|█████▌ | 2096/3741 [12:18:47<9:17:02, 20.32s/it] +2025-05-11 05:44:41 - ERROR - stderr - +2025-05-11 05:44:41 - ERROR - stderr - +2025-05-11 05:44:41 - INFO - stdout - {'loss': 0.7598, 'grad_norm': 0.7326763868331909, 'learning_rate': 8.541796795839498e-06, 'epoch': 1.68} +2025-05-11 05:44:41 - ERROR - stderr - 56%|█████▌ | 2096/3741 [12:18:47<9:17:02, 20.32s/it] +2025-05-11 05:45:03 - ERROR - stderr - 56%|█████▌ | 2097/3741 [12:19:09<9:31:22, 20.85s/it] +2025-05-11 05:45:03 - ERROR - stderr - +2025-05-11 05:45:03 - ERROR - stderr - +2025-05-11 05:45:03 - INFO - stdout - {'loss': 0.714, 'grad_norm': 0.6632883548736572, 'learning_rate': 8.533230605868314e-06, 'epoch': 1.68} +2025-05-11 05:45:03 - ERROR - stderr - 56%|█████▌ | 2097/3741 [12:19:09<9:31:22, 20.85s/it] +2025-05-11 05:45:23 - ERROR - stderr - 56%|█████▌ | 2098/3741 [12:19:29<9:21:57, 20.52s/it] +2025-05-11 05:45:23 - ERROR - stderr - +2025-05-11 05:45:23 - ERROR - stderr - +2025-05-11 05:45:23 - INFO - stdout - {'loss': 0.7276, 'grad_norm': 0.676483154296875, 'learning_rate': 8.524665515730766e-06, 'epoch': 1.68} +2025-05-11 05:45:23 - ERROR - stderr - 56%|█████▌ | 2098/3741 [12:19:29<9:21:57, 20.52s/it] +2025-05-11 05:45:45 - ERROR - stderr - 56%|█████▌ | 2099/3741 [12:19:52<9:40:25, 21.21s/it] +2025-05-11 05:45:45 - ERROR - stderr - +2025-05-11 05:45:45 - ERROR - stderr - +2025-05-11 05:45:45 - INFO - stdout - {'loss': 0.7608, 'grad_norm': 0.6715077757835388, 'learning_rate': 8.516101531849266e-06, 'epoch': 1.68} +2025-05-11 05:45:45 - ERROR - stderr - 56%|█████▌ | 2099/3741 [12:19:52<9:40:25, 21.21s/it] +2025-05-11 05:46:05 - ERROR - stderr - 56%|█████▌ | 2100/3741 [12:20:11<9:27:42, 20.76s/it] +2025-05-11 05:46:05 - ERROR - stderr - +2025-05-11 05:46:05 - ERROR - stderr - +2025-05-11 05:46:05 - INFO - stdout - {'loss': 0.725, 'grad_norm': 0.6537802219390869, 'learning_rate': 8.507538660645372e-06, 'epoch': 1.68} +2025-05-11 05:46:05 - ERROR - stderr - 56%|█████▌ | 2100/3741 [12:20:12<9:27:42, 20.76s/it] +2025-05-11 05:46:28 - ERROR - stderr - 56%|█████▌ | 2101/3741 [12:20:34<9:43:56, 21.36s/it] +2025-05-11 05:46:28 - ERROR - stderr - +2025-05-11 05:46:28 - ERROR - stderr - +2025-05-11 05:46:28 - INFO - stdout - {'loss': 0.7545, 'grad_norm': 0.6930273771286011, 'learning_rate': 8.498976908539817e-06, 'epoch': 1.68} +2025-05-11 05:46:28 - ERROR - stderr - 56%|█████▌ | 2101/3741 [12:20:34<9:43:56, 21.36s/it] +2025-05-11 05:46:48 - ERROR - stderr - 56%|█████▌ | 2102/3741 [12:20:54<9:30:03, 20.87s/it] +2025-05-11 05:46:48 - ERROR - stderr - +2025-05-11 05:46:48 - ERROR - stderr - +2025-05-11 05:46:48 - INFO - stdout - {'loss': 0.7618, 'grad_norm': 0.6789145469665527, 'learning_rate': 8.490416281952495e-06, 'epoch': 1.69} +2025-05-11 05:46:48 - ERROR - stderr - 56%|█████▌ | 2102/3741 [12:20:54<9:30:03, 20.87s/it] +2025-05-11 05:47:10 - ERROR - stderr - 56%|█████▌ | 2103/3741 [12:21:16<9:38:43, 21.20s/it] +2025-05-11 05:47:10 - ERROR - stderr - +2025-05-11 05:47:10 - ERROR - stderr - +2025-05-11 05:47:10 - INFO - stdout - {'loss': 0.7155, 'grad_norm': 0.6359390020370483, 'learning_rate': 8.481856787302454e-06, 'epoch': 1.69} +2025-05-11 05:47:10 - ERROR - stderr - 56%|█████▌ | 2103/3741 [12:21:16<9:38:43, 21.20s/it] +2025-05-11 05:47:30 - ERROR - stderr - 56%|█████▌ | 2104/3741 [12:21:36<9:31:06, 20.93s/it] +2025-05-11 05:47:30 - ERROR - stderr - +2025-05-11 05:47:30 - ERROR - stderr - +2025-05-11 05:47:30 - INFO - stdout - {'loss': 0.7385, 'grad_norm': 0.658112108707428, 'learning_rate': 8.473298431007901e-06, 'epoch': 1.69} +2025-05-11 05:47:30 - ERROR - stderr - 56%|█████▌ | 2104/3741 [12:21:36<9:31:06, 20.93s/it] +2025-05-11 05:47:53 - ERROR - stderr - 56%|█████▋ | 2105/3741 [12:21:59<9:46:41, 21.52s/it] +2025-05-11 05:47:53 - ERROR - stderr - +2025-05-11 05:47:53 - ERROR - stderr - +2025-05-11 05:47:53 - INFO - stdout - {'loss': 0.7365, 'grad_norm': 0.6744823455810547, 'learning_rate': 8.464741219486175e-06, 'epoch': 1.69} +2025-05-11 05:47:53 - ERROR - stderr - 56%|█████�� | 2105/3741 [12:21:59<9:46:41, 21.52s/it] +2025-05-11 05:48:13 - ERROR - stderr - 56%|█████▋ | 2106/3741 [12:22:19<9:32:45, 21.02s/it] +2025-05-11 05:48:13 - ERROR - stderr - +2025-05-11 05:48:13 - ERROR - stderr - +2025-05-11 05:48:13 - INFO - stdout - {'loss': 0.7542, 'grad_norm': 0.6585288643836975, 'learning_rate': 8.456185159153765e-06, 'epoch': 1.69} +2025-05-11 05:48:13 - ERROR - stderr - 56%|█████▋ | 2106/3741 [12:22:19<9:32:45, 21.02s/it] +2025-05-11 05:48:35 - ERROR - stderr - 56%|█████▋ | 2107/3741 [12:22:42<9:44:54, 21.48s/it] +2025-05-11 05:48:35 - ERROR - stderr - +2025-05-11 05:48:35 - ERROR - stderr - +2025-05-11 05:48:35 - INFO - stdout - {'loss': 0.7255, 'grad_norm': 0.6513398885726929, 'learning_rate': 8.447630256426303e-06, 'epoch': 1.69} +2025-05-11 05:48:35 - ERROR - stderr - 56%|█████▋ | 2107/3741 [12:22:42<9:44:54, 21.48s/it] +2025-05-11 05:48:55 - ERROR - stderr - 56%|█████▋ | 2108/3741 [12:23:01<9:31:01, 20.98s/it] +2025-05-11 05:48:55 - ERROR - stderr - +2025-05-11 05:48:55 - ERROR - stderr - +2025-05-11 05:48:55 - INFO - stdout - {'loss': 0.7541, 'grad_norm': 0.6796519756317139, 'learning_rate': 8.439076517718541e-06, 'epoch': 1.69} +2025-05-11 05:48:55 - ERROR - stderr - 56%|█████▋ | 2108/3741 [12:23:01<9:31:01, 20.98s/it] +2025-05-11 05:49:18 - ERROR - stderr - 56%|█████▋ | 2109/3741 [12:23:24<9:48:06, 21.62s/it] +2025-05-11 05:49:18 - ERROR - stderr - +2025-05-11 05:49:18 - ERROR - stderr - +2025-05-11 05:49:18 - INFO - stdout - {'loss': 0.7137, 'grad_norm': 0.6824721097946167, 'learning_rate': 8.430523949444367e-06, 'epoch': 1.69} +2025-05-11 05:49:18 - ERROR - stderr - 56%|█████▋ | 2109/3741 [12:23:25<9:48:06, 21.62s/it] +2025-05-11 05:49:38 - ERROR - stderr - 56%|█████▋ | 2110/3741 [12:23:44<9:32:03, 21.04s/it] +2025-05-11 05:49:38 - ERROR - stderr - +2025-05-11 05:49:38 - ERROR - stderr - +2025-05-11 05:49:38 - INFO - stdout - {'loss': 0.7435, 'grad_norm': 0.6917021870613098, 'learning_rate': 8.421972558016786e-06, 'epoch': 1.69} +2025-05-11 05:49:38 - ERROR - stderr - 56%|█████▋ | 2110/3741 [12:23:44<9:32:03, 21.04s/it] +2025-05-11 05:49:58 - ERROR - stderr - 56%|█████▋ | 2111/3741 [12:24:04<9:24:55, 20.79s/it] +2025-05-11 05:49:58 - ERROR - stderr - +2025-05-11 05:49:58 - ERROR - stderr - +2025-05-11 05:49:58 - INFO - stdout - {'loss': 0.7403, 'grad_norm': 0.6658660173416138, 'learning_rate': 8.413422349847918e-06, 'epoch': 1.69} +2025-05-11 05:49:58 - ERROR - stderr - 56%|█████▋ | 2111/3741 [12:24:04<9:24:55, 20.79s/it] +2025-05-11 05:50:18 - ERROR - stderr - 56%|█████▋ | 2112/3741 [12:24:24<9:14:34, 20.43s/it] +2025-05-11 05:50:18 - ERROR - stderr - +2025-05-11 05:50:18 - ERROR - stderr - +2025-05-11 05:50:18 - INFO - stdout - {'loss': 0.7765, 'grad_norm': 0.7041023373603821, 'learning_rate': 8.404873331349009e-06, 'epoch': 1.69} +2025-05-11 05:50:18 - ERROR - stderr - 56%|█████▋ | 2112/3741 [12:24:24<9:14:34, 20.43s/it] +2025-05-11 05:50:37 - ERROR - stderr - 56%|█████▋ | 2113/3741 [12:24:44<9:07:23, 20.17s/it] +2025-05-11 05:50:37 - ERROR - stderr - +2025-05-11 05:50:37 - ERROR - stderr - +2025-05-11 05:50:37 - INFO - stdout - {'loss': 0.766, 'grad_norm': 0.7012107372283936, 'learning_rate': 8.396325508930398e-06, 'epoch': 1.69} +2025-05-11 05:50:37 - ERROR - stderr - 56%|█████▋ | 2113/3741 [12:24:44<9:07:23, 20.17s/it] +2025-05-11 05:50:57 - ERROR - stderr - 57%|█████▋ | 2114/3741 [12:25:03<9:04:50, 20.09s/it] +2025-05-11 05:50:57 - ERROR - stderr - +2025-05-11 05:50:57 - ERROR - stderr - +2025-05-11 05:50:57 - INFO - stdout - {'loss': 0.7359, 'grad_norm': 0.662287712097168, 'learning_rate': 8.387778889001539e-06, 'epoch': 1.7} +2025-05-11 05:50:57 - ERROR - stderr - 57%|█████▋ | 2114/3741 [12:25:03<9:04:50, 20.09s/it] +2025-05-11 05:51:17 - ERROR - stderr - 57%|█████▋ | 2115/3741 [12:25:23<9:00:36, 19.95s/it] +2025-05-11 05:51:17 - ERROR - stderr - +2025-05-11 05:51:17 - ERROR - stderr - +2025-05-11 05:51:17 - INFO - stdout - {'loss': 0.7821, 'grad_norm': 0.692939043045044, 'learning_rate': 8.379233477970975e-06, 'epoch': 1.7} +2025-05-11 05:51:17 - ERROR - stderr - 57%|█████▋ | 2115/3741 [12:25:23<9:00:36, 19.95s/it] +2025-05-11 05:51:38 - ERROR - stderr - 57%|█████▋ | 2116/3741 [12:25:44<9:12:15, 20.39s/it] +2025-05-11 05:51:38 - ERROR - stderr - +2025-05-11 05:51:38 - ERROR - stderr - +2025-05-11 05:51:38 - INFO - stdout - {'loss': 0.7677, 'grad_norm': 0.7044574022293091, 'learning_rate': 8.370689282246341e-06, 'epoch': 1.7} +2025-05-11 05:51:38 - ERROR - stderr - 57%|█████▋ | 2116/3741 [12:25:45<9:12:15, 20.39s/it] +2025-05-11 05:51:59 - ERROR - stderr - 57%|█████▋ | 2117/3741 [12:26:05<9:13:12, 20.44s/it] +2025-05-11 05:51:59 - ERROR - stderr - +2025-05-11 05:51:59 - ERROR - stderr - +2025-05-11 05:51:59 - INFO - stdout - {'loss': 0.7545, 'grad_norm': 0.6683332920074463, 'learning_rate': 8.36214630823438e-06, 'epoch': 1.7} +2025-05-11 05:51:59 - ERROR - stderr - 57%|█████▋ | 2117/3741 [12:26:05<9:13:12, 20.44s/it] +2025-05-11 05:52:19 - ERROR - stderr - 57%|█████▋ | 2118/3741 [12:26:26<9:14:34, 20.50s/it] +2025-05-11 05:52:19 - ERROR - stderr - +2025-05-11 05:52:19 - ERROR - stderr - +2025-05-11 05:52:19 - INFO - stdout - {'loss': 0.777, 'grad_norm': 0.7165277600288391, 'learning_rate': 8.353604562340886e-06, 'epoch': 1.7} +2025-05-11 05:52:19 - ERROR - stderr - 57%|█████▋ | 2118/3741 [12:26:26<9:14:34, 20.50s/it] +2025-05-11 05:52:39 - ERROR - stderr - 57%|█████▋ | 2119/3741 [12:26:46<9:09:10, 20.31s/it] +2025-05-11 05:52:39 - ERROR - stderr - +2025-05-11 05:52:39 - ERROR - stderr - +2025-05-11 05:52:39 - INFO - stdout - {'loss': 0.7495, 'grad_norm': 0.7054672241210938, 'learning_rate': 8.345064050970767e-06, 'epoch': 1.7} +2025-05-11 05:52:39 - ERROR - stderr - 57%|█████▋ | 2119/3741 [12:26:46<9:09:10, 20.31s/it] +2025-05-11 05:53:00 - ERROR - stderr - 57%|█████▋ | 2120/3741 [12:27:07<9:14:13, 20.51s/it] +2025-05-11 05:53:00 - ERROR - stderr - +2025-05-11 05:53:00 - ERROR - stderr - +2025-05-11 05:53:00 - INFO - stdout - {'loss': 0.7648, 'grad_norm': 0.664445161819458, 'learning_rate': 8.336524780527986e-06, 'epoch': 1.7} +2025-05-11 05:53:00 - ERROR - stderr - 57%|█████▋ | 2120/3741 [12:27:07<9:14:13, 20.51s/it] +2025-05-11 05:53:20 - ERROR - stderr - 57%|█████▋ | 2121/3741 [12:27:26<9:09:10, 20.34s/it] +2025-05-11 05:53:20 - ERROR - stderr - +2025-05-11 05:53:20 - ERROR - stderr - +2025-05-11 05:53:20 - INFO - stdout - {'loss': 0.7248, 'grad_norm': 0.6737078428268433, 'learning_rate': 8.327986757415571e-06, 'epoch': 1.7} +2025-05-11 05:53:20 - ERROR - stderr - 57%|█████▋ | 2121/3741 [12:27:26<9:09:10, 20.34s/it] +2025-05-11 05:53:43 - ERROR - stderr - 57%|█████▋ | 2122/3741 [12:27:49<9:26:35, 21.00s/it] +2025-05-11 05:53:43 - ERROR - stderr - +2025-05-11 05:53:43 - ERROR - stderr - +2025-05-11 05:53:43 - INFO - stdout - {'loss': 0.7499, 'grad_norm': 0.6758593320846558, 'learning_rate': 8.319449988035631e-06, 'epoch': 1.7} +2025-05-11 05:53:43 - ERROR - stderr - 57%|█████▋ | 2122/3741 [12:27:49<9:26:35, 21.00s/it] +2025-05-11 05:54:02 - ERROR - stderr - 57%|█████▋ | 2123/3741 [12:28:09<9:15:43, 20.61s/it] +2025-05-11 05:54:02 - ERROR - stderr - +2025-05-11 05:54:02 - ERROR - stderr - +2025-05-11 05:54:02 - INFO - stdout - {'loss': 0.7671, 'grad_norm': 0.668846845626831, 'learning_rate': 8.310914478789321e-06, 'epoch': 1.7} +2025-05-11 05:54:02 - ERROR - stderr - 57%|█████▋ | 2123/3741 [12:28:09<9:15:43, 20.61s/it] +2025-05-11 05:54:25 - ERROR - stderr - 57%|█████▋ | 2124/3741 [12:28:31<9:30:48, 21.18s/it] +2025-05-11 05:54:25 - ERROR - stderr - +2025-05-11 05:54:25 - ERROR - stderr - +2025-05-11 05:54:25 - INFO - stdout - {'loss': 0.7507, 'grad_norm': 0.6598069071769714, 'learning_rate': 8.30238023607686e-06, 'epoch': 1.7} +2025-05-11 05:54:25 - ERROR - stderr - 57%|█████▋ | 2124/3741 [12:28:31<9:30:48, 21.18s/it] +2025-05-11 05:54:45 - ERROR - stderr - 57%|█████▋ | 2125/3741 [12:28:51<9:20:02, 20.79s/it] +2025-05-11 05:54:45 - ERROR - stderr - +2025-05-11 05:54:45 - ERROR - stderr - +2025-05-11 05:54:45 - INFO - stdout - {'loss': 0.7198, 'grad_norm': 0.6659058928489685, 'learning_rate': 8.293847266297513e-06, 'epoch': 1.7} +2025-05-11 05:54:45 - ERROR - stderr - 57%|█████▋ | 2125/3741 [12:28:51<9:20:02, 20.79s/it] +2025-05-11 05:55:07 - ERROR - stderr - 57%|█████▋ | 2126/3741 [12:29:14<9:34:20, 21.34s/it] +2025-05-11 05:55:07 - ERROR - stderr - +2025-05-11 05:55:07 - ERROR - stderr - +2025-05-11 05:55:07 - INFO - stdout - {'loss': 0.7103, 'grad_norm': 0.6601713299751282, 'learning_rate': 8.285315575849589e-06, 'epoch': 1.7} +2025-05-11 05:55:07 - ERROR - stderr - 57%|█████▋ | 2126/3741 [12:29:14<9:34:20, 21.34s/it] +2025-05-11 05:55:27 - ERROR - stderr - 57%|█████▋ | 2127/3741 [12:29:33<9:19:09, 20.79s/it] +2025-05-11 05:55:27 - ERROR - stderr - +2025-05-11 05:55:27 - ERROR - stderr - +2025-05-11 05:55:27 - INFO - stdout - {'loss': 0.7441, 'grad_norm': 0.6853638887405396, 'learning_rate': 8.276785171130445e-06, 'epoch': 1.71} +2025-05-11 05:55:27 - ERROR - stderr - 57%|█████▋ | 2127/3741 [12:29:33<9:19:09, 20.79s/it] +2025-05-11 05:55:50 - ERROR - stderr - 57%|█████▋ | 2128/3741 [12:29:56<9:36:24, 21.44s/it] +2025-05-11 05:55:50 - ERROR - stderr - +2025-05-11 05:55:50 - ERROR - stderr - +2025-05-11 05:55:50 - INFO - stdout - {'loss': 0.744, 'grad_norm': 0.71426922082901, 'learning_rate': 8.26825605853646e-06, 'epoch': 1.71} +2025-05-11 05:55:50 - ERROR - stderr - 57%|█████▋ | 2128/3741 [12:29:56<9:36:24, 21.44s/it] +2025-05-11 05:56:10 - ERROR - stderr - 57%|█████▋ | 2129/3741 [12:30:16<9:23:36, 20.98s/it] +2025-05-11 05:56:10 - ERROR - stderr - +2025-05-11 05:56:10 - ERROR - stderr - +2025-05-11 05:56:10 - INFO - stdout - {'loss': 0.694, 'grad_norm': 0.6715183854103088, 'learning_rate': 8.259728244463065e-06, 'epoch': 1.71} +2025-05-11 05:56:10 - ERROR - stderr - 57%|█████▋ | 2129/3741 [12:30:16<9:23:36, 20.98s/it] +2025-05-11 05:56:33 - ERROR - stderr - 57%|█████▋ | 2130/3741 [12:30:39<9:42:12, 21.68s/it] +2025-05-11 05:56:33 - ERROR - stderr - +2025-05-11 05:56:33 - ERROR - stderr - +2025-05-11 05:56:33 - INFO - stdout - {'loss': 0.7388, 'grad_norm': 0.6622087359428406, 'learning_rate': 8.251201735304698e-06, 'epoch': 1.71} +2025-05-11 05:56:33 - ERROR - stderr - 57%|█████▋ | 2130/3741 [12:30:39<9:42:12, 21.68s/it] +2025-05-11 05:56:53 - ERROR - stderr - 57%|█████▋ | 2131/3741 [12:30:59<9:23:54, 21.02s/it] +2025-05-11 05:56:53 - ERROR - stderr - +2025-05-11 05:56:53 - ERROR - stderr - +2025-05-11 05:56:53 - INFO - stdout - {'loss': 0.775, 'grad_norm': 0.7121644616127014, 'learning_rate': 8.242676537454825e-06, 'epoch': 1.71} +2025-05-11 05:56:53 - ERROR - stderr - 57%|█████▋ | 2131/3741 [12:30:59<9:23:54, 21.02s/it] +2025-05-11 05:57:17 - ERROR - stderr - 57%|█████▋ | 2132/3741 [12:31:23<9:49:52, 22.00s/it] +2025-05-11 05:57:17 - ERROR - stderr - +2025-05-11 05:57:17 - ERROR - stderr - +2025-05-11 05:57:17 - INFO - stdout - {'loss': 0.7363, 'grad_norm': 0.6632829904556274, 'learning_rate': 8.234152657305936e-06, 'epoch': 1.71} +2025-05-11 05:57:17 - ERROR - stderr - 57%|█████▋ | 2132/3741 [12:31:23<9:49:52, 22.00s/it] +2025-05-11 05:57:37 - ERROR - stderr - 57%|█████▋ | 2133/3741 [12:31:43<9:34:35, 21.44s/it] +2025-05-11 05:57:37 - ERROR - stderr - +2025-05-11 05:57:37 - ERROR - stderr - +2025-05-11 05:57:37 - INFO - stdout - {'loss': 0.7378, 'grad_norm': 0.6959726214408875, 'learning_rate': 8.22563010124952e-06, 'epoch': 1.71} +2025-05-11 05:57:37 - ERROR - stderr - 57%|█████▋ | 2133/3741 [12:31:43<9:34:35, 21.44s/it] +2025-05-11 05:58:00 - ERROR - stderr - 57%|█████▋ | 2134/3741 [12:32:06<9:44:53, 21.84s/it] +2025-05-11 05:58:00 - ERROR - stderr - +2025-05-11 05:58:00 - ERROR - stderr - +2025-05-11 05:58:00 - INFO - stdout - {'loss': 0.7204, 'grad_norm': 0.7143699526786804, 'learning_rate': 8.217108875676083e-06, 'epoch': 1.71} +2025-05-11 05:58:00 - ERROR - stderr - 57%|█████▋ | 2134/3741 [12:32:06<9:44:53, 21.84s/it] +2025-05-11 05:58:19 - ERROR - stderr - 57%|█████▋ | 2135/3741 [12:32:26<9:26:11, 21.15s/it] +2025-05-11 05:58:19 - ERROR - stderr - +2025-05-11 05:58:19 - ERROR - stderr - +2025-05-11 05:58:19 - INFO - stdout - {'loss': 0.7385, 'grad_norm': 0.6711505651473999, 'learning_rate': 8.20858898697513e-06, 'epoch': 1.71} +2025-05-11 05:58:19 - ERROR - stderr - 57%|█████▋ | 2135/3741 [12:32:26<9:26:11, 21.15s/it] +2025-05-11 05:58:42 - ERROR - stderr - 57%|█████▋ | 2136/3741 [12:32:48<9:37:20, 21.58s/it] +2025-05-11 05:58:42 - ERROR - stderr - +2025-05-11 05:58:42 - ERROR - stderr - +2025-05-11 05:58:42 - INFO - stdout - {'loss': 0.7139, 'grad_norm': 0.6337816119194031, 'learning_rate': 8.200070441535159e-06, 'epoch': 1.71} +2025-05-11 05:58:42 - ERROR - stderr - 57%|█████▋ | 2136/3741 [12:32:48<9:37:20, 21.58s/it] +2025-05-11 05:59:02 - ERROR - stderr - 57%|█████▋ | 2137/3741 [12:33:08<9:21:17, 21.00s/it] +2025-05-11 05:59:02 - ERROR - stderr - +2025-05-11 05:59:02 - ERROR - stderr - +2025-05-11 05:59:02 - INFO - stdout - {'loss': 0.7812, 'grad_norm': 0.6956325173377991, 'learning_rate': 8.191553245743675e-06, 'epoch': 1.71} +2025-05-11 05:59:02 - ERROR - stderr - 57%|█████▋ | 2137/3741 [12:33:08<9:21:17, 21.00s/it] +2025-05-11 05:59:25 - ERROR - stderr - 57%|█████▋ | 2138/3741 [12:33:31<9:41:50, 21.78s/it] +2025-05-11 05:59:25 - ERROR - stderr - +2025-05-11 05:59:25 - ERROR - stderr - +2025-05-11 05:59:25 - INFO - stdout - {'loss': 0.7351, 'grad_norm': 0.6819128394126892, 'learning_rate': 8.183037405987155e-06, 'epoch': 1.71} +2025-05-11 05:59:25 - ERROR - stderr - 57%|█████▋ | 2138/3741 [12:33:31<9:41:50, 21.78s/it] +2025-05-11 05:59:45 - ERROR - stderr - 57%|█████▋ | 2139/3741 [12:33:51<9:26:48, 21.23s/it] +2025-05-11 05:59:45 - ERROR - stderr - +2025-05-11 05:59:45 - ERROR - stderr - +2025-05-11 05:59:45 - INFO - stdout - {'loss': 0.7443, 'grad_norm': 0.6942816972732544, 'learning_rate': 8.174522928651068e-06, 'epoch': 1.72} +2025-05-11 05:59:45 - ERROR - stderr - 57%|█████▋ | 2139/3741 [12:33:51<9:26:48, 21.23s/it] +2025-05-11 06:00:08 - ERROR - stderr - 57%|█████▋ | 2140/3741 [12:34:14<9:40:54, 21.77s/it] +2025-05-11 06:00:08 - ERROR - stderr - +2025-05-11 06:00:08 - ERROR - stderr - +2025-05-11 06:00:08 - INFO - stdout - {'loss': 0.7521, 'grad_norm': 0.653355598449707, 'learning_rate': 8.166009820119857e-06, 'epoch': 1.72} +2025-05-11 06:00:08 - ERROR - stderr - 57%|█████▋ | 2140/3741 [12:34:14<9:40:54, 21.77s/it] +2025-05-11 06:00:28 - ERROR - stderr - 57%|█████▋ | 2141/3741 [12:34:34<9:23:34, 21.13s/it] +2025-05-11 06:00:28 - ERROR - stderr - +2025-05-11 06:00:28 - ERROR - stderr - +2025-05-11 06:00:28 - INFO - stdout - {'loss': 0.7139, 'grad_norm': 0.6483197808265686, 'learning_rate': 8.157498086776937e-06, 'epoch': 1.72} +2025-05-11 06:00:28 - ERROR - stderr - 57%|█████▋ | 2141/3741 [12:34:34<9:23:34, 21.13s/it] +2025-05-11 06:00:50 - ERROR - stderr - 57%|█████▋ | 2142/3741 [12:34:56<9:28:40, 21.34s/it] +2025-05-11 06:00:50 - ERROR - stderr - +2025-05-11 06:00:50 - ERROR - stderr - +2025-05-11 06:00:50 - INFO - stdout - {'loss': 0.7773, 'grad_norm': 0.6634138822555542, 'learning_rate': 8.148987735004706e-06, 'epoch': 1.72} +2025-05-11 06:00:50 - ERROR - stderr - 57%|█████▋ | 2142/3741 [12:34:56<9:28:40, 21.34s/it] +2025-05-11 06:01:09 - ERROR - stderr - 57%|█████▋ | 2143/3741 [12:35:15<9:13:22, 20.78s/it] +2025-05-11 06:01:09 - ERROR - stderr - +2025-05-11 06:01:09 - ERROR - stderr - +2025-05-11 06:01:09 - INFO - stdout - {'loss': 0.7243, 'grad_norm': 0.6787661910057068, 'learning_rate': 8.140478771184507e-06, 'epoch': 1.72} +2025-05-11 06:01:09 - ERROR - stderr - 57%|█████▋ | 2143/3741 [12:35:15<9:13:22, 20.78s/it] +2025-05-11 06:01:29 - ERROR - stderr - 57%|█████▋ | 2144/3741 [12:35:35<9:02:48, 20.39s/it] +2025-05-11 06:01:29 - ERROR - stderr - +2025-05-11 06:01:29 - ERROR - stderr - +2025-05-11 06:01:29 - INFO - stdout - {'loss': 0.7558, 'grad_norm': 0.6924588680267334, 'learning_rate': 8.131971201696656e-06, 'epoch': 1.72} +2025-05-11 06:01:29 - ERROR - stderr - 57%|█████▋ | 2144/3741 [12:35:35<9:02:48, 20.39s/it] +2025-05-11 06:01:48 - ERROR - stderr - 57%|█████▋ | 2145/3741 [12:35:54<8:54:21, 20.09s/it] +2025-05-11 06:01:48 - ERROR - stderr - +2025-05-11 06:01:48 - ERROR - stderr - +2025-05-11 06:01:48 - INFO - stdout - {'loss': 0.7175, 'grad_norm': 0.6862332820892334, 'learning_rate': 8.123465032920415e-06, 'epoch': 1.72} +2025-05-11 06:01:48 - ERROR - stderr - 57%|█████▋ | 2145/3741 [12:35:54<8:54:21, 20.09s/it] +2025-05-11 06:02:07 - ERROR - stderr - 57%|█████▋ | 2146/3741 [12:36:14<8:48:09, 19.87s/it] +2025-05-11 06:02:07 - ERROR - stderr - +2025-05-11 06:02:07 - ERROR - stderr - +2025-05-11 06:02:07 - INFO - stdout - {'loss': 0.7774, 'grad_norm': 0.6828835010528564, 'learning_rate': 8.114960271233999e-06, 'epoch': 1.72} +2025-05-11 06:02:07 - ERROR - stderr - 57%|█████▋ | 2146/3741 [12:36:14<8:48:09, 19.87s/it] +2025-05-11 06:02:27 - ERROR - stderr - 57%|█████▋ | 2147/3741 [12:36:33<8:46:48, 19.83s/it] +2025-05-11 06:02:27 - ERROR - stderr - +2025-05-11 06:02:27 - ERROR - stderr - +2025-05-11 06:02:27 - INFO - stdout - {'loss': 0.7002, 'grad_norm': 0.6388763189315796, 'learning_rate': 8.106456923014571e-06, 'epoch': 1.72} +2025-05-11 06:02:27 - ERROR - stderr - 57%|█████▋ | 2147/3741 [12:36:33<8:46:48, 19.83s/it] +2025-05-11 06:02:47 - ERROR - stderr - 57%|█████▋ | 2148/3741 [12:36:53<8:45:13, 19.78s/it] +2025-05-11 06:02:47 - ERROR - stderr - +2025-05-11 06:02:47 - ERROR - stderr - +2025-05-11 06:02:47 - INFO - stdout - {'loss': 0.7579, 'grad_norm': 0.6799570918083191, 'learning_rate': 8.097954994638225e-06, 'epoch': 1.72} +2025-05-11 06:02:47 - ERROR - stderr - 57%|█████▋ | 2148/3741 [12:36:53<8:45:13, 19.78s/it] +2025-05-11 06:03:08 - ERROR - stderr - 57%|█████▋ | 2149/3741 [12:37:14<8:55:28, 20.18s/it] +2025-05-11 06:03:08 - ERROR - stderr - +2025-05-11 06:03:08 - ERROR - stderr - +2025-05-11 06:03:08 - INFO - stdout - {'loss': 0.7354, 'grad_norm': 0.6811538338661194, 'learning_rate': 8.089454492480004e-06, 'epoch': 1.72} +2025-05-11 06:03:08 - ERROR - stderr - 57%|█████▋ | 2149/3741 [12:37:14<8:55:28, 20.18s/it] +2025-05-11 06:03:27 - ERROR - stderr - 57%|█████▋ | 2150/3741 [12:37:33<8:48:17, 19.92s/it] +2025-05-11 06:03:27 - ERROR - stderr - +2025-05-11 06:03:27 - ERROR - stderr - +2025-05-11 06:03:27 - INFO - stdout - {'loss': 0.7416, 'grad_norm': 0.7103093266487122, 'learning_rate': 8.080955422913872e-06, 'epoch': 1.72} +2025-05-11 06:03:27 - ERROR - stderr - 57%|█████▋ | 2150/3741 [12:37:33<8:48:17, 19.92s/it] +2025-05-11 06:03:48 - ERROR - stderr - 57%|█████▋ | 2151/3741 [12:37:54<8:52:40, 20.10s/it] +2025-05-11 06:03:48 - ERROR - stderr - +2025-05-11 06:03:48 - ERROR - stderr - +2025-05-11 06:03:48 - INFO - stdout - {'loss': 0.7321, 'grad_norm': 0.6595764756202698, 'learning_rate': 8.072457792312715e-06, 'epoch': 1.72} +2025-05-11 06:03:48 - ERROR - stderr - 57%|███��█▋ | 2151/3741 [12:37:54<8:52:40, 20.10s/it] +2025-05-11 06:04:07 - ERROR - stderr - 58%|█████▊ | 2152/3741 [12:38:14<8:48:36, 19.96s/it] +2025-05-11 06:04:07 - ERROR - stderr - +2025-05-11 06:04:07 - ERROR - stderr - +2025-05-11 06:04:07 - INFO - stdout - {'loss': 0.7535, 'grad_norm': 0.6965066194534302, 'learning_rate': 8.063961607048353e-06, 'epoch': 1.73} +2025-05-11 06:04:07 - ERROR - stderr - 58%|█████▊ | 2152/3741 [12:38:14<8:48:36, 19.96s/it] +2025-05-11 06:04:30 - ERROR - stderr - 58%|█████▊ | 2153/3741 [12:38:37<9:11:54, 20.85s/it] +2025-05-11 06:04:30 - ERROR - stderr - +2025-05-11 06:04:30 - ERROR - stderr - +2025-05-11 06:04:30 - INFO - stdout - {'loss': 0.7559, 'grad_norm': 0.6480819582939148, 'learning_rate': 8.05546687349151e-06, 'epoch': 1.73} +2025-05-11 06:04:30 - ERROR - stderr - 58%|█████▊ | 2153/3741 [12:38:37<9:11:54, 20.85s/it] +2025-05-11 06:04:50 - ERROR - stderr - 58%|█████▊ | 2154/3741 [12:38:56<9:01:34, 20.48s/it] +2025-05-11 06:04:50 - ERROR - stderr - +2025-05-11 06:04:50 - ERROR - stderr - +2025-05-11 06:04:50 - INFO - stdout - {'loss': 0.7416, 'grad_norm': 0.6977070569992065, 'learning_rate': 8.046973598011831e-06, 'epoch': 1.73} +2025-05-11 06:04:50 - ERROR - stderr - 58%|█████▊ | 2154/3741 [12:38:56<9:01:34, 20.48s/it] +2025-05-11 06:05:10 - ERROR - stderr - 58%|█████▊ | 2155/3741 [12:39:16<8:55:44, 20.27s/it] +2025-05-11 06:05:10 - ERROR - stderr - +2025-05-11 06:05:10 - ERROR - stderr - +2025-05-11 06:05:10 - INFO - stdout - {'loss': 0.7546, 'grad_norm': 0.689796507358551, 'learning_rate': 8.038481786977858e-06, 'epoch': 1.73} +2025-05-11 06:05:10 - ERROR - stderr - 58%|█████▊ | 2155/3741 [12:39:16<8:55:44, 20.27s/it] +2025-05-11 06:05:29 - ERROR - stderr - 58%|█████▊ | 2156/3741 [12:39:35<8:48:52, 20.02s/it] +2025-05-11 06:05:29 - ERROR - stderr - +2025-05-11 06:05:29 - ERROR - stderr - +2025-05-11 06:05:29 - INFO - stdout - {'loss': 0.733, 'grad_norm': 0.6607633233070374, 'learning_rate': 8.029991446757047e-06, 'epoch': 1.73} +2025-05-11 06:05:29 - ERROR - stderr - 58%|█████▊ | 2156/3741 [12:39:35<8:48:52, 20.02s/it] +2025-05-11 06:05:48 - ERROR - stderr - 58%|█████▊ | 2157/3741 [12:39:55<8:43:39, 19.84s/it] +2025-05-11 06:05:48 - ERROR - stderr - +2025-05-11 06:05:48 - ERROR - stderr - +2025-05-11 06:05:48 - INFO - stdout - {'loss': 0.7734, 'grad_norm': 0.7048103213310242, 'learning_rate': 8.02150258371574e-06, 'epoch': 1.73} +2025-05-11 06:05:48 - ERROR - stderr - 58%|█████▊ | 2157/3741 [12:39:55<8:43:39, 19.84s/it] +2025-05-11 06:06:08 - ERROR - stderr - 58%|█████▊ | 2158/3741 [12:40:15<8:44:35, 19.88s/it] +2025-05-11 06:06:08 - ERROR - stderr - +2025-05-11 06:06:08 - ERROR - stderr - +2025-05-11 06:06:08 - INFO - stdout - {'loss': 0.7469, 'grad_norm': 0.6884298324584961, 'learning_rate': 8.013015204219171e-06, 'epoch': 1.73} +2025-05-11 06:06:08 - ERROR - stderr - 58%|█████▊ | 2158/3741 [12:40:15<8:44:35, 19.88s/it] +2025-05-11 06:06:28 - ERROR - stderr - 58%|█████▊ | 2159/3741 [12:40:34<8:39:16, 19.69s/it] +2025-05-11 06:06:28 - ERROR - stderr - +2025-05-11 06:06:28 - ERROR - stderr - +2025-05-11 06:06:28 - INFO - stdout - {'loss': 0.7367, 'grad_norm': 0.7125540375709534, 'learning_rate': 8.004529314631476e-06, 'epoch': 1.73} +2025-05-11 06:06:28 - ERROR - stderr - 58%|█████▊ | 2159/3741 [12:40:34<8:39:16, 19.69s/it] +2025-05-11 06:06:50 - ERROR - stderr - 58%|█████▊ | 2160/3741 [12:40:56<8:56:52, 20.37s/it] +2025-05-11 06:06:50 - ERROR - stderr - +2025-05-11 06:06:50 - ERROR - stderr - +2025-05-11 06:06:50 - INFO - stdout - {'loss': 0.7246, 'grad_norm': 0.6849104762077332, 'learning_rate': 7.996044921315656e-06, 'epoch': 1.73} +2025-05-11 06:06:50 - ERROR - stderr - 58%|█████▊ | 2160/3741 [12:40:56<8:56:52, 20.37s/it] +2025-05-11 06:07:10 - ERROR - stderr - 58%|█████▊ | 2161/3741 [12:41:16<8:55:12, 20.32s/it] +2025-05-11 06:07:10 - ERROR - stderr - +2025-05-11 06:07:10 - ERROR - stderr - +2025-05-11 06:07:10 - INFO - stdout - {'loss': 0.726, 'grad_norm': 0.6800344586372375, 'learning_rate': 7.987562030633604e-06, 'epoch': 1.73} +2025-05-11 06:07:10 - ERROR - stderr - 58%|█████▊ | 2161/3741 [12:41:16<8:55:12, 20.32s/it] +2025-05-11 06:07:32 - ERROR - stderr - 58%|█████▊ | 2162/3741 [12:41:39<9:11:04, 20.94s/it] +2025-05-11 06:07:32 - ERROR - stderr - +2025-05-11 06:07:32 - ERROR - stderr - +2025-05-11 06:07:32 - INFO - stdout - {'loss': 0.7169, 'grad_norm': 0.7025103569030762, 'learning_rate': 7.979080648946078e-06, 'epoch': 1.73} +2025-05-11 06:07:32 - ERROR - stderr - 58%|█████▊ | 2162/3741 [12:41:39<9:11:04, 20.94s/it] +2025-05-11 06:07:52 - ERROR - stderr - 58%|█████▊ | 2163/3741 [12:41:58<9:02:01, 20.61s/it] +2025-05-11 06:07:52 - ERROR - stderr - +2025-05-11 06:07:52 - ERROR - stderr - +2025-05-11 06:07:52 - INFO - stdout - {'loss': 0.7497, 'grad_norm': 0.6762646436691284, 'learning_rate': 7.970600782612703e-06, 'epoch': 1.73} +2025-05-11 06:07:52 - ERROR - stderr - 58%|█████▊ | 2163/3741 [12:41:58<9:02:01, 20.61s/it] +2025-05-11 06:08:15 - ERROR - stderr - 58%|█████▊ | 2164/3741 [12:42:21<9:20:22, 21.32s/it] +2025-05-11 06:08:15 - ERROR - stderr - +2025-05-11 06:08:15 - ERROR - stderr - +2025-05-11 06:08:15 - INFO - stdout - {'loss': 0.7864, 'grad_norm': 0.7249945998191833, 'learning_rate': 7.962122437991978e-06, 'epoch': 1.74} +2025-05-11 06:08:15 - ERROR - stderr - 58%|█████▊ | 2164/3741 [12:42:21<9:20:22, 21.32s/it] +2025-05-11 06:08:35 - ERROR - stderr - 58%|█████▊ | 2165/3741 [12:42:41<9:06:17, 20.80s/it] +2025-05-11 06:08:35 - ERROR - stderr - +2025-05-11 06:08:35 - ERROR - stderr - +2025-05-11 06:08:35 - INFO - stdout - {'loss': 0.7619, 'grad_norm': 0.6832449436187744, 'learning_rate': 7.953645621441245e-06, 'epoch': 1.74} +2025-05-11 06:08:35 - ERROR - stderr - 58%|█████▊ | 2165/3741 [12:42:41<9:06:17, 20.80s/it] +2025-05-11 06:08:58 - ERROR - stderr - 58%|█████▊ | 2166/3741 [12:43:04<9:25:46, 21.55s/it] +2025-05-11 06:08:58 - ERROR - stderr - +2025-05-11 06:08:58 - ERROR - stderr - +2025-05-11 06:08:58 - INFO - stdout - {'loss': 0.721, 'grad_norm': 0.6707166433334351, 'learning_rate': 7.945170339316724e-06, 'epoch': 1.74} +2025-05-11 06:08:58 - ERROR - stderr - 58%|█████▊ | 2166/3741 [12:43:04<9:25:46, 21.55s/it] +2025-05-11 06:09:18 - ERROR - stderr - 58%|█████▊ | 2167/3741 [12:43:24<9:10:38, 20.99s/it] +2025-05-11 06:09:18 - ERROR - stderr - +2025-05-11 06:09:18 - ERROR - stderr - +2025-05-11 06:09:18 - INFO - stdout - {'loss': 0.7313, 'grad_norm': 0.6761574149131775, 'learning_rate': 7.93669659797346e-06, 'epoch': 1.74} +2025-05-11 06:09:18 - ERROR - stderr - 58%|█████▊ | 2167/3741 [12:43:24<9:10:38, 20.99s/it] +2025-05-11 06:09:41 - ERROR - stderr - 58%|█████▊ | 2168/3741 [12:43:48<9:31:14, 21.79s/it] +2025-05-11 06:09:41 - ERROR - stderr - +2025-05-11 06:09:41 - ERROR - stderr - +2025-05-11 06:09:41 - INFO - stdout - {'loss': 0.736, 'grad_norm': 0.6892061829566956, 'learning_rate': 7.928224403765353e-06, 'epoch': 1.74} +2025-05-11 06:09:41 - ERROR - stderr - 58%|█████▊ | 2168/3741 [12:43:48<9:31:14, 21.79s/it] +2025-05-11 06:10:01 - ERROR - stderr - 58%|█████▊ | 2169/3741 [12:44:07<9:13:01, 21.11s/it] +2025-05-11 06:10:01 - ERROR - stderr - +2025-05-11 06:10:01 - ERROR - stderr - +2025-05-11 06:10:01 - INFO - stdout - {'loss': 0.7297, 'grad_norm': 0.6842492818832397, 'learning_rate': 7.919753763045148e-06, 'epoch': 1.74} +2025-05-11 06:10:01 - ERROR - stderr - 58%|█████▊ | 2169/3741 [12:44:07<9:13:01, 21.11s/it] +2025-05-11 06:10:21 - ERROR - stderr - 58%|█████▊ | 2170/3741 [12:44:28<9:07:30, 20.91s/it] +2025-05-11 06:10:21 - ERROR - stderr - +2025-05-11 06:10:21 - ERROR - stderr - +2025-05-11 06:10:21 - INFO - stdout - {'loss': 0.7464, 'grad_norm': 0.6731321811676025, 'learning_rate': 7.911284682164413e-06, 'epoch': 1.74} +2025-05-11 06:10:21 - ERROR - stderr - 58%|█████▊ | 2170/3741 [12:44:28<9:07:30, 20.91s/it] +2025-05-11 06:10:41 - ERROR - stderr - 58%|█████▊ | 2171/3741 [12:44:47<8:55:49, 20.48s/it] +2025-05-11 06:10:41 - ERROR - stderr - +2025-05-11 06:10:41 - ERROR - stderr - +2025-05-11 06:10:41 - INFO - stdout - {'loss': 0.737, 'grad_norm': 0.6783955693244934, 'learning_rate': 7.90281716747356e-06, 'epoch': 1.74} +2025-05-11 06:10:41 - ERROR - stderr - 58%|█████▊ | 2171/3741 [12:44:47<8:55:49, 20.48s/it] +2025-05-11 06:11:00 - ERROR - stderr - 58%|█████▊ | 2172/3741 [12:45:07<8:48:34, 20.21s/it] +2025-05-11 06:11:00 - ERROR - stderr - +2025-05-11 06:11:00 - ERROR - stderr - +2025-05-11 06:11:00 - INFO - stdout - {'loss': 0.7445, 'grad_norm': 0.6662715673446655, 'learning_rate': 7.894351225321817e-06, 'epoch': 1.74} +2025-05-11 06:11:00 - ERROR - stderr - 58%|█████▊ | 2172/3741 [12:45:07<8:48:34, 20.21s/it] +2025-05-11 06:11:20 - ERROR - stderr - 58%|█████▊ | 2173/3741 [12:45:26<8:43:13, 20.02s/it] +2025-05-11 06:11:20 - ERROR - stderr - +2025-05-11 06:11:20 - ERROR - stderr - +2025-05-11 06:11:20 - INFO - stdout - {'loss': 0.7155, 'grad_norm': 0.6635359525680542, 'learning_rate': 7.885886862057233e-06, 'epoch': 1.74} +2025-05-11 06:11:20 - ERROR - stderr - 58%|█████▊ | 2173/3741 [12:45:26<8:43:13, 20.02s/it] +2025-05-11 06:11:39 - ERROR - stderr - 58%|█████▊ | 2174/3741 [12:45:46<8:39:06, 19.88s/it] +2025-05-11 06:11:39 - ERROR - stderr - +2025-05-11 06:11:39 - ERROR - stderr - +2025-05-11 06:11:39 - INFO - stdout - {'loss': 0.729, 'grad_norm': 0.691408634185791, 'learning_rate': 7.877424084026682e-06, 'epoch': 1.74} +2025-05-11 06:11:39 - ERROR - stderr - 58%|█��███▊ | 2174/3741 [12:45:46<8:39:06, 19.88s/it] +2025-05-11 06:11:59 - ERROR - stderr - 58%|█████▊ | 2175/3741 [12:46:05<8:35:37, 19.76s/it] +2025-05-11 06:11:59 - ERROR - stderr - +2025-05-11 06:11:59 - ERROR - stderr - +2025-05-11 06:11:59 - INFO - stdout - {'loss': 0.7426, 'grad_norm': 0.6881137490272522, 'learning_rate': 7.868962897575837e-06, 'epoch': 1.74} +2025-05-11 06:11:59 - ERROR - stderr - 58%|█████▊ | 2175/3741 [12:46:05<8:35:37, 19.76s/it] +2025-05-11 06:12:18 - ERROR - stderr - 58%|█████▊ | 2176/3741 [12:46:24<8:31:21, 19.60s/it] +2025-05-11 06:12:18 - ERROR - stderr - +2025-05-11 06:12:18 - ERROR - stderr - +2025-05-11 06:12:18 - INFO - stdout - {'loss': 0.7381, 'grad_norm': 0.6919487714767456, 'learning_rate': 7.86050330904919e-06, 'epoch': 1.74} +2025-05-11 06:12:18 - ERROR - stderr - 58%|█████▊ | 2176/3741 [12:46:24<8:31:21, 19.60s/it] +2025-05-11 06:12:39 - ERROR - stderr - 58%|█████▊ | 2177/3741 [12:46:45<8:39:46, 19.94s/it] +2025-05-11 06:12:39 - ERROR - stderr - +2025-05-11 06:12:39 - ERROR - stderr - +2025-05-11 06:12:39 - INFO - stdout - {'loss': 0.7485, 'grad_norm': 0.6565024852752686, 'learning_rate': 7.852045324790023e-06, 'epoch': 1.75} +2025-05-11 06:12:39 - ERROR - stderr - 58%|█████▊ | 2177/3741 [12:46:45<8:39:46, 19.94s/it] +2025-05-11 06:12:59 - ERROR - stderr - 58%|█████▊ | 2178/3741 [12:47:05<8:37:40, 19.87s/it] +2025-05-11 06:12:59 - ERROR - stderr - +2025-05-11 06:12:59 - ERROR - stderr - +2025-05-11 06:12:59 - INFO - stdout - {'loss': 0.7429, 'grad_norm': 0.6828457117080688, 'learning_rate': 7.843588951140421e-06, 'epoch': 1.75} +2025-05-11 06:12:59 - ERROR - stderr - 58%|█████▊ | 2178/3741 [12:47:05<8:37:40, 19.87s/it] +2025-05-11 06:13:20 - ERROR - stderr - 58%|█████▊ | 2179/3741 [12:47:26<8:48:49, 20.31s/it] +2025-05-11 06:13:20 - ERROR - stderr - +2025-05-11 06:13:20 - ERROR - stderr - +2025-05-11 06:13:20 - INFO - stdout - {'loss': 0.7139, 'grad_norm': 0.6890912652015686, 'learning_rate': 7.835134194441265e-06, 'epoch': 1.75} +2025-05-11 06:13:20 - ERROR - stderr - 58%|█████▊ | 2179/3741 [12:47:26<8:48:49, 20.31s/it] +2025-05-11 06:13:39 - ERROR - stderr - 58%|█████▊ | 2180/3741 [12:47:46<8:41:45, 20.05s/it] +2025-05-11 06:13:39 - ERROR - stderr - +2025-05-11 06:13:39 - ERROR - stderr - +2025-05-11 06:13:39 - INFO - stdout - {'loss': 0.7731, 'grad_norm': 0.6851256489753723, 'learning_rate': 7.826681061032216e-06, 'epoch': 1.75} +2025-05-11 06:13:39 - ERROR - stderr - 58%|█████▊ | 2180/3741 [12:47:46<8:41:45, 20.05s/it] +2025-05-11 06:14:02 - ERROR - stderr - 58%|█████▊ | 2181/3741 [12:48:09<9:05:08, 20.97s/it] +2025-05-11 06:14:02 - ERROR - stderr - +2025-05-11 06:14:02 - ERROR - stderr - +2025-05-11 06:14:02 - INFO - stdout - {'loss': 0.7865, 'grad_norm': 0.6674900054931641, 'learning_rate': 7.818229557251722e-06, 'epoch': 1.75} +2025-05-11 06:14:02 - ERROR - stderr - 58%|█████▊ | 2181/3741 [12:48:09<9:05:08, 20.97s/it] +2025-05-11 06:14:22 - ERROR - stderr - 58%|█████▊ | 2182/3741 [12:48:29<8:56:14, 20.64s/it] +2025-05-11 06:14:22 - ERROR - stderr - +2025-05-11 06:14:22 - ERROR - stderr - +2025-05-11 06:14:22 - INFO - stdout - {'loss': 0.7245, 'grad_norm': 0.693809986114502, 'learning_rate': 7.809779689437011e-06, 'epoch': 1.75} +2025-05-11 06:14:22 - ERROR - stderr - 58%|█████▊ | 2182/3741 [12:48:29<8:56:14, 20.64s/it] +2025-05-11 06:14:46 - ERROR - stderr - 58%|█████▊ | 2183/3741 [12:48:52<9:15:58, 21.41s/it] +2025-05-11 06:14:46 - ERROR - stderr - +2025-05-11 06:14:46 - ERROR - stderr - +2025-05-11 06:14:46 - INFO - stdout - {'loss': 0.7556, 'grad_norm': 0.7122488617897034, 'learning_rate': 7.801331463924076e-06, 'epoch': 1.75} +2025-05-11 06:14:46 - ERROR - stderr - 58%|█████▊ | 2183/3741 [12:48:52<9:15:58, 21.41s/it] +2025-05-11 06:15:05 - ERROR - stderr - 58%|█████▊ | 2184/3741 [12:49:11<9:01:39, 20.87s/it] +2025-05-11 06:15:05 - ERROR - stderr - +2025-05-11 06:15:05 - ERROR - stderr - +2025-05-11 06:15:05 - INFO - stdout - {'loss': 0.7313, 'grad_norm': 0.6818574666976929, 'learning_rate': 7.79288488704769e-06, 'epoch': 1.75} +2025-05-11 06:15:05 - ERROR - stderr - 58%|█████▊ | 2184/3741 [12:49:11<9:01:39, 20.87s/it] +2025-05-11 06:15:29 - ERROR - stderr - 58%|█████▊ | 2185/3741 [12:49:36<9:26:19, 21.84s/it] +2025-05-11 06:15:29 - ERROR - stderr - +2025-05-11 06:15:29 - ERROR - stderr - +2025-05-11 06:15:29 - INFO - stdout - {'loss': 0.7223, 'grad_norm': 0.6880291104316711, 'learning_rate': 7.784439965141381e-06, 'epoch': 1.75} +2025-05-11 06:15:29 - ERROR - stderr - 58%|█████▊ | 2185/3741 [12:49:36<9:26:19, 21.84s/it] +2025-05-11 06:15:49 - ERROR - stderr - 58%|█████▊ | 2186/3741 [12:49:55<9:09:15, 21.19s/it] +2025-05-11 06:15:49 - ERROR - stderr - +2025-05-11 06:15:49 - ERROR - stderr - +2025-05-11 06:15:49 - INFO - stdout - {'loss': 0.7173, 'grad_norm': 0.6625512838363647, 'learning_rate': 7.775996704537442e-06, 'epoch': 1.75} +2025-05-11 06:15:49 - ERROR - stderr - 58%|█████▊ | 2186/3741 [12:49:55<9:09:15, 21.19s/it] +2025-05-11 06:16:13 - ERROR - stderr - 58%|█████▊ | 2187/3741 [12:50:20<9:34:27, 22.18s/it] +2025-05-11 06:16:13 - ERROR - stderr - +2025-05-11 06:16:13 - ERROR - stderr - +2025-05-11 06:16:13 - INFO - stdout - {'loss': 0.7263, 'grad_norm': 0.669406533241272, 'learning_rate': 7.767555111566914e-06, 'epoch': 1.75} +2025-05-11 06:16:13 - ERROR - stderr - 58%|█████▊ | 2187/3741 [12:50:20<9:34:27, 22.18s/it] +2025-05-11 06:16:33 - ERROR - stderr - 58%|█████▊ | 2188/3741 [12:50:39<9:15:21, 21.46s/it] +2025-05-11 06:16:33 - ERROR - stderr - +2025-05-11 06:16:33 - ERROR - stderr - +2025-05-11 06:16:33 - INFO - stdout - {'loss': 0.7081, 'grad_norm': 0.6676865816116333, 'learning_rate': 7.759115192559589e-06, 'epoch': 1.75} +2025-05-11 06:16:33 - ERROR - stderr - 58%|█████▊ | 2188/3741 [12:50:40<9:15:21, 21.46s/it] +2025-05-11 06:16:55 - ERROR - stderr - 59%|█████▊ | 2189/3741 [12:51:01<9:19:15, 21.62s/it] +2025-05-11 06:16:55 - ERROR - stderr - +2025-05-11 06:16:55 - ERROR - stderr - +2025-05-11 06:16:55 - INFO - stdout - {'loss': 0.741, 'grad_norm': 0.6633387804031372, 'learning_rate': 7.750676953844011e-06, 'epoch': 1.76} +2025-05-11 06:16:55 - ERROR - stderr - 59%|█████▊ | 2189/3741 [12:51:02<9:19:15, 21.62s/it] +2025-05-11 06:17:15 - ERROR - stderr - 59%|█████▊ | 2190/3741 [12:51:21<9:01:37, 20.95s/it] +2025-05-11 06:17:15 - ERROR - stderr - +2025-05-11 06:17:15 - ERROR - stderr - +2025-05-11 06:17:15 - INFO - stdout - {'loss': 0.7791, 'grad_norm': 0.7206395864486694, 'learning_rate': 7.742240401747457e-06, 'epoch': 1.76} +2025-05-11 06:17:15 - ERROR - stderr - 59%|█████▊ | 2190/3741 [12:51:21<9:01:37, 20.95s/it] +2025-05-11 06:17:35 - ERROR - stderr - 59%|█████▊ | 2191/3741 [12:51:41<8:56:17, 20.76s/it] +2025-05-11 06:17:35 - ERROR - stderr - +2025-05-11 06:17:35 - ERROR - stderr - +2025-05-11 06:17:35 - INFO - stdout - {'loss': 0.7411, 'grad_norm': 0.7158024311065674, 'learning_rate': 7.73380554259594e-06, 'epoch': 1.76} +2025-05-11 06:17:35 - ERROR - stderr - 59%|█████▊ | 2191/3741 [12:51:41<8:56:17, 20.76s/it] +2025-05-11 06:17:54 - ERROR - stderr - 59%|█████▊ | 2192/3741 [12:52:01<8:46:41, 20.40s/it] +2025-05-11 06:17:54 - ERROR - stderr - +2025-05-11 06:17:54 - ERROR - stderr - +2025-05-11 06:17:54 - INFO - stdout - {'loss': 0.7193, 'grad_norm': 0.684083104133606, 'learning_rate': 7.725372382714208e-06, 'epoch': 1.76} +2025-05-11 06:17:54 - ERROR - stderr - 59%|█████▊ | 2192/3741 [12:52:01<8:46:41, 20.40s/it] +2025-05-11 06:18:14 - ERROR - stderr - 59%|█████▊ | 2193/3741 [12:52:20<8:39:39, 20.14s/it] +2025-05-11 06:18:14 - ERROR - stderr - +2025-05-11 06:18:14 - ERROR - stderr - +2025-05-11 06:18:14 - INFO - stdout - {'loss': 0.7775, 'grad_norm': 0.6862793564796448, 'learning_rate': 7.716940928425724e-06, 'epoch': 1.76} +2025-05-11 06:18:14 - ERROR - stderr - 59%|█████▊ | 2193/3741 [12:52:20<8:39:39, 20.14s/it] +2025-05-11 06:18:35 - ERROR - stderr - 59%|█████▊ | 2194/3741 [12:52:41<8:47:01, 20.44s/it] +2025-05-11 06:18:35 - ERROR - stderr - +2025-05-11 06:18:35 - ERROR - stderr - +2025-05-11 06:18:35 - INFO - stdout - {'loss': 0.7628, 'grad_norm': 0.718550980091095, 'learning_rate': 7.708511186052689e-06, 'epoch': 1.76} +2025-05-11 06:18:35 - ERROR - stderr - 59%|█████▊ | 2194/3741 [12:52:41<8:47:01, 20.44s/it] +2025-05-11 06:18:56 - ERROR - stderr - 59%|█████▊ | 2195/3741 [12:53:02<8:51:08, 20.61s/it] +2025-05-11 06:18:56 - ERROR - stderr - +2025-05-11 06:18:56 - ERROR - stderr - +2025-05-11 06:18:56 - INFO - stdout - {'loss': 0.7639, 'grad_norm': 0.7173058986663818, 'learning_rate': 7.700083161916e-06, 'epoch': 1.76} +2025-05-11 06:18:56 - ERROR - stderr - 59%|█████▊ | 2195/3741 [12:53:02<8:51:08, 20.61s/it] +2025-05-11 06:19:17 - ERROR - stderr - 59%|█████▊ | 2196/3741 [12:53:23<8:52:11, 20.67s/it] +2025-05-11 06:19:17 - ERROR - stderr - +2025-05-11 06:19:17 - ERROR - stderr - +2025-05-11 06:19:17 - INFO - stdout - {'loss': 0.705, 'grad_norm': 0.6749827861785889, 'learning_rate': 7.691656862335288e-06, 'epoch': 1.76} +2025-05-11 06:19:17 - ERROR - stderr - 59%|█████▊ | 2196/3741 [12:53:23<8:52:11, 20.67s/it] +2025-05-11 06:19:37 - ERROR - stderr - 59%|█████▊ | 2197/3741 [12:53:43<8:46:17, 20.45s/it] +2025-05-11 06:19:37 - ERROR - stderr - +2025-05-11 06:19:37 - ERROR - stderr - +2025-05-11 06:19:37 - INFO - stdout - {'loss': 0.7156, 'grad_norm': 0.6680623292922974, 'learning_rate': 7.683232293628873e-06, 'epoch': 1.76} +2025-05-11 06:19:37 - ERROR - stderr - 59%|█████▊ | 2197/3741 [12:53:43<8:46:17, 20.45s/it] +2025-05-11 06:20:01 - ERROR - stderr - 59%|█████▉ | 2198/3741 [12:54:08<9:17:05, 21.66s/it] +2025-05-11 06:20:01 - ERROR - stderr - +2025-05-11 06:20:01 - ERROR - stderr - +2025-05-11 06:20:01 - INFO - stdout - {'loss': 0.7494, 'grad_norm': 0.6810491681098938, 'learning_rate': 7.674809462113782e-06, 'epoch': 1.76} +2025-05-11 06:20:01 - ERROR - stderr - 59%|█████▉ | 2198/3741 [12:54:08<9:17:05, 21.66s/it] +2025-05-11 06:20:23 - ERROR - stderr - 59%|█████▉ | 2199/3741 [12:54:29<9:17:17, 21.68s/it] +2025-05-11 06:20:23 - ERROR - stderr - +2025-05-11 06:20:23 - ERROR - stderr - +2025-05-11 06:20:23 - INFO - stdout - {'loss': 0.7559, 'grad_norm': 0.6893939971923828, 'learning_rate': 7.666388374105747e-06, 'epoch': 1.76} +2025-05-11 06:20:23 - ERROR - stderr - 59%|█████▉ | 2199/3741 [12:54:29<9:17:17, 21.68s/it] +2025-05-11 06:20:48 - ERROR - stderr - 59%|█████▉ | 2200/3741 [12:54:54<9:38:06, 22.51s/it] +2025-05-11 06:20:48 - ERROR - stderr - +2025-05-11 06:20:48 - ERROR - stderr - +2025-05-11 06:20:48 - INFO - stdout - {'loss': 0.7398, 'grad_norm': 0.6892242431640625, 'learning_rate': 7.65796903591918e-06, 'epoch': 1.76} +2025-05-11 06:20:48 - ERROR - stderr - 59%|█████▉ | 2200/3741 [12:54:54<9:38:06, 22.51s/it] +2025-05-11 06:21:09 - ERROR - stderr - 59%|█████▉ | 2201/3741 [12:55:16<9:31:25, 22.26s/it] +2025-05-11 06:21:09 - ERROR - stderr - +2025-05-11 06:21:09 - ERROR - stderr - +2025-05-11 06:21:09 - INFO - stdout - {'loss': 0.7275, 'grad_norm': 0.6665722131729126, 'learning_rate': 7.649551453867192e-06, 'epoch': 1.77} +2025-05-11 06:21:09 - ERROR - stderr - 59%|█████▉ | 2201/3741 [12:55:16<9:31:25, 22.26s/it] +2025-05-11 06:21:33 - ERROR - stderr - 59%|█████▉ | 2202/3741 [12:55:40<9:45:00, 22.81s/it] +2025-05-11 06:21:33 - ERROR - stderr - +2025-05-11 06:21:33 - ERROR - stderr - +2025-05-11 06:21:33 - INFO - stdout - {'loss': 0.7379, 'grad_norm': 0.7008151412010193, 'learning_rate': 7.641135634261572e-06, 'epoch': 1.77} +2025-05-11 06:21:33 - ERROR - stderr - 59%|█████▉ | 2202/3741 [12:55:40<9:45:00, 22.81s/it] +2025-05-11 06:21:55 - ERROR - stderr - 59%|█████▉ | 2203/3741 [12:56:02<9:38:27, 22.57s/it] +2025-05-11 06:21:55 - ERROR - stderr - +2025-05-11 06:21:55 - ERROR - stderr - +2025-05-11 06:21:55 - INFO - stdout - {'loss': 0.77, 'grad_norm': 0.6838683485984802, 'learning_rate': 7.632721583412787e-06, 'epoch': 1.77} +2025-05-11 06:21:55 - ERROR - stderr - 59%|█████▉ | 2203/3741 [12:56:02<9:38:27, 22.57s/it] +2025-05-11 06:22:20 - ERROR - stderr - 59%|█████▉ | 2204/3741 [12:56:26<9:54:18, 23.20s/it] +2025-05-11 06:22:20 - ERROR - stderr - +2025-05-11 06:22:20 - ERROR - stderr - +2025-05-11 06:22:20 - INFO - stdout - {'loss': 0.7355, 'grad_norm': 0.691834032535553, 'learning_rate': 7.62430930762998e-06, 'epoch': 1.77} +2025-05-11 06:22:20 - ERROR - stderr - 59%|█████▉ | 2204/3741 [12:56:26<9:54:18, 23.20s/it] +2025-05-11 06:22:42 - ERROR - stderr - 59%|█████▉ | 2205/3741 [12:56:48<9:44:25, 22.83s/it] +2025-05-11 06:22:42 - ERROR - stderr - +2025-05-11 06:22:42 - ERROR - stderr - +2025-05-11 06:22:42 - INFO - stdout - {'loss': 0.7353, 'grad_norm': 0.6844693422317505, 'learning_rate': 7.615898813220958e-06, 'epoch': 1.77} +2025-05-11 06:22:42 - ERROR - stderr - 59%|█████▉ | 2205/3741 [12:56:48<9:44:25, 22.83s/it] +2025-05-11 06:23:04 - ERROR - stderr - 59%|█████▉ | 2206/3741 [12:57:10<9:38:54, 22.63s/it] +2025-05-11 06:23:04 - ERROR - stderr - +2025-05-11 06:23:04 - ERROR - stderr - +2025-05-11 06:23:04 - INFO - stdout - {'loss': 0.747, 'grad_norm': 0.7072806358337402, 'learning_rate': 7.607490106492205e-06, 'epoch': 1.77} +2025-05-11 06:23:04 - ERROR - stderr - 59%|█████▉ | 2206/3741 [12:57:10<9:38:54, 22.63s/it] +2025-05-11 06:23:26 - ERROR - stderr - 59%|█████▉ | 2207/3741 [12:57:33<9:35:24, 22.51s/it] +2025-05-11 06:23:26 - ERROR - stderr - +2025-05-11 06:23:26 - ERROR - stderr - +2025-05-11 06:23:26 - INFO - stdout - {'loss': 0.697, 'grad_norm': 0.6707396507263184, 'learning_rate': 7.5990831937488476e-06, 'epoch': 1.77} +2025-05-11 06:23:26 - ERROR - stderr - 59%|█████▉ | 2207/3741 [12:57:33<9:35:24, 22.51s/it] +2025-05-11 06:23:48 - ERROR - stderr - 59%|█████▉ | 2208/3741 [12:57:54<9:29:36, 22.29s/it] +2025-05-11 06:23:48 - ERROR - stderr - +2025-05-11 06:23:48 - ERROR - stderr - +2025-05-11 06:23:48 - INFO - stdout - {'loss': 0.7167, 'grad_norm': 0.7029158473014832, 'learning_rate': 7.590678081294673e-06, 'epoch': 1.77} +2025-05-11 06:23:48 - ERROR - stderr - 59%|█████▉ | 2208/3741 [12:57:54<9:29:36, 22.29s/it] +2025-05-11 06:24:10 - ERROR - stderr - 59%|█████▉ | 2209/3741 [12:58:16<9:25:52, 22.16s/it] +2025-05-11 06:24:10 - ERROR - stderr - +2025-05-11 06:24:10 - ERROR - stderr - +2025-05-11 06:24:10 - INFO - stdout - {'loss': 0.7507, 'grad_norm': 0.7110798954963684, 'learning_rate': 7.5822747754321315e-06, 'epoch': 1.77} +2025-05-11 06:24:10 - ERROR - stderr - 59%|█████▉ | 2209/3741 [12:58:16<9:25:52, 22.16s/it] +2025-05-11 06:24:32 - ERROR - stderr - 59%|█████▉ | 2210/3741 [12:58:38<9:24:00, 22.10s/it] +2025-05-11 06:24:32 - ERROR - stderr - +2025-05-11 06:24:32 - ERROR - stderr - +2025-05-11 06:24:32 - INFO - stdout - {'loss': 0.7402, 'grad_norm': 0.6975316405296326, 'learning_rate': 7.573873282462299e-06, 'epoch': 1.77} +2025-05-11 06:24:32 - ERROR - stderr - 59%|█████▉ | 2210/3741 [12:58:38<9:24:00, 22.10s/it] +2025-05-11 06:24:54 - ERROR - stderr - 59%|█████▉ | 2211/3741 [12:59:00<9:23:27, 22.10s/it] +2025-05-11 06:24:54 - ERROR - stderr - +2025-05-11 06:24:54 - ERROR - stderr - +2025-05-11 06:24:54 - INFO - stdout - {'loss': 0.714, 'grad_norm': 0.6738576889038086, 'learning_rate': 7.5654736086849056e-06, 'epoch': 1.77} +2025-05-11 06:24:54 - ERROR - stderr - 59%|█████▉ | 2211/3741 [12:59:00<9:23:27, 22.10s/it] +2025-05-11 06:25:16 - ERROR - stderr - 59%|█████▉ | 2212/3741 [12:59:22<9:19:53, 21.97s/it] +2025-05-11 06:25:16 - ERROR - stderr - +2025-05-11 06:25:16 - ERROR - stderr - +2025-05-11 06:25:16 - INFO - stdout - {'loss': 0.7079, 'grad_norm': 0.6818029284477234, 'learning_rate': 7.5570757603983115e-06, 'epoch': 1.77} +2025-05-11 06:25:16 - ERROR - stderr - 59%|█████▉ | 2212/3741 [12:59:22<9:19:53, 21.97s/it] +2025-05-11 06:25:38 - ERROR - stderr - 59%|█████▉ | 2213/3741 [12:59:45<9:24:21, 22.16s/it] +2025-05-11 06:25:38 - ERROR - stderr - +2025-05-11 06:25:38 - ERROR - stderr - +2025-05-11 06:25:38 - INFO - stdout - {'loss': 0.7548, 'grad_norm': 0.6604394316673279, 'learning_rate': 7.548679743899505e-06, 'epoch': 1.77} +2025-05-11 06:25:38 - ERROR - stderr - 59%|█████▉ | 2213/3741 [12:59:45<9:24:21, 22.16s/it] +2025-05-11 06:26:00 - ERROR - stderr - 59%|█████▉ | 2214/3741 [13:00:06<9:19:34, 21.99s/it] +2025-05-11 06:26:00 - ERROR - stderr - +2025-05-11 06:26:00 - ERROR - stderr - +2025-05-11 06:26:00 - INFO - stdout - {'loss': 0.7403, 'grad_norm': 0.6959803104400635, 'learning_rate': 7.540285565484114e-06, 'epoch': 1.78} +2025-05-11 06:26:00 - ERROR - stderr - 59%|█████▉ | 2214/3741 [13:00:06<9:19:34, 21.99s/it] +2025-05-11 06:26:22 - ERROR - stderr - 59%|█████▉ | 2215/3741 [13:00:28<9:17:58, 21.94s/it] +2025-05-11 06:26:22 - ERROR - stderr - +2025-05-11 06:26:22 - ERROR - stderr - +2025-05-11 06:26:22 - INFO - stdout - {'loss': 0.7615, 'grad_norm': 0.7064805626869202, 'learning_rate': 7.531893231446372e-06, 'epoch': 1.78} +2025-05-11 06:26:22 - ERROR - stderr - 59%|█████▉ | 2215/3741 [13:00:28<9:17:58, 21.94s/it] +2025-05-11 06:26:43 - ERROR - stderr - 59%|█████▉ | 2216/3741 [13:00:50<9:14:48, 21.83s/it] +2025-05-11 06:26:43 - ERROR - stderr - +2025-05-11 06:26:43 - ERROR - stderr - +2025-05-11 06:26:43 - INFO - stdout - {'loss': 0.7474, 'grad_norm': 0.6517053842544556, 'learning_rate': 7.523502748079141e-06, 'epoch': 1.78} +2025-05-11 06:26:43 - ERROR - stderr - 59%|█████▉ | 2216/3741 [13:00:50<9:14:48, 21.83s/it] +2025-05-11 06:27:05 - ERROR - stderr - 59%|█████▉ | 2217/3741 [13:01:12<9:15:40, 21.88s/it] +2025-05-11 06:27:05 - ERROR - stderr - +2025-05-11 06:27:05 - ERROR - stderr - +2025-05-11 06:27:05 - INFO - stdout - {'loss': 0.702, 'grad_norm': 0.674662172794342, 'learning_rate': 7.51511412167389e-06, 'epoch': 1.78} +2025-05-11 06:27:05 - ERROR - stderr - 59%|█████▉ | 2217/3741 [13:01:12<9:15:40, 21.88s/it] +2025-05-11 06:27:06 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5986 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 06:27:06 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5986 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 06:27:28 - ERROR - stderr - 59%|█████▉ | 2218/3741 [13:01:34<9:22:22, 22.16s/it] +2025-05-11 06:27:28 - ERROR - stderr - +2025-05-11 06:27:28 - ERROR - stderr - +2025-05-11 06:27:28 - INFO - stdout - {'loss': 0.7111, 'grad_norm': 0.7000203132629395, 'learning_rate': 7.506727358520693e-06, 'epoch': 1.78} +2025-05-11 06:27:28 - ERROR - stderr - 59%|█████▉ | 2218/3741 [13:01:34<9:22:22, 22.16s/it] +2025-05-11 06:27:53 - ERROR - stderr - 59%|█████▉ | 2219/3741 [13:01:59<9:41:48, 22.94s/it] +2025-05-11 06:27:53 - ERROR - stderr - +2025-05-11 06:27:53 - ERROR - stderr - +2025-05-11 06:27:53 - INFO - stdout - {'loss': 0.7337, 'grad_norm': 0.6659766435623169, 'learning_rate': 7.498342464908237e-06, 'epoch': 1.78} +2025-05-11 06:27:53 - ERROR - stderr - 59%|█████▉ | 2219/3741 [13:01:59<9:41:48, 22.94s/it] +2025-05-11 06:28:15 - ERROR - stderr - 59%|█████▉ | 2220/3741 [13:02:21<9:35:14, 22.69s/it] +2025-05-11 06:28:15 - ERROR - stderr - +2025-05-11 06:28:15 - ERROR - stderr - +2025-05-11 06:28:15 - INFO - stdout - {'loss': 0.7378, 'grad_norm': 0.6771808862686157, 'learning_rate': 7.489959447123797e-06, 'epoch': 1.78} +2025-05-11 06:28:15 - ERROR - stderr - 59%|█████▉ | 2220/3741 [13:02:21<9:35:14, 22.69s/it] +2025-05-11 06:28:38 - ERROR - stderr - 59%|█████▉ | 2221/3741 [13:02:44<9:35:33, 22.72s/it] +2025-05-11 06:28:38 - ERROR - stderr - +2025-05-11 06:28:38 - ERROR - stderr - +2025-05-11 06:28:38 - INFO - stdout - {'loss': 0.7649, 'grad_norm': 0.7038045525550842, 'learning_rate': 7.4815783114532485e-06, 'epoch': 1.78} +2025-05-11 06:28:38 - ERROR - stderr - 59%|█████▉ | 2221/3741 [13:02:44<9:35:33, 22.72s/it] +2025-05-11 06:28:59 - ERROR - stderr - 59%|█████▉ | 2222/3741 [13:03:06<9:26:13, 22.37s/it] +2025-05-11 06:28:59 - ERROR - stderr - +2025-05-11 06:28:59 - ERROR - stderr - +2025-05-11 06:28:59 - INFO - stdout - {'loss': 0.7119, 'grad_norm': 0.6613171100616455, 'learning_rate': 7.473199064181048e-06, 'epoch': 1.78} +2025-05-11 06:28:59 - ERROR - stderr - 59%|█████▉ | 2222/3741 [13:03:06<9:26:13, 22.37s/it] +2025-05-11 06:29:24 - ERROR - stderr - 59%|█████▉ | 2223/3741 [13:03:30<9:43:09, 23.05s/it] +2025-05-11 06:29:24 - ERROR - stderr - +2025-05-11 06:29:24 - ERROR - stderr - +2025-05-11 06:29:24 - INFO - stdout - {'loss': 0.7293, 'grad_norm': 0.6629149913787842, 'learning_rate': 7.464821711590242e-06, 'epoch': 1.78} +2025-05-11 06:29:24 - ERROR - stderr - 59%|█████▉ | 2223/3741 [13:03:30<9:43:09, 23.05s/it] +2025-05-11 06:29:46 - ERROR - stderr - 59%|█████▉ | 2224/3741 [13:03:52<9:31:43, 22.61s/it] +2025-05-11 06:29:46 - ERROR - stderr - +2025-05-11 06:29:46 - ERROR - stderr - +2025-05-11 06:29:46 - INFO - stdout - {'loss': 0.7311, 'grad_norm': 0.6879216432571411, 'learning_rate': 7.456446259962455e-06, 'epoch': 1.78} +2025-05-11 06:29:46 - ERROR - stderr - 59%|█████▉ | 2224/3741 [13:03:52<9:31:43, 22.61s/it] +2025-05-11 06:30:12 - ERROR - stderr - 59%|█████▉ | 2225/3741 [13:04:18<10:00:11, 23.75s/it] +2025-05-11 06:30:12 - ERROR - stderr - +2025-05-11 06:30:12 - ERROR - stderr - +2025-05-11 06:30:12 - INFO - stdout - {'loss': 0.7511, 'grad_norm': 0.6925482153892517, 'learning_rate': 7.448072715577885e-06, 'epoch': 1.78} +2025-05-11 06:30:12 - ERROR - stderr - 59%|█████▉ | 2225/3741 [13:04:18<10:00:11, 23.75s/it] +2025-05-11 06:30:43 - ERROR - stderr - 60%|█████▉ | 2226/3741 [13:04:50<10:58:16, 26.07s/it] +2025-05-11 06:30:43 - ERROR - stderr - +2025-05-11 06:30:43 - ERROR - stderr - +2025-05-11 06:30:43 - INFO - stdout - {'loss': 0.7436, 'grad_norm': 0.6826873421669006, 'learning_rate': 7.439701084715305e-06, 'epoch': 1.79} +2025-05-11 06:30:43 - ERROR - stderr - 60%|█████▉ | 2226/3741 [13:04:50<10:58:16, 26.07s/it] +2025-05-11 06:31:05 - ERROR - stderr - 60%|█████▉ | 2227/3741 [13:05:12<10:25:51, 24.80s/it] +2025-05-11 06:31:05 - ERROR - stderr - +2025-05-11 06:31:05 - ERROR - stderr - +2025-05-11 06:31:05 - INFO - stdout - {'loss': 0.7159, 'grad_norm': 0.7013863325119019, 'learning_rate': 7.431331373652046e-06, 'epoch': 1.79} +2025-05-11 06:31:05 - ERROR - stderr - 60%|█████▉ | 2227/3741 [13:05:12<10:25:51, 24.80s/it] +2025-05-11 06:31:27 - ERROR - stderr - 60%|█████▉ | 2228/3741 [13:05:33<10:01:51, 23.87s/it] +2025-05-11 06:31:27 - ERROR - stderr - +2025-05-11 06:31:27 - ERROR - stderr - +2025-05-11 06:31:27 - INFO - stdout - {'loss': 0.7404, 'grad_norm': 0.6822634935379028, 'learning_rate': 7.422963588663998e-06, 'epoch': 1.79} +2025-05-11 06:31:27 - ERROR - stderr - 60%|█████▉ | 2228/3741 [13:05:33<10:01:51, 23.87s/it] +2025-05-11 06:31:49 - ERROR - stderr - 60%|█████▉ | 2229/3741 [13:05:55<9:46:30, 23.27s/it] +2025-05-11 06:31:49 - ERROR - stderr - +2025-05-11 06:31:49 - ERROR - stderr - +2025-05-11 06:31:49 - INFO - stdout - {'loss': 0.755, 'grad_norm': 0.6994298100471497, 'learning_rate': 7.414597736025621e-06, 'epoch': 1.79} +2025-05-11 06:31:49 - ERROR - stderr - 60%|█████▉ | 2229/3741 [13:05:55<9:46:30, 23.27s/it] +2025-05-11 06:32:11 - ERROR - stderr - 60%|█████▉ | 2230/3741 [13:06:17<9:37:42, 22.94s/it] +2025-05-11 06:32:11 - ERROR - stderr - +2025-05-11 06:32:11 - ERROR - stderr - +2025-05-11 06:32:11 - INFO - stdout - {'loss': 0.7806, 'grad_norm': 0.7196714282035828, 'learning_rate': 7.406233822009904e-06, 'epoch': 1.79} +2025-05-11 06:32:11 - ERROR - stderr - 60%|█████▉ | 2230/3741 [13:06:17<9:37:42, 22.94s/it] +2025-05-11 06:32:33 - ERROR - stderr - 60%|█████▉ | 2231/3741 [13:06:39<9:29:41, 22.64s/it] +2025-05-11 06:32:33 - ERROR - stderr - +2025-05-11 06:32:33 - ERROR - stderr - +2025-05-11 06:32:33 - INFO - stdout - {'loss': 0.7119, 'grad_norm': 0.6684456467628479, 'learning_rate': 7.397871852888405e-06, 'epoch': 1.79} +2025-05-11 06:32:33 - ERROR - stderr - 60%|█████▉ | 2231/3741 [13:06:39<9:29:41, 22.64s/it] +2025-05-11 06:32:55 - ERROR - stderr - 60%|█████▉ | 2232/3741 [13:07:01<9:24:39, 22.45s/it] +2025-05-11 06:32:55 - ERROR - stderr - +2025-05-11 06:32:55 - ERROR - stderr - +2025-05-11 06:32:55 - INFO - stdout - {'loss': 0.7417, 'grad_norm': 0.6782661080360413, 'learning_rate': 7.389511834931211e-06, 'epoch': 1.79} +2025-05-11 06:32:55 - ERROR - stderr - 60%|█████▉ | 2232/3741 [13:07:01<9:24:39, 22.45s/it] +2025-05-11 06:33:17 - ERROR - stderr - 60%|█████▉ | 2233/3741 [13:07:23<9:20:58, 22.32s/it] +2025-05-11 06:33:17 - ERROR - stderr - +2025-05-11 06:33:17 - ERROR - stderr - +2025-05-11 06:33:17 - INFO - stdout - {'loss': 0.7621, 'grad_norm': 0.7280923128128052, 'learning_rate': 7.381153774406944e-06, 'epoch': 1.79} +2025-05-11 06:33:17 - ERROR - stderr - 60%|█████▉ | 2233/3741 [13:07:23<9:20:58, 22.32s/it] +2025-05-11 06:33:39 - ERROR - stderr - 60%|█████▉ | 2234/3741 [13:07:45<9:18:33, 22.24s/it] +2025-05-11 06:33:39 - ERROR - stderr - +2025-05-11 06:33:39 - ERROR - stderr - +2025-05-11 06:33:39 - INFO - stdout - {'loss': 0.7315, 'grad_norm': 0.6602609157562256, 'learning_rate': 7.372797677582767e-06, 'epoch': 1.79} +2025-05-11 06:33:39 - ERROR - stderr - 60%|█████▉ | 2234/3741 [13:07:45<9:18:33, 22.24s/it] +2025-05-11 06:34:01 - ERROR - stderr - 60%|█████▉ | 2235/3741 [13:08:07<9:17:33, 22.21s/it] +2025-05-11 06:34:01 - ERROR - stderr - +2025-05-11 06:34:01 - ERROR - stderr - +2025-05-11 06:34:01 - INFO - stdout - {'loss': 0.7265, 'grad_norm': 0.6975564956665039, 'learning_rate': 7.36444355072436e-06, 'epoch': 1.79} +2025-05-11 06:34:01 - ERROR - stderr - 60%|█████▉ | 2235/3741 [13:08:08<9:17:33, 22.21s/it] +2025-05-11 06:34:24 - ERROR - stderr - 60%|█████▉ | 2236/3741 [13:08:30<9:21:29, 22.39s/it] +2025-05-11 06:34:24 - ERROR - stderr - +2025-05-11 06:34:24 - ERROR - stderr - +2025-05-11 06:34:24 - INFO - stdout - {'loss': 0.7065, 'grad_norm': 0.6379725933074951, 'learning_rate': 7.356091400095942e-06, 'epoch': 1.79} +2025-05-11 06:34:24 - ERROR - stderr - 60%|█████▉ | 2236/3741 [13:08:30<9:21:29, 22.39s/it] +2025-05-11 06:34:46 - ERROR - stderr - 60%|█████▉ | 2237/3741 [13:08:52<9:17:03, 22.22s/it] +2025-05-11 06:34:46 - ERROR - stderr - +2025-05-11 06:34:46 - ERROR - stderr - +2025-05-11 06:34:46 - INFO - stdout - {'loss': 0.7275, 'grad_norm': 0.7008256316184998, 'learning_rate': 7.3477412319602306e-06, 'epoch': 1.79} +2025-05-11 06:34:46 - ERROR - stderr - 60%|█████▉ | 2237/3741 [13:08:52<9:17:03, 22.22s/it] +2025-05-11 06:35:08 - ERROR - stderr - 60%|█████▉ | 2238/3741 [13:09:14<9:17:10, 22.24s/it] +2025-05-11 06:35:08 - ERROR - stderr - +2025-05-11 06:35:08 - ERROR - stderr - +2025-05-11 06:35:08 - INFO - stdout - {'loss': 0.732, 'grad_norm': 0.6815301775932312, 'learning_rate': 7.339393052578465e-06, 'epoch': 1.79} +2025-05-11 06:35:08 - ERROR - stderr - 60%|█████▉ | 2238/3741 [13:09:14<9:17:10, 22.24s/it] +2025-05-11 06:35:30 - ERROR - stderr - 60%|█████▉ | 2239/3741 [13:09:36<9:15:13, 22.18s/it] +2025-05-11 06:35:30 - ERROR - stderr - +2025-05-11 06:35:30 - ERROR - stderr - +2025-05-11 06:35:30 - INFO - stdout - {'loss': 0.7292, 'grad_norm': 0.6972305178642273, 'learning_rate': 7.3310468682104055e-06, 'epoch': 1.8} +2025-05-11 06:35:30 - ERROR - stderr - 60%|█████▉ | 2239/3741 [13:09:36<9:15:13, 22.18s/it] +2025-05-11 06:35:52 - ERROR - stderr - 60%|█████▉ | 2240/3741 [13:09:58<9:13:55, 22.14s/it] +2025-05-11 06:35:52 - ERROR - stderr - +2025-05-11 06:35:52 - ERROR - stderr - +2025-05-11 06:35:52 - INFO - stdout - {'loss': 0.6968, 'grad_norm': 0.6848737597465515, 'learning_rate': 7.322702685114295e-06, 'epoch': 1.8} +2025-05-11 06:35:52 - ERROR - stderr - 60%|█████▉ | 2240/3741 [13:09:59<9:13:55, 22.14s/it] +2025-05-11 06:36:14 - ERROR - stderr - 60%|█████▉ | 2241/3741 [13:10:20<9:11:06, 22.04s/it] +2025-05-11 06:36:14 - ERROR - stderr - +2025-05-11 06:36:14 - ERROR - stderr - +2025-05-11 06:36:14 - INFO - stdout - {'loss': 0.7678, 'grad_norm': 0.7986035943031311, 'learning_rate': 7.3143605095468915e-06, 'epoch': 1.8} +2025-05-11 06:36:14 - ERROR - stderr - 60%|█████▉ | 2241/3741 [13:10:20<9:11:06, 22.04s/it] +2025-05-11 06:36:36 - ERROR - stderr - 60%|█████▉ | 2242/3741 [13:10:42<9:07:44, 21.92s/it] +2025-05-11 06:36:36 - ERROR - stderr - +2025-05-11 06:36:36 - ERROR - stderr - +2025-05-11 06:36:36 - INFO - stdout - {'loss': 0.6936, 'grad_norm': 0.6967061758041382, 'learning_rate': 7.30602034776344e-06, 'epoch': 1.8} +2025-05-11 06:36:36 - ERROR - stderr - 60%|█████▉ | 2242/3741 [13:10:42<9:07:44, 21.92s/it] +2025-05-11 06:36:58 - ERROR - stderr - 60%|█████▉ | 2243/3741 [13:11:04<9:07:07, 21.91s/it] +2025-05-11 06:36:58 - ERROR - stderr - +2025-05-11 06:36:58 - ERROR - stderr - +2025-05-11 06:36:58 - INFO - stdout - {'loss': 0.7218, 'grad_norm': 0.6833521723747253, 'learning_rate': 7.297682206017676e-06, 'epoch': 1.8} +2025-05-11 06:36:58 - ERROR - stderr - 60%|█████▉ | 2243/3741 [13:11:04<9:07:07, 21.91s/it] +2025-05-11 06:37:20 - ERROR - stderr - 60%|█████▉ | 2244/3741 [13:11:26<9:10:28, 22.06s/it] +2025-05-11 06:37:20 - ERROR - stderr - +2025-05-11 06:37:20 - ERROR - stderr - +2025-05-11 06:37:20 - INFO - stdout - {'loss': 0.7502, 'grad_norm': 0.6830206513404846, 'learning_rate': 7.289346090561828e-06, 'epoch': 1.8} +2025-05-11 06:37:20 - ERROR - stderr - 60%|█████▉ | 2244/3741 [13:11:26<9:10:28, 22.06s/it] +2025-05-11 06:37:43 - ERROR - stderr - 60%|██████ | 2245/3741 [13:11:49<9:14:54, 22.26s/it] +2025-05-11 06:37:43 - ERROR - stderr - +2025-05-11 06:37:43 - ERROR - stderr - +2025-05-11 06:37:43 - INFO - stdout - {'loss': 0.7189, 'grad_norm': 0.68632972240448, 'learning_rate': 7.281012007646595e-06, 'epoch': 1.8} +2025-05-11 06:37:43 - ERROR - stderr - 60%|██████ | 2245/3741 [13:11:49<9:14:54, 22.26s/it] +2025-05-11 06:38:05 - ERROR - stderr - 60%|██████ | 2246/3741 [13:12:11<9:14:11, 22.24s/it] +2025-05-11 06:38:05 - ERROR - stderr - +2025-05-11 06:38:05 - ERROR - stderr - +2025-05-11 06:38:05 - INFO - stdout - {'loss': 0.7313, 'grad_norm': 0.7078197002410889, 'learning_rate': 7.272679963521158e-06, 'epoch': 1.8} +2025-05-11 06:38:05 - ERROR - stderr - 60%|██████ | 2246/3741 [13:12:11<9:14:11, 22.24s/it] +2025-05-11 06:38:27 - ERROR - stderr - 60%|██████ | 2247/3741 [13:12:33<9:12:53, 22.20s/it] +2025-05-11 06:38:27 - ERROR - stderr - +2025-05-11 06:38:27 - ERROR - stderr - +2025-05-11 06:38:27 - INFO - stdout - {'loss': 0.7321, 'grad_norm': 0.6793120503425598, 'learning_rate': 7.264349964433168e-06, 'epoch': 1.8} +2025-05-11 06:38:27 - ERROR - stderr - 60%|██████ | 2247/3741 [13:12:33<9:12:53, 22.20s/it] +2025-05-11 06:38:49 - ERROR - stderr - 60%|██████ | 2248/3741 [13:12:55<9:08:30, 22.04s/it] +2025-05-11 06:38:49 - ERROR - stderr - +2025-05-11 06:38:49 - ERROR - stderr - +2025-05-11 06:38:49 - INFO - stdout - {'loss': 0.7568, 'grad_norm': 0.7071113586425781, 'learning_rate': 7.2560220166287355e-06, 'epoch': 1.8} +2025-05-11 06:38:49 - ERROR - stderr - 60%|██████ | 2248/3741 [13:12:55<9:08:30, 22.04s/it] +2025-05-11 06:39:11 - ERROR - stderr - 60%|██████ | 2249/3741 [13:13:17<9:10:53, 22.15s/it] +2025-05-11 06:39:11 - ERROR - stderr - +2025-05-11 06:39:11 - ERROR - stderr - +2025-05-11 06:39:11 - INFO - stdout - {'loss': 0.7318, 'grad_norm': 0.6845733523368835, 'learning_rate': 7.24769612635245e-06, 'epoch': 1.8} +2025-05-11 06:39:11 - ERROR - stderr - 60%|██████ | 2249/3741 [13:13:17<9:10:53, 22.15s/it] +2025-05-11 06:39:33 - ERROR - stderr - 60%|██████ | 2250/3741 [13:13:40<9:11:21, 22.19s/it] +2025-05-11 06:39:33 - ERROR - stderr - +2025-05-11 06:39:33 - ERROR - stderr - +2025-05-11 06:39:33 - INFO - stdout - {'loss': 0.7303, 'grad_norm': 0.6932980418205261, 'learning_rate': 7.239372299847338e-06, 'epoch': 1.8} +2025-05-11 06:39:33 - ERROR - stderr - 60%|██████ | 2250/3741 [13:13:40<9:11:21, 22.19s/it] +2025-05-11 06:39:55 - ERROR - stderr - 60%|██████ | 2251/3741 [13:14:02<9:10:27, 22.17s/it] +2025-05-11 06:39:55 - ERROR - stderr - +2025-05-11 06:39:55 - ERROR - stderr - +2025-05-11 06:39:55 - INFO - stdout - {'loss': 0.7239, 'grad_norm': 0.6790763139724731, 'learning_rate': 7.231050543354894e-06, 'epoch': 1.81} +2025-05-11 06:39:55 - ERROR - stderr - 60%|██████ | 2251/3741 [13:14:02<9:10:27, 22.17s/it] +2025-05-11 06:40:18 - ERROR - stderr - 60%|██████ | 2252/3741 [13:14:24<9:13:31, 22.30s/it] +2025-05-11 06:40:18 - ERROR - stderr - +2025-05-11 06:40:18 - ERROR - stderr - +2025-05-11 06:40:18 - INFO - stdout - {'loss': 0.7255, 'grad_norm': 0.7017188668251038, 'learning_rate': 7.2227308631150535e-06, 'epoch': 1.81} +2025-05-11 06:40:18 - ERROR - stderr - 60%|██████ | 2252/3741 [13:14:24<9:13:31, 22.30s/it] +2025-05-11 06:40:40 - ERROR - stderr - 60%|██████ | 2253/3741 [13:14:46<9:09:05, 22.14s/it] +2025-05-11 06:40:40 - ERROR - stderr - +2025-05-11 06:40:40 - ERROR - stderr - +2025-05-11 06:40:40 - INFO - stdout - {'loss': 0.7225, 'grad_norm': 0.6675518155097961, 'learning_rate': 7.214413265366194e-06, 'epoch': 1.81} +2025-05-11 06:40:40 - ERROR - stderr - 60%|██████ | 2253/3741 [13:14:46<9:09:05, 22.14s/it] +2025-05-11 06:41:02 - ERROR - stderr - 60%|████���█ | 2254/3741 [13:15:08<9:07:31, 22.09s/it] +2025-05-11 06:41:02 - ERROR - stderr - +2025-05-11 06:41:02 - ERROR - stderr - +2025-05-11 06:41:02 - INFO - stdout - {'loss': 0.7182, 'grad_norm': 0.6851517558097839, 'learning_rate': 7.206097756345135e-06, 'epoch': 1.81} +2025-05-11 06:41:02 - ERROR - stderr - 60%|██████ | 2254/3741 [13:15:08<9:07:31, 22.09s/it] +2025-05-11 06:41:24 - ERROR - stderr - 60%|██████ | 2255/3741 [13:15:30<9:05:02, 22.01s/it] +2025-05-11 06:41:24 - ERROR - stderr - +2025-05-11 06:41:24 - ERROR - stderr - +2025-05-11 06:41:24 - INFO - stdout - {'loss': 0.7717, 'grad_norm': 0.6712617874145508, 'learning_rate': 7.197784342287125e-06, 'epoch': 1.81} +2025-05-11 06:41:24 - ERROR - stderr - 60%|██████ | 2255/3741 [13:15:30<9:05:02, 22.01s/it] +2025-05-11 06:41:46 - ERROR - stderr - 60%|██████ | 2256/3741 [13:15:52<9:04:38, 22.01s/it] +2025-05-11 06:41:46 - ERROR - stderr - +2025-05-11 06:41:46 - ERROR - stderr - +2025-05-11 06:41:46 - INFO - stdout - {'loss': 0.7246, 'grad_norm': 0.6820451617240906, 'learning_rate': 7.189473029425852e-06, 'epoch': 1.81} +2025-05-11 06:41:46 - ERROR - stderr - 60%|██████ | 2256/3741 [13:15:52<9:04:38, 22.01s/it] +2025-05-11 06:42:08 - ERROR - stderr - 60%|██████ | 2257/3741 [13:16:14<9:06:48, 22.11s/it] +2025-05-11 06:42:08 - ERROR - stderr - +2025-05-11 06:42:08 - ERROR - stderr - +2025-05-11 06:42:08 - INFO - stdout - {'loss': 0.7586, 'grad_norm': 0.6897710561752319, 'learning_rate': 7.181163823993418e-06, 'epoch': 1.81} +2025-05-11 06:42:08 - ERROR - stderr - 60%|██████ | 2257/3741 [13:16:14<9:06:48, 22.11s/it] +2025-05-11 06:42:30 - ERROR - stderr - 60%|██████ | 2258/3741 [13:16:36<9:04:09, 22.02s/it] +2025-05-11 06:42:30 - ERROR - stderr - +2025-05-11 06:42:30 - ERROR - stderr - +2025-05-11 06:42:30 - INFO - stdout - {'loss': 0.71, 'grad_norm': 0.6737632751464844, 'learning_rate': 7.172856732220344e-06, 'epoch': 1.81} +2025-05-11 06:42:30 - ERROR - stderr - 60%|██████ | 2258/3741 [13:16:36<9:04:09, 22.02s/it] +2025-05-11 06:42:52 - ERROR - stderr - 60%|██████ | 2259/3741 [13:16:58<9:03:34, 22.01s/it] +2025-05-11 06:42:52 - ERROR - stderr - +2025-05-11 06:42:52 - ERROR - stderr - +2025-05-11 06:42:52 - INFO - stdout - {'loss': 0.7375, 'grad_norm': 0.6887868046760559, 'learning_rate': 7.164551760335579e-06, 'epoch': 1.81} +2025-05-11 06:42:52 - ERROR - stderr - 60%|██████ | 2259/3741 [13:16:58<9:03:34, 22.01s/it] +2025-05-11 06:43:13 - ERROR - stderr - 60%|██████ | 2260/3741 [13:17:20<9:01:16, 21.93s/it] +2025-05-11 06:43:14 - ERROR - stderr - +2025-05-11 06:43:14 - ERROR - stderr - +2025-05-11 06:43:14 - INFO - stdout - {'loss': 0.7322, 'grad_norm': 0.6990635395050049, 'learning_rate': 7.156248914566461e-06, 'epoch': 1.81} +2025-05-11 06:43:14 - ERROR - stderr - 60%|██████ | 2260/3741 [13:17:20<9:01:16, 21.93s/it] +2025-05-11 06:43:36 - ERROR - stderr - 60%|██████ | 2261/3741 [13:17:42<9:02:21, 21.99s/it] +2025-05-11 06:43:36 - ERROR - stderr - +2025-05-11 06:43:36 - ERROR - stderr - +2025-05-11 06:43:36 - INFO - stdout - {'loss': 0.7243, 'grad_norm': 0.6845853924751282, 'learning_rate': 7.147948201138761e-06, 'epoch': 1.81} +2025-05-11 06:43:36 - ERROR - stderr - 60%|██████ | 2261/3741 [13:17:42<9:02:21, 21.99s/it] +2025-05-11 06:43:58 - ERROR - stderr - 60%|██████ | 2262/3741 [13:18:04<9:02:57, 22.03s/it] +2025-05-11 06:43:58 - ERROR - stderr - +2025-05-11 06:43:58 - ERROR - stderr - +2025-05-11 06:43:58 - INFO - stdout - {'loss': 0.7328, 'grad_norm': 0.7157221436500549, 'learning_rate': 7.139649626276629e-06, 'epoch': 1.81} +2025-05-11 06:43:58 - ERROR - stderr - 60%|██████ | 2262/3741 [13:18:04<9:02:57, 22.03s/it] +2025-05-11 06:44:18 - ERROR - stderr - 60%|██████ | 2263/3741 [13:18:25<8:52:29, 21.62s/it] +2025-05-11 06:44:18 - ERROR - stderr - +2025-05-11 06:44:18 - ERROR - stderr - +2025-05-11 06:44:18 - INFO - stdout - {'loss': 0.737, 'grad_norm': 0.668306827545166, 'learning_rate': 7.131353196202617e-06, 'epoch': 1.81} +2025-05-11 06:44:18 - ERROR - stderr - 60%|██████ | 2263/3741 [13:18:25<8:52:29, 21.62s/it] +2025-05-11 06:44:39 - ERROR - stderr - 61%|██████ | 2264/3741 [13:18:45<8:41:18, 21.18s/it] +2025-05-11 06:44:39 - ERROR - stderr - +2025-05-11 06:44:39 - ERROR - stderr - +2025-05-11 06:44:39 - INFO - stdout - {'loss': 0.7033, 'grad_norm': 0.6798452138900757, 'learning_rate': 7.123058917137677e-06, 'epoch': 1.82} +2025-05-11 06:44:39 - ERROR - stderr - 61%|██████ | 2264/3741 [13:18:45<8:41:18, 21.18s/it] +2025-05-11 06:44:58 - ERROR - stderr - 61%|██████ | 2265/3741 [13:19:05<8:31:55, 20.81s/it] +2025-05-11 06:44:59 - ERROR - stderr - +2025-05-11 06:44:59 - ERROR - stderr - +2025-05-11 06:44:59 - INFO - stdout - {'loss': 0.6999, 'grad_norm': 0.7059512138366699, 'learning_rate': 7.114766795301138e-06, 'epoch': 1.82} +2025-05-11 06:44:59 - ERROR - stderr - 61%|██████ | 2265/3741 [13:19:05<8:31:55, 20.81s/it] +2025-05-11 06:45:18 - ERROR - stderr - 61%|██████ | 2266/3741 [13:19:25<8:23:21, 20.48s/it] +2025-05-11 06:45:18 - ERROR - stderr - +2025-05-11 06:45:18 - ERROR - stderr - +2025-05-11 06:45:18 - INFO - stdout - {'loss': 0.7199, 'grad_norm': 0.6953184604644775, 'learning_rate': 7.106476836910716e-06, 'epoch': 1.82} +2025-05-11 06:45:18 - ERROR - stderr - 61%|██████ | 2266/3741 [13:19:25<8:23:21, 20.48s/it] +2025-05-11 06:45:39 - ERROR - stderr - 61%|██████ | 2267/3741 [13:19:46<8:27:49, 20.67s/it] +2025-05-11 06:45:39 - ERROR - stderr - +2025-05-11 06:45:39 - ERROR - stderr - +2025-05-11 06:45:39 - INFO - stdout - {'loss': 0.7685, 'grad_norm': 0.7047235369682312, 'learning_rate': 7.098189048182504e-06, 'epoch': 1.82} +2025-05-11 06:45:39 - ERROR - stderr - 61%|██████ | 2267/3741 [13:19:46<8:27:49, 20.67s/it] +2025-05-11 06:45:59 - ERROR - stderr - 61%|██████ | 2268/3741 [13:20:05<8:19:53, 20.36s/it] +2025-05-11 06:45:59 - ERROR - stderr - +2025-05-11 06:45:59 - ERROR - stderr - +2025-05-11 06:45:59 - INFO - stdout - {'loss': 0.7466, 'grad_norm': 0.7124036550521851, 'learning_rate': 7.089903435330966e-06, 'epoch': 1.82} +2025-05-11 06:45:59 - ERROR - stderr - 61%|██████ | 2268/3741 [13:20:05<8:19:53, 20.36s/it] +2025-05-11 06:46:21 - ERROR - stderr - 61%|██████ | 2269/3741 [13:20:27<8:28:38, 20.73s/it] +2025-05-11 06:46:21 - ERROR - stderr - +2025-05-11 06:46:21 - ERROR - stderr - +2025-05-11 06:46:21 - INFO - stdout - {'loss': 0.7218, 'grad_norm': 0.6875273585319519, 'learning_rate': 7.081620004568943e-06, 'epoch': 1.82} +2025-05-11 06:46:21 - ERROR - stderr - 61%|██████ | 2269/3741 [13:20:27<8:28:38, 20.73s/it] +2025-05-11 06:46:40 - ERROR - stderr - 61%|██████ | 2270/3741 [13:20:47<8:21:56, 20.47s/it] +2025-05-11 06:46:40 - ERROR - stderr - +2025-05-11 06:46:40 - ERROR - stderr - +2025-05-11 06:46:40 - INFO - stdout - {'loss': 0.7362, 'grad_norm': 0.6810701489448547, 'learning_rate': 7.073338762107627e-06, 'epoch': 1.82} +2025-05-11 06:46:40 - ERROR - stderr - 61%|██████ | 2270/3741 [13:20:47<8:21:56, 20.47s/it] +2025-05-11 06:47:03 - ERROR - stderr - 61%|██████ | 2271/3741 [13:21:09<8:34:58, 21.02s/it] +2025-05-11 06:47:03 - ERROR - stderr - +2025-05-11 06:47:03 - ERROR - stderr - +2025-05-11 06:47:03 - INFO - stdout - {'loss': 0.7142, 'grad_norm': 0.6458592414855957, 'learning_rate': 7.065059714156579e-06, 'epoch': 1.82} +2025-05-11 06:47:03 - ERROR - stderr - 61%|██████ | 2271/3741 [13:21:09<8:34:58, 21.02s/it] +2025-05-11 06:47:23 - ERROR - stderr - 61%|██████ | 2272/3741 [13:21:29<8:27:47, 20.74s/it] +2025-05-11 06:47:23 - ERROR - stderr - +2025-05-11 06:47:23 - ERROR - stderr - +2025-05-11 06:47:23 - INFO - stdout - {'loss': 0.7441, 'grad_norm': 0.6925168037414551, 'learning_rate': 7.0567828669237125e-06, 'epoch': 1.82} +2025-05-11 06:47:23 - ERROR - stderr - 61%|██████ | 2272/3741 [13:21:29<8:27:47, 20.74s/it] +2025-05-11 06:47:46 - ERROR - stderr - 61%|██████ | 2273/3741 [13:21:53<8:47:09, 21.55s/it] +2025-05-11 06:47:46 - ERROR - stderr - +2025-05-11 06:47:46 - ERROR - stderr - +2025-05-11 06:47:46 - INFO - stdout - {'loss': 0.72, 'grad_norm': 0.7175741195678711, 'learning_rate': 7.048508226615282e-06, 'epoch': 1.82} +2025-05-11 06:47:46 - ERROR - stderr - 61%|██████ | 2273/3741 [13:21:53<8:47:09, 21.55s/it] +2025-05-11 06:48:06 - ERROR - stderr - 61%|██████ | 2274/3741 [13:22:13<8:37:03, 21.15s/it] +2025-05-11 06:48:06 - ERROR - stderr - +2025-05-11 06:48:06 - ERROR - stderr - +2025-05-11 06:48:06 - INFO - stdout - {'loss': 0.727, 'grad_norm': 0.6916970610618591, 'learning_rate': 7.040235799435904e-06, 'epoch': 1.82} +2025-05-11 06:48:06 - ERROR - stderr - 61%|██████ | 2274/3741 [13:22:13<8:37:03, 21.15s/it] +2025-05-11 06:48:30 - ERROR - stderr - 61%|██████ | 2275/3741 [13:22:36<8:53:33, 21.84s/it] +2025-05-11 06:48:30 - ERROR - stderr - +2025-05-11 06:48:30 - ERROR - stderr - +2025-05-11 06:48:30 - INFO - stdout - {'loss': 0.7692, 'grad_norm': 0.6771306395530701, 'learning_rate': 7.0319655915885185e-06, 'epoch': 1.82} +2025-05-11 06:48:30 - ERROR - stderr - 61%|██████ | 2275/3741 [13:22:36<8:53:33, 21.84s/it] +2025-05-11 06:48:50 - ERROR - stderr - 61%|██████ | 2276/3741 [13:22:56<8:39:21, 21.27s/it] +2025-05-11 06:48:50 - ERROR - stderr - +2025-05-11 06:48:50 - ERROR - stderr - +2025-05-11 06:48:50 - INFO - stdout - {'loss': 0.7244, 'grad_norm': 0.7066898941993713, 'learning_rate': 7.023697609274418e-06, 'epoch': 1.83} +2025-05-11 06:48:50 - ERROR - stderr - 61%|██████ | 2276/3741 [13:22:56<8:39:21, 21.27s/it] +2025-05-11 06:49:14 - ERROR - stderr - 61%|██████ | 2277/3741 [13:23:20<8:57:31, 22.03s/it] +2025-05-11 06:49:14 - ERROR - stderr - +2025-05-11 06:49:14 - ERROR - stderr - +2025-05-11 06:49:14 - INFO - stdout - {'loss': 0.7411, 'grad_norm': 0.6827045679092407, 'learning_rate': 7.015431858693209e-06, 'epoch': 1.83} +2025-05-11 06:49:14 - ERROR - stderr - 61%|██████ | 2277/3741 [13:23:20<8:57:31, 22.03s/it] +2025-05-11 06:49:34 - ERROR - stderr - 61%|██████ | 2278/3741 [13:23:40<8:45:03, 21.53s/it] +2025-05-11 06:49:34 - ERROR - stderr - +2025-05-11 06:49:34 - ERROR - stderr - +2025-05-11 06:49:34 - INFO - stdout - {'loss': 0.7481, 'grad_norm': 0.7030799388885498, 'learning_rate': 7.007168346042832e-06, 'epoch': 1.83} +2025-05-11 06:49:34 - ERROR - stderr - 61%|██████ | 2278/3741 [13:23:40<8:45:03, 21.53s/it] +2025-05-11 06:49:57 - ERROR - stderr - 61%|██████ | 2279/3741 [13:24:03<8:54:14, 21.93s/it] +2025-05-11 06:49:57 - ERROR - stderr - +2025-05-11 06:49:57 - ERROR - stderr - +2025-05-11 06:49:57 - INFO - stdout - {'loss': 0.7296, 'grad_norm': 0.6946167945861816, 'learning_rate': 6.998907077519561e-06, 'epoch': 1.83} +2025-05-11 06:49:57 - ERROR - stderr - 61%|██████ | 2279/3741 [13:24:03<8:54:14, 21.93s/it] +2025-05-11 06:50:17 - ERROR - stderr - 61%|██████ | 2280/3741 [13:24:23<8:41:27, 21.41s/it] +2025-05-11 06:50:17 - ERROR - stderr - +2025-05-11 06:50:17 - ERROR - stderr - +2025-05-11 06:50:17 - INFO - stdout - {'loss': 0.7295, 'grad_norm': 0.6715916991233826, 'learning_rate': 6.990648059317961e-06, 'epoch': 1.83} +2025-05-11 06:50:17 - ERROR - stderr - 61%|██████ | 2280/3741 [13:24:23<8:41:27, 21.41s/it] +2025-05-11 06:50:37 - ERROR - stderr - 61%|██████ | 2281/3741 [13:24:44<8:32:19, 21.05s/it] +2025-05-11 06:50:37 - ERROR - stderr - +2025-05-11 06:50:37 - ERROR - stderr - +2025-05-11 06:50:37 - INFO - stdout - {'loss': 0.7294, 'grad_norm': 0.6531383395195007, 'learning_rate': 6.982391297630939e-06, 'epoch': 1.83} +2025-05-11 06:50:37 - ERROR - stderr - 61%|██████ | 2281/3741 [13:24:44<8:32:19, 21.05s/it] +2025-05-11 06:50:57 - ERROR - stderr - 61%|██████ | 2282/3741 [13:25:03<8:22:24, 20.66s/it] +2025-05-11 06:50:57 - ERROR - stderr - +2025-05-11 06:50:57 - ERROR - stderr - +2025-05-11 06:50:57 - INFO - stdout - {'loss': 0.7247, 'grad_norm': 0.6955968141555786, 'learning_rate': 6.97413679864969e-06, 'epoch': 1.83} +2025-05-11 06:50:57 - ERROR - stderr - 61%|██████ | 2282/3741 [13:25:03<8:22:24, 20.66s/it] +2025-05-11 06:51:17 - ERROR - stderr - 61%|██████ | 2283/3741 [13:25:23<8:16:23, 20.43s/it] +2025-05-11 06:51:17 - ERROR - stderr - +2025-05-11 06:51:17 - ERROR - stderr - +2025-05-11 06:51:17 - INFO - stdout - {'loss': 0.6878, 'grad_norm': 0.7067756652832031, 'learning_rate': 6.965884568563717e-06, 'epoch': 1.83} +2025-05-11 06:51:17 - ERROR - stderr - 61%|██████ | 2283/3741 [13:25:23<8:16:23, 20.43s/it] +2025-05-11 06:51:37 - ERROR - stderr - 61%|██████ | 2284/3741 [13:25:43<8:12:27, 20.28s/it] +2025-05-11 06:51:37 - ERROR - stderr - +2025-05-11 06:51:37 - ERROR - stderr - +2025-05-11 06:51:37 - INFO - stdout - {'loss': 0.7231, 'grad_norm': 0.6867192983627319, 'learning_rate': 6.957634613560827e-06, 'epoch': 1.83} +2025-05-11 06:51:37 - ERROR - stderr - 61%|██████ | 2284/3741 [13:25:43<8:12:27, 20.28s/it] +2025-05-11 06:51:57 - ERROR - stderr - 61%|██████ | 2285/3741 [13:26:03<8:09:31, 20.17s/it] +2025-05-11 06:51:57 - ERROR - stderr - +2025-05-11 06:51:57 - ERROR - stderr - +2025-05-11 06:51:57 - INFO - stdout - {'loss': 0.7236, 'grad_norm': 0.6870403289794922, 'learning_rate': 6.94938693982711e-06, 'epoch': 1.83} +2025-05-11 06:51:57 - ERROR - stderr - 61%|██████ | 2285/3741 [13:26:03<8:09:31, 20.17s/it] +2025-05-11 06:52:17 - ERROR - stderr - 61%|██████ | 2286/3741 [13:26:24<8:11:05, 20.25s/it] +2025-05-11 06:52:17 - ERROR - stderr - +2025-05-11 06:52:17 - ERROR - stderr - +2025-05-11 06:52:17 - INFO - stdout - {'loss': 0.7548, 'grad_norm': 0.687545120716095, 'learning_rate': 6.941141553546963e-06, 'epoch': 1.83} +2025-05-11 06:52:17 - ERROR - stderr - 61%|██████ | 2286/3741 [13:26:24<8:11:05, 20.25s/it] +2025-05-11 06:52:38 - ERROR - stderr - 61%|██████ | 2287/3741 [13:26:44<8:12:25, 20.32s/it] +2025-05-11 06:52:38 - ERROR - stderr - +2025-05-11 06:52:38 - ERROR - stderr - +2025-05-11 06:52:38 - INFO - stdout - {'loss': 0.7243, 'grad_norm': 0.6433346271514893, 'learning_rate': 6.932898460903052e-06, 'epoch': 1.83} +2025-05-11 06:52:38 - ERROR - stderr - 61%|██████ | 2287/3741 [13:26:44<8:12:25, 20.32s/it] +2025-05-11 06:52:59 - ERROR - stderr - 61%|██████ | 2288/3741 [13:27:06<8:21:00, 20.69s/it] +2025-05-11 06:52:59 - ERROR - stderr - +2025-05-11 06:52:59 - ERROR - stderr - +2025-05-11 06:52:59 - INFO - stdout - {'loss': 0.7515, 'grad_norm': 0.7730824947357178, 'learning_rate': 6.924657668076326e-06, 'epoch': 1.83} +2025-05-11 06:52:59 - ERROR - stderr - 61%|██████ | 2288/3741 [13:27:06<8:21:00, 20.69s/it] +2025-05-11 06:53:19 - ERROR - stderr - 61%|██████ | 2289/3741 [13:27:26<8:16:31, 20.52s/it] +2025-05-11 06:53:19 - ERROR - stderr - +2025-05-11 06:53:19 - ERROR - stderr - +2025-05-11 06:53:19 - INFO - stdout - {'loss': 0.714, 'grad_norm': 0.6878124475479126, 'learning_rate': 6.9164191812460194e-06, 'epoch': 1.84} +2025-05-11 06:53:19 - ERROR - stderr - 61%|██████ | 2289/3741 [13:27:26<8:16:31, 20.52s/it] +2025-05-11 06:53:41 - ERROR - stderr - 61%|██████ | 2290/3741 [13:27:47<8:25:00, 20.88s/it] +2025-05-11 06:53:41 - ERROR - stderr - +2025-05-11 06:53:41 - ERROR - stderr - +2025-05-11 06:53:41 - INFO - stdout - {'loss': 0.701, 'grad_norm': 0.6526556015014648, 'learning_rate': 6.90818300658962e-06, 'epoch': 1.84} +2025-05-11 06:53:41 - ERROR - stderr - 61%|██████ | 2290/3741 [13:27:47<8:25:00, 20.88s/it] +2025-05-11 06:54:01 - ERROR - stderr - 61%|██████ | 2291/3741 [13:28:07<8:18:23, 20.62s/it] +2025-05-11 06:54:01 - ERROR - stderr - +2025-05-11 06:54:01 - ERROR - stderr - +2025-05-11 06:54:01 - INFO - stdout - {'loss': 0.714, 'grad_norm': 0.6894628405570984, 'learning_rate': 6.899949150282903e-06, 'epoch': 1.84} +2025-05-11 06:54:01 - ERROR - stderr - 61%|██████ | 2291/3741 [13:28:07<8:18:23, 20.62s/it] +2025-05-11 06:54:23 - ERROR - stderr - 61%|██████▏ | 2292/3741 [13:28:30<8:30:02, 21.12s/it] +2025-05-11 06:54:23 - ERROR - stderr - +2025-05-11 06:54:23 - ERROR - stderr - +2025-05-11 06:54:23 - INFO - stdout - {'loss': 0.7532, 'grad_norm': 0.7038993239402771, 'learning_rate': 6.8917176184998915e-06, 'epoch': 1.84} +2025-05-11 06:54:23 - ERROR - stderr - 61%|██████▏ | 2292/3741 [13:28:30<8:30:02, 21.12s/it] +2025-05-11 06:54:43 - ERROR - stderr - 61%|██████▏ | 2293/3741 [13:28:50<8:20:14, 20.73s/it] +2025-05-11 06:54:43 - ERROR - stderr - +2025-05-11 06:54:43 - ERROR - stderr - +2025-05-11 06:54:43 - INFO - stdout - {'loss': 0.7281, 'grad_norm': 0.6736302375793457, 'learning_rate': 6.883488417412858e-06, 'epoch': 1.84} +2025-05-11 06:54:43 - ERROR - stderr - 61%|██████▏ | 2293/3741 [13:28:50<8:20:14, 20.73s/it] +2025-05-11 06:55:06 - ERROR - stderr - 61%|██████▏ | 2294/3741 [13:29:12<8:31:43, 21.22s/it] +2025-05-11 06:55:06 - ERROR - stderr - +2025-05-11 06:55:06 - ERROR - stderr - +2025-05-11 06:55:06 - INFO - stdout - {'loss': 0.7431, 'grad_norm': 0.7072314023971558, 'learning_rate': 6.875261553192352e-06, 'epoch': 1.84} +2025-05-11 06:55:06 - ERROR - stderr - 61%|██████▏ | 2294/3741 [13:29:12<8:31:43, 21.22s/it] +2025-05-11 06:55:25 - ERROR - stderr - 61%|██████▏ | 2295/3741 [13:29:32<8:21:58, 20.83s/it] +2025-05-11 06:55:26 - ERROR - stderr - +2025-05-11 06:55:26 - ERROR - stderr - +2025-05-11 06:55:26 - INFO - stdout - {'loss': 0.6828, 'grad_norm': 0.6677948832511902, 'learning_rate': 6.8670370320071466e-06, 'epoch': 1.84} +2025-05-11 06:55:26 - ERROR - stderr - 61%|██████▏ | 2295/3741 [13:29:32<8:21:58, 20.83s/it] +2025-05-11 06:55:48 - ERROR - stderr - 61%|██████▏ | 2296/3741 [13:29:54<8:31:12, 21.23s/it] +2025-05-11 06:55:48 - ERROR - stderr - +2025-05-11 06:55:48 - ERROR - stderr - +2025-05-11 06:55:48 - INFO - stdout - {'loss': 0.7471, 'grad_norm': 0.6706652641296387, 'learning_rate': 6.858814860024275e-06, 'epoch': 1.84} +2025-05-11 06:55:48 - ERROR - stderr - 61%|██████▏ | 2296/3741 [13:29:54<8:31:12, 21.23s/it] +2025-05-11 06:56:07 - ERROR - stderr - 61%|██████▏ | 2297/3741 [13:30:14<8:20:43, 20.81s/it] +2025-05-11 06:56:07 - ERROR - stderr - +2025-05-11 06:56:07 - ERROR - stderr - +2025-05-11 06:56:07 - INFO - stdout - {'loss': 0.691, 'grad_norm': 0.6636921763420105, 'learning_rate': 6.850595043408997e-06, 'epoch': 1.84} +2025-05-11 06:56:07 - ERROR - stderr - 61%|██████▏ | 2297/3741 [13:30:14<8:20:43, 20.81s/it] +2025-05-11 06:56:31 - ERROR - stderr - 61%|██████▏ | 2298/3741 [13:30:37<8:40:34, 21.65s/it] +2025-05-11 06:56:31 - ERROR - stderr - +2025-05-11 06:56:31 - ERROR - stderr - +2025-05-11 06:56:31 - INFO - stdout - {'loss': 0.7275, 'grad_norm': 0.6821714639663696, 'learning_rate': 6.842377588324809e-06, 'epoch': 1.84} +2025-05-11 06:56:31 - ERROR - stderr - 61%|██████▏ | 2298/3741 [13:30:37<8:40:34, 21.65s/it] +2025-05-11 06:56:51 - ERROR - stderr - 61%|██████▏ | 2299/3741 [13:30:57<8:26:39, 21.08s/it] +2025-05-11 06:56:51 - ERROR - stderr - +2025-05-11 06:56:51 - ERROR - stderr - +2025-05-11 06:56:51 - INFO - stdout - {'loss': 0.7008, 'grad_norm': 0.6688547730445862, 'learning_rate': 6.834162500933445e-06, 'epoch': 1.84} +2025-05-11 06:56:51 - ERROR - stderr - 61%|██████▏ | 2299/3741 [13:30:57<8:26:39, 21.08s/it] +2025-05-11 06:57:14 - ERROR - stderr - 61%|██████▏ | 2300/3741 [13:31:20<8:41:00, 21.69s/it] +2025-05-11 06:57:14 - ERROR - stderr - +2025-05-11 06:57:14 - ERROR - stderr - +2025-05-11 06:57:14 - INFO - stdout - {'loss': 0.7175, 'grad_norm': 0.675174355506897, 'learning_rate': 6.825949787394853e-06, 'epoch': 1.84} +2025-05-11 06:57:14 - ERROR - stderr - 61%|██████▏ | 2300/3741 [13:31:20<8:41:00, 21.69s/it] +2025-05-11 06:57:34 - ERROR - stderr - 62%|██████▏ | 2301/3741 [13:31:40<8:27:18, 21.14s/it] +2025-05-11 06:57:34 - ERROR - stderr - +2025-05-11 06:57:34 - ERROR - stderr - +2025-05-11 06:57:34 - INFO - stdout - {'loss': 0.7273, 'grad_norm': 0.6541465520858765, 'learning_rate': 6.817739453867209e-06, 'epoch': 1.85} +2025-05-11 06:57:34 - ERROR - stderr - 62%|██████▏ | 2301/3741 [13:31:40<8:27:18, 21.14s/it] +2025-05-11 06:57:56 - ERROR - stderr - 62%|██████▏ | 2302/3741 [13:32:02<8:34:31, 21.45s/it] +2025-05-11 06:57:56 - ERROR - stderr - +2025-05-11 06:57:56 - ERROR - stderr - +2025-05-11 06:57:56 - INFO - stdout - {'loss': 0.7551, 'grad_norm': 0.6905247569084167, 'learning_rate': 6.809531506506898e-06, 'epoch': 1.85} +2025-05-11 06:57:56 - ERROR - stderr - 62%|██████▏ | 2302/3741 [13:32:02<8:34:31, 21.45s/it] +2025-05-11 06:58:16 - ERROR - stderr - 62%|██████▏ | 2303/3741 [13:32:22<8:23:47, 21.02s/it] +2025-05-11 06:58:16 - ERROR - stderr - +2025-05-11 06:58:16 - ERROR - stderr - +2025-05-11 06:58:16 - INFO - stdout - {'loss': 0.7546, 'grad_norm': 0.6996776461601257, 'learning_rate': 6.801325951468514e-06, 'epoch': 1.85} +2025-05-11 06:58:16 - ERROR - stderr - 62%|██████▏ | 2303/3741 [13:32:22<8:23:47, 21.02s/it] +2025-05-11 06:58:38 - ERROR - stderr - 62%|██████▏ | 2304/3741 [13:32:44<8:27:32, 21.19s/it] +2025-05-11 06:58:38 - ERROR - stderr - +2025-05-11 06:58:38 - ERROR - stderr - +2025-05-11 06:58:38 - INFO - stdout - {'loss': 0.7418, 'grad_norm': 0.6827793717384338, 'learning_rate': 6.7931227949048714e-06, 'epoch': 1.85} +2025-05-11 06:58:38 - ERROR - stderr - 62%|██████▏ | 2304/3741 [13:32:44<8:27:32, 21.19s/it] +2025-05-11 06:58:57 - ERROR - stderr - 62%|██████▏ | 2305/3741 [13:33:04<8:16:13, 20.73s/it] +2025-05-11 06:58:57 - ERROR - stderr - +2025-05-11 06:58:57 - ERROR - stderr - +2025-05-11 06:58:57 - INFO - stdout - {'loss': 0.7051, 'grad_norm': 0.6554014682769775, 'learning_rate': 6.784922042966968e-06, 'epoch': 1.85} +2025-05-11 06:58:57 - ERROR - stderr - 62%|██████▏ | 2305/3741 [13:33:04<8:16:13, 20.73s/it] +2025-05-11 06:59:18 - ERROR - stderr - 62%|██████▏ | 2306/3741 [13:33:24<8:13:10, 20.62s/it] +2025-05-11 06:59:18 - ERROR - stderr - +2025-05-11 06:59:18 - ERROR - stderr - +2025-05-11 06:59:18 - INFO - stdout - {'loss': 0.7335, 'grad_norm': 0.6803217530250549, 'learning_rate': 6.776723701804013e-06, 'epoch': 1.85} +2025-05-11 06:59:18 - ERROR - stderr - 62%|██████▏ | 2306/3741 [13:33:24<8:13:10, 20.62s/it] +2025-05-11 06:59:37 - ERROR - stderr - 62%|██████▏ | 2307/3741 [13:33:44<8:05:54, 20.33s/it] +2025-05-11 06:59:37 - ERROR - stderr - +2025-05-11 06:59:37 - ERROR - stderr - +2025-05-11 06:59:37 - INFO - stdout - {'loss': 0.7053, 'grad_norm': 0.6773452758789062, 'learning_rate': 6.768527777563396e-06, 'epoch': 1.85} +2025-05-11 06:59:37 - ERROR - stderr - 62%|██████▏ | 2307/3741 [13:33:44<8:05:54, 20.33s/it] +2025-05-11 06:59:58 - ERROR - stderr - 62%|██████▏ | 2308/3741 [13:34:04<8:06:27, 20.37s/it] +2025-05-11 06:59:58 - ERROR - stderr - +2025-05-11 06:59:58 - ERROR - stderr - +2025-05-11 06:59:58 - INFO - stdout - {'loss': 0.7471, 'grad_norm': 0.6819374561309814, 'learning_rate': 6.760334276390707e-06, 'epoch': 1.85} +2025-05-11 06:59:58 - ERROR - stderr - 62%|██████▏ | 2308/3741 [13:34:04<8:06:27, 20.37s/it] +2025-05-11 07:00:18 - ERROR - stderr - 62%|██████▏ | 2309/3741 [13:34:24<8:02:18, 20.21s/it] +2025-05-11 07:00:18 - ERROR - stderr - +2025-05-11 07:00:18 - ERROR - stderr - +2025-05-11 07:00:18 - INFO - stdout - {'loss': 0.7263, 'grad_norm': 0.6662135720252991, 'learning_rate': 6.752143204429709e-06, 'epoch': 1.85} +2025-05-11 07:00:18 - ERROR - stderr - 62%|██████▏ | 2309/3741 [13:34:24<8:02:18, 20.21s/it] +2025-05-11 07:00:38 - ERROR - stderr - 62%|██████▏ | 2310/3741 [13:34:44<8:01:42, 20.20s/it] +2025-05-11 07:00:38 - ERROR - stderr - +2025-05-11 07:00:38 - ERROR - stderr - +2025-05-11 07:00:38 - INFO - stdout - {'loss': 0.7027, 'grad_norm': 0.7113257646560669, 'learning_rate': 6.7439545678223404e-06, 'epoch': 1.85} +2025-05-11 07:00:38 - ERROR - stderr - 62%|██████▏ | 2310/3741 [13:34:44<8:01:42, 20.20s/it] +2025-05-11 07:00:58 - ERROR - stderr - 62%|██████▏ | 2311/3741 [13:35:04<8:00:34, 20.16s/it] +2025-05-11 07:00:58 - ERROR - stderr - +2025-05-11 07:00:58 - ERROR - stderr - +2025-05-11 07:00:58 - INFO - stdout - {'loss': 0.7514, 'grad_norm': 0.697996973991394, 'learning_rate': 6.735768372708731e-06, 'epoch': 1.85} +2025-05-11 07:00:58 - ERROR - stderr - 62%|██████▏ | 2311/3741 [13:35:04<8:00:34, 20.16s/it] +2025-05-11 07:01:18 - ERROR - stderr - 62%|██████▏ | 2312/3741 [13:35:24<7:58:20, 20.08s/it] +2025-05-11 07:01:18 - ERROR - stderr - +2025-05-11 07:01:18 - ERROR - stderr - +2025-05-11 07:01:18 - INFO - stdout - {'loss': 0.7454, 'grad_norm': 0.6723498702049255, 'learning_rate': 6.727584625227159e-06, 'epoch': 1.85} +2025-05-11 07:01:18 - ERROR - stderr - 62%|██████▏ | 2312/3741 [13:35:24<7:58:20, 20.08s/it] +2025-05-11 07:01:37 - ERROR - stderr - 62%|██████▏ | 2313/3741 [13:35:44<7:54:43, 19.95s/it] +2025-05-11 07:01:37 - ERROR - stderr - +2025-05-11 07:01:37 - ERROR - stderr - +2025-05-11 07:01:37 - INFO - stdout - {'loss': 0.7167, 'grad_norm': 0.6865621209144592, 'learning_rate': 6.719403331514085e-06, 'epoch': 1.85} +2025-05-11 07:01:37 - ERROR - stderr - 62%|██████▏ | 2313/3741 [13:35:44<7:54:43, 19.95s/it] +2025-05-11 07:01:57 - ERROR - stderr - 62%|██████▏ | 2314/3741 [13:36:03<7:51:37, 19.83s/it] +2025-05-11 07:01:57 - ERROR - stderr - +2025-05-11 07:01:57 - ERROR - stderr - +2025-05-11 07:01:57 - INFO - stdout - {'loss': 0.7182, 'grad_norm': 0.6741671562194824, 'learning_rate': 6.711224497704116e-06, 'epoch': 1.86} +2025-05-11 07:01:57 - ERROR - stderr - 62%|██████▏ | 2314/3741 [13:36:03<7:51:37, 19.83s/it] +2025-05-11 07:02:19 - ERROR - stderr - 62%|██████▏ | 2315/3741 [13:36:25<8:07:39, 20.52s/it] +2025-05-11 07:02:19 - ERROR - stderr - +2025-05-11 07:02:19 - ERROR - stderr - +2025-05-11 07:02:19 - INFO - stdout - {'loss': 0.7246, 'grad_norm': 0.6949282884597778, 'learning_rate': 6.703048129930019e-06, 'epoch': 1.86} +2025-05-11 07:02:19 - ERROR - stderr - 62%|██████▏ | 2315/3741 [13:36:25<8:07:39, 20.52s/it] +2025-05-11 07:02:39 - ERROR - stderr - 62%|██████▏ | 2316/3741 [13:36:45<8:00:27, 20.23s/it] +2025-05-11 07:02:39 - ERROR - stderr - +2025-05-11 07:02:39 - ERROR - stderr - +2025-05-11 07:02:39 - INFO - stdout - {'loss': 0.7259, 'grad_norm': 0.6716841459274292, 'learning_rate': 6.694874234322719e-06, 'epoch': 1.86} +2025-05-11 07:02:39 - ERROR - stderr - 62%|██████▏ | 2316/3741 [13:36:45<8:00:27, 20.23s/it] +2025-05-11 07:03:08 - ERROR - stderr - 62%|██████▏ | 2317/3741 [13:37:14<9:05:21, 22.98s/it] +2025-05-11 07:03:08 - ERROR - stderr - +2025-05-11 07:03:08 - ERROR - stderr - +2025-05-11 07:03:08 - INFO - stdout - {'loss': 0.7328, 'grad_norm': 0.7084620594978333, 'learning_rate': 6.686702817011277e-06, 'epoch': 1.86} +2025-05-11 07:03:08 - ERROR - stderr - 62%|██████▏ | 2317/3741 [13:37:14<9:05:21, 22.98s/it] +2025-05-11 07:03:29 - ERROR - stderr - 62%|██████▏ | 2318/3741 [13:37:35<8:50:14, 22.36s/it] +2025-05-11 07:03:29 - ERROR - stderr - +2025-05-11 07:03:29 - ERROR - stderr - +2025-05-11 07:03:29 - INFO - stdout - {'loss': 0.73, 'grad_norm': 0.6928534507751465, 'learning_rate': 6.678533884122904e-06, 'epoch': 1.86} +2025-05-11 07:03:29 - ERROR - stderr - 62%|██████▏ | 2318/3741 [13:37:35<8:50:14, 22.36s/it] +2025-05-11 07:03:51 - ERROR - stderr - 62%|██████▏ | 2319/3741 [13:37:57<8:45:13, 22.16s/it] +2025-05-11 07:03:51 - ERROR - stderr - +2025-05-11 07:03:51 - ERROR - stderr - +2025-05-11 07:03:51 - INFO - stdout - {'loss': 0.6775, 'grad_norm': 0.6859990358352661, 'learning_rate': 6.670367441782941e-06, 'epoch': 1.86} +2025-05-11 07:03:51 - ERROR - stderr - 62%|██████▏ | 2319/3741 [13:37:57<8:45:13, 22.16s/it] +2025-05-11 07:04:12 - ERROR - stderr - 62%|██████▏ | 2320/3741 [13:38:18<8:38:40, 21.90s/it] +2025-05-11 07:04:12 - ERROR - stderr - +2025-05-11 07:04:12 - ERROR - stderr - +2025-05-11 07:04:12 - INFO - stdout - {'loss': 0.7571, 'grad_norm': 0.7146435379981995, 'learning_rate': 6.66220349611486e-06, 'epoch': 1.86} +2025-05-11 07:04:12 - ERROR - stderr - 62%|██████▏ | 2320/3741 [13:38:18<8:38:40, 21.90s/it] +2025-05-11 07:04:33 - ERROR - stderr - 62%|██████▏ | 2321/3741 [13:38:39<8:32:56, 21.67s/it] +2025-05-11 07:04:33 - ERROR - stderr - +2025-05-11 07:04:33 - ERROR - stderr - +2025-05-11 07:04:33 - INFO - stdout - {'loss': 0.7387, 'grad_norm': 0.702168345451355, 'learning_rate': 6.654042053240275e-06, 'epoch': 1.86} +2025-05-11 07:04:33 - ERROR - stderr - 62%|██████▏ | 2321/3741 [13:38:39<8:32:56, 21.67s/it] +2025-05-11 07:04:53 - ERROR - stderr - 62%|██████▏ | 2322/3741 [13:39:00<8:22:36, 21.25s/it] +2025-05-11 07:04:53 - ERROR - stderr - +2025-05-11 07:04:53 - ERROR - stderr - +2025-05-11 07:04:53 - INFO - stdout - {'loss': 0.7394, 'grad_norm': 0.688408613204956, 'learning_rate': 6.645883119278906e-06, 'epoch': 1.86} +2025-05-11 07:04:53 - ERROR - stderr - 62%|██████▏ | 2322/3741 [13:39:00<8:22:36, 21.25s/it] +2025-05-11 07:05:14 - ERROR - stderr - 62%|██████▏ | 2323/3741 [13:39:20<8:18:47, 21.11s/it] +2025-05-11 07:05:14 - ERROR - stderr - +2025-05-11 07:05:14 - ERROR - stderr - +2025-05-11 07:05:14 - INFO - stdout - {'loss': 0.7156, 'grad_norm': 0.6995466351509094, 'learning_rate': 6.637726700348606e-06, 'epoch': 1.86} +2025-05-11 07:05:14 - ERROR - stderr - 62%|██████▏ | 2323/3741 [13:39:20<8:18:47, 21.11s/it] +2025-05-11 07:05:36 - ERROR - stderr - 62%|██████▏ | 2324/3741 [13:39:42<8:21:31, 21.24s/it] +2025-05-11 07:05:36 - ERROR - stderr - +2025-05-11 07:05:36 - ERROR - stderr - +2025-05-11 07:05:36 - INFO - stdout - {'loss': 0.715, 'grad_norm': 0.6815131306648254, 'learning_rate': 6.629572802565332e-06, 'epoch': 1.86} +2025-05-11 07:05:36 - ERROR - stderr - 62%|██████▏ | 2324/3741 [13:39:42<8:21:31, 21.24s/it] +2025-05-11 07:05:57 - ERROR - stderr - 62%|██████▏ | 2325/3741 [13:40:03<8:20:33, 21.21s/it] +2025-05-11 07:05:57 - ERROR - stderr - +2025-05-11 07:05:57 - ERROR - stderr - +2025-05-11 07:05:57 - INFO - stdout - {'loss': 0.7109, 'grad_norm': 0.6656938195228577, 'learning_rate': 6.6214214320431534e-06, 'epoch': 1.86} +2025-05-11 07:05:57 - ERROR - stderr - 62%|██████▏ | 2325/3741 [13:40:03<8:20:33, 21.21s/it] +2025-05-11 07:06:17 - ERROR - stderr - 62%|██████▏ | 2326/3741 [13:40:24<8:15:34, 21.01s/it] +2025-05-11 07:06:17 - ERROR - stderr - +2025-05-11 07:06:17 - ERROR - stderr - +2025-05-11 07:06:17 - INFO - stdout - {'loss': 0.7439, 'grad_norm': 0.6970621943473816, 'learning_rate': 6.613272594894248e-06, 'epoch': 1.87} +2025-05-11 07:06:17 - ERROR - stderr - 62%|██████▏ | 2326/3741 [13:40:24<8:15:34, 21.01s/it] +2025-05-11 07:06:40 - ERROR - stderr - 62%|██████▏ | 2327/3741 [13:40:46<8:23:55, 21.38s/it] +2025-05-11 07:06:40 - ERROR - stderr - +2025-05-11 07:06:40 - ERROR - stderr - +2025-05-11 07:06:40 - INFO - stdout - {'loss': 0.7338, 'grad_norm': 0.6916574835777283, 'learning_rate': 6.605126297228886e-06, 'epoch': 1.87} +2025-05-11 07:06:40 - ERROR - stderr - 62%|██████▏ | 2327/3741 [13:40:46<8:23:55, 21.38s/it] +2025-05-11 07:07:00 - ERROR - stderr - 62%|██████▏ | 2328/3741 [13:41:06<8:17:19, 21.12s/it] +2025-05-11 07:07:00 - ERROR - stderr - +2025-05-11 07:07:00 - ERROR - stderr - +2025-05-11 07:07:00 - INFO - stdout - {'loss': 0.7179, 'grad_norm': 0.66231769323349, 'learning_rate': 6.596982545155447e-06, 'epoch': 1.87} +2025-05-11 07:07:00 - ERROR - stderr - 62%|██████▏ | 2328/3741 [13:41:06<8:17:19, 21.12s/it] +2025-05-11 07:07:22 - ERROR - stderr - 62%|██████▏ | 2329/3741 [13:41:29<8:26:08, 21.51s/it] +2025-05-11 07:07:22 - ERROR - stderr - +2025-05-11 07:07:22 - ERROR - stderr - +2025-05-11 07:07:22 - INFO - stdout - {'loss': 0.7485, 'grad_norm': 0.7257800102233887, 'learning_rate': 6.5888413447803905e-06, 'epoch': 1.87} +2025-05-11 07:07:22 - ERROR - stderr - 62%|██████▏ | 2329/3741 [13:41:29<8:26:08, 21.51s/it] +2025-05-11 07:07:43 - ERROR - stderr - 62%|██████▏ | 2330/3741 [13:41:50<8:20:47, 21.30s/it] +2025-05-11 07:07:43 - ERROR - stderr - +2025-05-11 07:07:43 - ERROR - stderr - +2025-05-11 07:07:43 - INFO - stdout - {'loss': 0.7652, 'grad_norm': 0.6953336000442505, 'learning_rate': 6.580702702208261e-06, 'epoch': 1.87} +2025-05-11 07:07:43 - ERROR - stderr - 62%|██████▏ | 2330/3741 [13:41:50<8:20:47, 21.30s/it] +2025-05-11 07:08:06 - ERROR - stderr - 62%|██████▏ | 2331/3741 [13:42:12<8:31:35, 21.77s/it] +2025-05-11 07:08:06 - ERROR - stderr - +2025-05-11 07:08:06 - ERROR - stderr - +2025-05-11 07:08:06 - INFO - stdout - {'loss': 0.7024, 'grad_norm': 0.6823389530181885, 'learning_rate': 6.572566623541697e-06, 'epoch': 1.87} +2025-05-11 07:08:06 - ERROR - stderr - 62%|██████▏ | 2331/3741 [13:42:12<8:31:35, 21.77s/it] +2025-05-11 07:08:27 - ERROR - stderr - 62%|██████▏ | 2332/3741 [13:42:33<8:22:03, 21.38s/it] +2025-05-11 07:08:27 - ERROR - stderr - +2025-05-11 07:08:27 - ERROR - stderr - +2025-05-11 07:08:27 - INFO - stdout - {'loss': 0.7128, 'grad_norm': 0.6590924859046936, 'learning_rate': 6.5644331148814e-06, 'epoch': 1.87} +2025-05-11 07:08:27 - ERROR - stderr - 62%|██████▏ | 2332/3741 [13:42:33<8:22:03, 21.38s/it] +2025-05-11 07:08:50 - ERROR - stderr - 62%|██████▏ | 2333/3741 [13:42:56<8:34:33, 21.93s/it] +2025-05-11 07:08:50 - ERROR - stderr - +2025-05-11 07:08:50 - ERROR - stderr - +2025-05-11 07:08:50 - INFO - stdout - {'loss': 0.7213, 'grad_norm': 0.6826373934745789, 'learning_rate': 6.55630218232616e-06, 'epoch': 1.87} +2025-05-11 07:08:50 - ERROR - stderr - 62%|██████▏ | 2333/3741 [13:42:56<8:34:33, 21.93s/it] +2025-05-11 07:09:10 - ERROR - stderr - 62%|██████▏ | 2334/3741 [13:43:16<8:22:58, 21.45s/it] +2025-05-11 07:09:10 - ERROR - stderr - +2025-05-11 07:09:10 - ERROR - stderr - +2025-05-11 07:09:10 - INFO - stdout - {'loss': 0.6984, 'grad_norm': 0.6541956663131714, 'learning_rate': 6.548173831972824e-06, 'epoch': 1.87} +2025-05-11 07:09:10 - ERROR - stderr - 62%|██████▏ | 2334/3741 [13:43:16<8:22:58, 21.45s/it] +2025-05-11 07:09:33 - ERROR - stderr - 62%|██████▏ | 2335/3741 [13:43:40<8:34:20, 21.95s/it] +2025-05-11 07:09:33 - ERROR - stderr - +2025-05-11 07:09:33 - ERROR - stderr - +2025-05-11 07:09:33 - INFO - stdout - {'loss': 0.7364, 'grad_norm': 0.6662783622741699, 'learning_rate': 6.540048069916301e-06, 'epoch': 1.87} +2025-05-11 07:09:33 - ERROR - stderr - 62%|██████▏ | 2335/3741 [13:43:40<8:34:20, 21.95s/it] +2025-05-11 07:09:55 - ERROR - stderr - 62%|██████▏ | 2336/3741 [13:44:01<8:32:22, 21.88s/it] +2025-05-11 07:09:55 - ERROR - stderr - +2025-05-11 07:09:55 - ERROR - stderr - +2025-05-11 07:09:55 - INFO - stdout - {'loss': 0.7371, 'grad_norm': 0.6726429462432861, 'learning_rate': 6.5319249022495715e-06, 'epoch': 1.87} +2025-05-11 07:09:55 - ERROR - stderr - 62%|██████▏ | 2336/3741 [13:44:01<8:32:22, 21.88s/it] +2025-05-11 07:10:18 - ERROR - stderr - 62%|██████▏ | 2337/3741 [13:44:24<8:39:28, 22.20s/it] +2025-05-11 07:10:18 - ERROR - stderr - +2025-05-11 07:10:18 - ERROR - stderr - +2025-05-11 07:10:18 - INFO - stdout - {'loss': 0.7046, 'grad_norm': 0.6762657165527344, 'learning_rate': 6.523804335063655e-06, 'epoch': 1.87} +2025-05-11 07:10:18 - ERROR - stderr - 62%|██████▏ | 2337/3741 [13:44:24<8:39:28, 22.20s/it] +2025-05-11 07:10:43 - ERROR - stderr - 62%|██████▏ | 2338/3741 [13:44:49<8:56:48, 22.96s/it] +2025-05-11 07:10:43 - ERROR - stderr - +2025-05-11 07:10:43 - ERROR - stderr - +2025-05-11 07:10:43 - INFO - stdout - {'loss': 0.7201, 'grad_norm': 0.7136995196342468, 'learning_rate': 6.515686374447641e-06, 'epoch': 1.87} +2025-05-11 07:10:43 - ERROR - stderr - 62%|██████▏ | 2338/3741 [13:44:49<8:56:48, 22.96s/it] +2025-05-11 07:11:06 - ERROR - stderr - 63%|██████▎ | 2339/3741 [13:45:12<8:57:05, 22.99s/it] +2025-05-11 07:11:06 - ERROR - stderr - +2025-05-11 07:11:06 - ERROR - stderr - +2025-05-11 07:11:06 - INFO - stdout - {'loss': 0.7674, 'grad_norm': 0.7213540077209473, 'learning_rate': 6.507571026488644e-06, 'epoch': 1.88} +2025-05-11 07:11:06 - ERROR - stderr - 63%|██████▎ | 2339/3741 [13:45:12<8:57:05, 22.99s/it] +2025-05-11 07:11:29 - ERROR - stderr - 63%|██████▎ | 2340/3741 [13:45:35<8:55:50, 22.95s/it] +2025-05-11 07:11:29 - ERROR - stderr - +2025-05-11 07:11:29 - ERROR - stderr - +2025-05-11 07:11:29 - INFO - stdout - {'loss': 0.7653, 'grad_norm': 0.7228121161460876, 'learning_rate': 6.499458297271826e-06, 'epoch': 1.88} +2025-05-11 07:11:29 - ERROR - stderr - 63%|██████▎ | 2340/3741 [13:45:35<8:55:50, 22.95s/it] +2025-05-11 07:11:49 - ERROR - stderr - 63%|██████▎ | 2341/3741 [13:45:55<8:36:55, 22.15s/it] +2025-05-11 07:11:49 - ERROR - stderr - +2025-05-11 07:11:49 - ERROR - stderr - +2025-05-11 07:11:49 - INFO - stdout - {'loss': 0.7133, 'grad_norm': 0.6905105113983154, 'learning_rate': 6.491348192880395e-06, 'epoch': 1.88} +2025-05-11 07:11:49 - ERROR - stderr - 63%|██████▎ | 2341/3741 [13:45:55<8:36:55, 22.15s/it] +2025-05-11 07:12:11 - ERROR - stderr - 63%|██████▎ | 2342/3741 [13:46:17<8:36:54, 22.17s/it] +2025-05-11 07:12:11 - ERROR - stderr - +2025-05-11 07:12:11 - ERROR - stderr - +2025-05-11 07:12:11 - INFO - stdout - {'loss': 0.7293, 'grad_norm': 0.6937234997749329, 'learning_rate': 6.48324071939558e-06, 'epoch': 1.88} +2025-05-11 07:12:11 - ERROR - stderr - 63%|██████▎ | 2342/3741 [13:46:17<8:36:54, 22.17s/it] +2025-05-11 07:12:35 - ERROR - stderr - 63%|██████▎ | 2343/3741 [13:46:42<8:52:10, 22.84s/it] +2025-05-11 07:12:35 - ERROR - stderr - +2025-05-11 07:12:35 - ERROR - stderr - +2025-05-11 07:12:35 - INFO - stdout - {'loss': 0.7285, 'grad_norm': 0.7513543963432312, 'learning_rate': 6.4751358828966415e-06, 'epoch': 1.88} +2025-05-11 07:12:35 - ERROR - stderr - 63%|██████▎ | 2343/3741 [13:46:42<8:52:10, 22.84s/it] +2025-05-11 07:12:56 - ERROR - stderr - 63%|██████▎ | 2344/3741 [13:47:02<8:32:42, 22.02s/it] +2025-05-11 07:12:56 - ERROR - stderr - +2025-05-11 07:12:56 - ERROR - stderr - +2025-05-11 07:12:56 - INFO - stdout - {'loss': 0.7007, 'grad_norm': 0.6793731451034546, 'learning_rate': 6.467033689460863e-06, 'epoch': 1.88} +2025-05-11 07:12:56 - ERROR - stderr - 63%|██████▎ | 2344/3741 [13:47:02<8:32:42, 22.02s/it] +2025-05-11 07:13:19 - ERROR - stderr - 63%|██████▎ | 2345/3741 [13:47:25<8:42:20, 22.45s/it] +2025-05-11 07:13:19 - ERROR - stderr - +2025-05-11 07:13:19 - ERROR - stderr - +2025-05-11 07:13:19 - INFO - stdout - {'loss': 0.7152, 'grad_norm': 0.6910126805305481, 'learning_rate': 6.458934145163539e-06, 'epoch': 1.88} +2025-05-11 07:13:19 - ERROR - stderr - 63%|██████▎ | 2345/3741 [13:47:25<8:42:20, 22.45s/it] +2025-05-11 07:13:40 - ERROR - stderr - 63%|██████▎ | 2346/3741 [13:47:46<8:32:28, 22.04s/it] +2025-05-11 07:13:40 - ERROR - stderr - +2025-05-11 07:13:40 - ERROR - stderr - +2025-05-11 07:13:40 - INFO - stdout - {'loss': 0.7716, 'grad_norm': 0.7317004203796387, 'learning_rate': 6.450837256077993e-06, 'epoch': 1.88} +2025-05-11 07:13:40 - ERROR - stderr - 63%|██████▎ | 2346/3741 [13:47:46<8:32:28, 22.04s/it] +2025-05-11 07:14:02 - ERROR - stderr - 63%|██████▎ | 2347/3741 [13:48:09<8:32:36, 22.06s/it] +2025-05-11 07:14:02 - ERROR - stderr - +2025-05-11 07:14:02 - ERROR - stderr - +2025-05-11 07:14:02 - INFO - stdout - {'loss': 0.7189, 'grad_norm': 0.6594632863998413, 'learning_rate': 6.44274302827554e-06, 'epoch': 1.88} +2025-05-11 07:14:02 - ERROR - stderr - 63%|██████▎ | 2347/3741 [13:48:09<8:32:36, 22.06s/it] +2025-05-11 07:14:22 - ERROR - stderr - 63%|██████▎ | 2348/3741 [13:48:29<8:17:43, 21.44s/it] +2025-05-11 07:14:22 - ERROR - stderr - +2025-05-11 07:14:22 - ERROR - stderr - +2025-05-11 07:14:22 - INFO - stdout - {'loss': 0.714, 'grad_norm': 0.654815673828125, 'learning_rate': 6.434651467825515e-06, 'epoch': 1.88} +2025-05-11 07:14:22 - ERROR - stderr - 63%|██████▎ | 2348/3741 [13:48:29<8:17:43, 21.44s/it] +2025-05-11 07:14:45 - ERROR - stderr - 63%|██████▎ | 2349/3741 [13:48:51<8:27:58, 21.90s/it] +2025-05-11 07:14:45 - ERROR - stderr - +2025-05-11 07:14:45 - ERROR - stderr - +2025-05-11 07:14:45 - INFO - stdout - {'loss': 0.7311, 'grad_norm': 0.7162003517150879, 'learning_rate': 6.426562580795242e-06, 'epoch': 1.88} +2025-05-11 07:14:45 - ERROR - stderr - 63%|██████▎ | 2349/3741 [13:48:52<8:27:58, 21.90s/it] +2025-05-11 07:15:05 - ERROR - stderr - 63%|██████▎ | 2350/3741 [13:49:11<8:12:49, 21.26s/it] +2025-05-11 07:15:05 - ERROR - stderr - +2025-05-11 07:15:05 - ERROR - stderr - +2025-05-11 07:15:05 - INFO - stdout - {'loss': 0.7173, 'grad_norm': 0.6873356103897095, 'learning_rate': 6.4184763732500376e-06, 'epoch': 1.88} +2025-05-11 07:15:05 - ERROR - stderr - 63%|██████▎ | 2350/3741 [13:49:11<8:12:49, 21.26s/it] +2025-05-11 07:15:28 - ERROR - stderr - 63%|██████▎ | 2351/3741 [13:49:34<8:23:44, 21.74s/it] +2025-05-11 07:15:28 - ERROR - stderr - +2025-05-11 07:15:28 - ERROR - stderr - +2025-05-11 07:15:28 - INFO - stdout - {'loss': 0.7156, 'grad_norm': 0.6748940944671631, 'learning_rate': 6.410392851253229e-06, 'epoch': 1.89} +2025-05-11 07:15:28 - ERROR - stderr - 63%|██████▎ | 2351/3741 [13:49:34<8:23:44, 21.74s/it] +2025-05-11 07:15:48 - ERROR - stderr - 63%|██████▎ | 2352/3741 [13:49:55<8:13:56, 21.34s/it] +2025-05-11 07:15:48 - ERROR - stderr - +2025-05-11 07:15:48 - ERROR - stderr - +2025-05-11 07:15:48 - INFO - stdout - {'loss': 0.7354, 'grad_norm': 0.6667020916938782, 'learning_rate': 6.402312020866102e-06, 'epoch': 1.89} +2025-05-11 07:15:48 - ERROR - stderr - 63%|██████▎ | 2352/3741 [13:49:55<8:13:56, 21.34s/it] +2025-05-11 07:16:11 - ERROR - stderr - 63%|██████▎ | 2353/3741 [13:50:18<8:25:13, 21.84s/it] +2025-05-11 07:16:11 - ERROR - stderr - +2025-05-11 07:16:11 - ERROR - stderr - +2025-05-11 07:16:11 - INFO - stdout - {'loss': 0.7357, 'grad_norm': 0.666428804397583, 'learning_rate': 6.39423388814795e-06, 'epoch': 1.89} +2025-05-11 07:16:11 - ERROR - stderr - 63%|██████▎ | 2353/3741 [13:50:18<8:25:13, 21.84s/it] +2025-05-11 07:16:31 - ERROR - stderr - 63%|██████▎ | 2354/3741 [13:50:37<8:10:36, 21.22s/it] +2025-05-11 07:16:31 - ERROR - stderr - +2025-05-11 07:16:31 - ERROR - stderr - +2025-05-11 07:16:31 - INFO - stdout - {'loss': 0.718, 'grad_norm': 0.6650567054748535, 'learning_rate': 6.386158459156029e-06, 'epoch': 1.89} +2025-05-11 07:16:31 - ERROR - stderr - 63%|██████▎ | 2354/3741 [13:50:37<8:10:36, 21.22s/it] +2025-05-11 07:16:51 - ERROR - stderr - 63%|██████▎ | 2355/3741 [13:50:57<8:02:20, 20.88s/it] +2025-05-11 07:16:51 - ERROR - stderr - +2025-05-11 07:16:51 - ERROR - stderr - +2025-05-11 07:16:51 - INFO - stdout - {'loss': 0.7532, 'grad_norm': 0.7268814444541931, 'learning_rate': 6.378085739945566e-06, 'epoch': 1.89} +2025-05-11 07:16:51 - ERROR - stderr - 63%|██████▎ | 2355/3741 [13:50:57<8:02:20, 20.88s/it] +2025-05-11 07:17:11 - ERROR - stderr - 63%|██████▎ | 2356/3741 [13:51:17<7:53:12, 20.50s/it] +2025-05-11 07:17:11 - ERROR - stderr - +2025-05-11 07:17:11 - ERROR - stderr - +2025-05-11 07:17:11 - INFO - stdout - {'loss': 0.7387, 'grad_norm': 0.696033239364624, 'learning_rate': 6.3700157365697655e-06, 'epoch': 1.89} +2025-05-11 07:17:11 - ERROR - stderr - 63%|██████▎ | 2356/3741 [13:51:17<7:53:12, 20.50s/it] +2025-05-11 07:17:30 - ERROR - stderr - 63%|██████▎ | 2357/3741 [13:51:37<7:47:57, 20.29s/it] +2025-05-11 07:17:31 - ERROR - stderr - +2025-05-11 07:17:31 - ERROR - stderr - +2025-05-11 07:17:31 - INFO - stdout - {'loss': 0.7662, 'grad_norm': 0.7350199818611145, 'learning_rate': 6.361948455079785e-06, 'epoch': 1.89} +2025-05-11 07:17:31 - ERROR - stderr - 63%|██████▎ | 2357/3741 [13:51:37<7:47:57, 20.29s/it] +2025-05-11 07:17:51 - ERROR - stderr - 63%|██████▎ | 2358/3741 [13:51:57<7:45:47, 20.21s/it] +2025-05-11 07:17:51 - ERROR - stderr - +2025-05-11 07:17:51 - ERROR - stderr - +2025-05-11 07:17:51 - INFO - stdout - {'loss': 0.7182, 'grad_norm': 0.6738780736923218, 'learning_rate': 6.353883901524756e-06, 'epoch': 1.89} +2025-05-11 07:17:51 - ERROR - stderr - 63%|██████▎ | 2358/3741 [13:51:57<7:45:47, 20.21s/it] +2025-05-11 07:18:11 - ERROR - stderr - 63%|██████▎ | 2359/3741 [13:52:17<7:45:49, 20.22s/it] +2025-05-11 07:18:11 - ERROR - stderr - +2025-05-11 07:18:11 - ERROR - stderr - +2025-05-11 07:18:11 - INFO - stdout - {'loss': 0.7417, 'grad_norm': 0.7525630593299866, 'learning_rate': 6.34582208195175e-06, 'epoch': 1.89} +2025-05-11 07:18:11 - ERROR - stderr - 63%|██████▎ | 2359/3741 [13:52:17<7:45:49, 20.22s/it] +2025-05-11 07:18:32 - ERROR - stderr - 63%|██████▎ | 2360/3741 [13:52:39<7:54:54, 20.63s/it] +2025-05-11 07:18:32 - ERROR - stderr - +2025-05-11 07:18:32 - ERROR - stderr - +2025-05-11 07:18:32 - INFO - stdout - {'loss': 0.7616, 'grad_norm': 0.6829856038093567, 'learning_rate': 6.337763002405792e-06, 'epoch': 1.89} +2025-05-11 07:18:32 - ERROR - stderr - 63%|██████▎ | 2360/3741 [13:52:39<7:54:54, 20.63s/it] +2025-05-11 07:18:53 - ERROR - stderr - 63%|██████▎ | 2361/3741 [13:52:59<7:55:37, 20.68s/it] +2025-05-11 07:18:53 - ERROR - stderr - +2025-05-11 07:18:53 - ERROR - stderr - +2025-05-11 07:18:53 - INFO - stdout - {'loss': 0.7149, 'grad_norm': 0.6920203566551208, 'learning_rate': 6.329706668929861e-06, 'epoch': 1.89} +2025-05-11 07:18:53 - ERROR - stderr - 63%|██████▎ | 2361/3741 [13:52:59<7:55:37, 20.68s/it] +2025-05-11 07:19:15 - ERROR - stderr - 63%|██████▎ | 2362/3741 [13:53:21<8:00:03, 20.89s/it] +2025-05-11 07:19:15 - ERROR - stderr - +2025-05-11 07:19:15 - ERROR - stderr - +2025-05-11 07:19:15 - INFO - stdout - {'loss': 0.754, 'grad_norm': 0.7000799775123596, 'learning_rate': 6.321653087564861e-06, 'epoch': 1.89} +2025-05-11 07:19:15 - ERROR - stderr - 63%|██████▎ | 2362/3741 [13:53:21<8:00:03, 20.89s/it] +2025-05-11 07:19:37 - ERROR - stderr - 63%|██████▎ | 2363/3741 [13:53:43<8:09:47, 21.33s/it] +2025-05-11 07:19:37 - ERROR - stderr - +2025-05-11 07:19:37 - ERROR - stderr - +2025-05-11 07:19:37 - INFO - stdout - {'loss': 0.7099, 'grad_norm': 0.6926515102386475, 'learning_rate': 6.31360226434965e-06, 'epoch': 1.89} +2025-05-11 07:19:37 - ERROR - stderr - 63%|██████▎ | 2363/3741 [13:53:43<8:09:47, 21.33s/it] +2025-05-11 07:19:59 - ERROR - stderr - 63%|██████▎ | 2364/3741 [13:54:05<8:15:57, 21.61s/it] +2025-05-11 07:19:59 - ERROR - stderr - +2025-05-11 07:19:59 - ERROR - stderr - +2025-05-11 07:19:59 - INFO - stdout - {'loss': 0.7287, 'grad_norm': 0.6759518384933472, 'learning_rate': 6.305554205321005e-06, 'epoch': 1.9} +2025-05-11 07:19:59 - ERROR - stderr - 63%|██████▎ | 2364/3741 [13:54:05<8:15:57, 21.61s/it] +2025-05-11 07:20:19 - ERROR - stderr - 63%|██████▎ | 2365/3741 [13:54:26<8:04:50, 21.14s/it] +2025-05-11 07:20:19 - ERROR - stderr - +2025-05-11 07:20:19 - ERROR - stderr - +2025-05-11 07:20:19 - INFO - stdout - {'loss': 0.7242, 'grad_norm': 0.6605546474456787, 'learning_rate': 6.297508916513636e-06, 'epoch': 1.9} +2025-05-11 07:20:19 - ERROR - stderr - 63%|██████▎ | 2365/3741 [13:54:26<8:04:50, 21.14s/it] +2025-05-11 07:20:42 - ERROR - stderr - 63%|██████▎ | 2366/3741 [13:54:48<8:13:22, 21.53s/it] +2025-05-11 07:20:42 - ERROR - stderr - +2025-05-11 07:20:42 - ERROR - stderr - +2025-05-11 07:20:42 - INFO - stdout - {'loss': 0.7318, 'grad_norm': 0.6701246500015259, 'learning_rate': 6.289466403960175e-06, 'epoch': 1.9} +2025-05-11 07:20:42 - ERROR - stderr - 63%|██████▎ | 2366/3741 [13:54:48<8:13:22, 21.53s/it] +2025-05-11 07:21:05 - ERROR - stderr - 63%|██████▎ | 2367/3741 [13:55:12<8:27:29, 22.16s/it] +2025-05-11 07:21:05 - ERROR - stderr - +2025-05-11 07:21:05 - ERROR - stderr - +2025-05-11 07:21:05 - INFO - stdout - {'loss': 0.7054, 'grad_norm': 0.6828826069831848, 'learning_rate': 6.281426673691169e-06, 'epoch': 1.9} +2025-05-11 07:21:05 - ERROR - stderr - 63%|██████▎ | 2367/3741 [13:55:12<8:27:29, 22.16s/it] +2025-05-11 07:21:27 - ERROR - stderr - 63%|██████▎ | 2368/3741 [13:55:33<8:25:14, 22.08s/it] +2025-05-11 07:21:27 - ERROR - stderr - +2025-05-11 07:21:27 - ERROR - stderr - +2025-05-11 07:21:27 - INFO - stdout - {'loss': 0.7241, 'grad_norm': 0.6410530209541321, 'learning_rate': 6.273389731735087e-06, 'epoch': 1.9} +2025-05-11 07:21:27 - ERROR - stderr - 63%|██████▎ | 2368/3741 [13:55:33<8:25:14, 22.08s/it] +2025-05-11 07:21:48 - ERROR - stderr - 63%|██████▎ | 2369/3741 [13:55:54<8:14:14, 21.61s/it] +2025-05-11 07:21:48 - ERROR - stderr - +2025-05-11 07:21:48 - ERROR - stderr - +2025-05-11 07:21:48 - INFO - stdout - {'loss': 0.7071, 'grad_norm': 0.7067154049873352, 'learning_rate': 6.265355584118297e-06, 'epoch': 1.9} +2025-05-11 07:21:48 - ERROR - stderr - 63%|██████▎ | 2369/3741 [13:55:54<8:14:14, 21.61s/it] +2025-05-11 07:22:10 - ERROR - stderr - 63%|██████▎ | 2370/3741 [13:56:16<8:15:43, 21.69s/it] +2025-05-11 07:22:10 - ERROR - stderr - +2025-05-11 07:22:10 - ERROR - stderr - +2025-05-11 07:22:10 - INFO - stdout - {'loss': 0.723, 'grad_norm': 0.698462724685669, 'learning_rate': 6.257324236865074e-06, 'epoch': 1.9} +2025-05-11 07:22:10 - ERROR - stderr - 63%|██████▎ | 2370/3741 [13:56:16<8:15:43, 21.69s/it] +2025-05-11 07:22:31 - ERROR - stderr - 63%|██████▎ | 2371/3741 [13:56:37<8:13:27, 21.61s/it] +2025-05-11 07:22:31 - ERROR - stderr - +2025-05-11 07:22:31 - ERROR - stderr - +2025-05-11 07:22:31 - INFO - stdout - {'loss': 0.755, 'grad_norm': 0.747157096862793, 'learning_rate': 6.249295695997604e-06, 'epoch': 1.9} +2025-05-11 07:22:31 - ERROR - stderr - 63%|██████▎ | 2371/3741 [13:56:37<8:13:27, 21.61s/it] +2025-05-11 07:22:53 - ERROR - stderr - 63%|██████▎ | 2372/3741 [13:57:00<8:17:35, 21.81s/it] +2025-05-11 07:22:53 - ERROR - stderr - +2025-05-11 07:22:53 - ERROR - stderr - +2025-05-11 07:22:53 - INFO - stdout - {'loss': 0.71, 'grad_norm': 0.7117529511451721, 'learning_rate': 6.241269967535955e-06, 'epoch': 1.9} +2025-05-11 07:22:53 - ERROR - stderr - 63%|██████▎ | 2372/3741 [13:57:00<8:17:35, 21.81s/it] +2025-05-11 07:23:14 - ERROR - stderr - 63%|██████▎ | 2373/3741 [13:57:20<8:08:30, 21.43s/it] +2025-05-11 07:23:14 - ERROR - stderr - +2025-05-11 07:23:14 - ERROR - stderr - +2025-05-11 07:23:14 - INFO - stdout - {'loss': 0.7238, 'grad_norm': 0.7347584962844849, 'learning_rate': 6.233247057498093e-06, 'epoch': 1.9} +2025-05-11 07:23:14 - ERROR - stderr - 63%|██████▎ | 2373/3741 [13:57:20<8:08:30, 21.43s/it] +2025-05-11 07:23:37 - ERROR - stderr - 63%|██████▎ | 2374/3741 [13:57:43<8:18:54, 21.90s/it] +2025-05-11 07:23:37 - ERROR - stderr - +2025-05-11 07:23:37 - ERROR - stderr - +2025-05-11 07:23:37 - INFO - stdout - {'loss': 0.7107, 'grad_norm': 0.6999946236610413, 'learning_rate': 6.225226971899869e-06, 'epoch': 1.9} +2025-05-11 07:23:37 - ERROR - stderr - 63%|██████▎ | 2374/3741 [13:57:43<8:18:54, 21.90s/it] +2025-05-11 07:23:57 - ERROR - stderr - 63%|██████▎ | 2375/3741 [13:58:03<8:07:23, 21.41s/it] +2025-05-11 07:23:57 - ERROR - stderr - +2025-05-11 07:23:57 - ERROR - stderr - +2025-05-11 07:23:57 - INFO - stdout - {'loss': 0.7432, 'grad_norm': 0.7275912761688232, 'learning_rate': 6.217209716755013e-06, 'epoch': 1.9} +2025-05-11 07:23:57 - ERROR - stderr - 63%|██████▎ | 2375/3741 [13:58:03<8:07:23, 21.41s/it] +2025-05-11 07:24:21 - ERROR - stderr - 64%|██████▎ | 2376/3741 [13:58:27<8:21:37, 22.05s/it] +2025-05-11 07:24:21 - ERROR - stderr - +2025-05-11 07:24:21 - ERROR - stderr - +2025-05-11 07:24:21 - INFO - stdout - {'loss': 0.6802, 'grad_norm': 0.6637576222419739, 'learning_rate': 6.2091952980751414e-06, 'epoch': 1.91} +2025-05-11 07:24:21 - ERROR - stderr - 64%|██████▎ | 2376/3741 [13:58:27<8:21:37, 22.05s/it] +2025-05-11 07:24:42 - ERROR - stderr - 64%|██████▎ | 2377/3741 [13:58:49<8:19:42, 21.98s/it] +2025-05-11 07:24:42 - ERROR - stderr - +2025-05-11 07:24:42 - ERROR - stderr - +2025-05-11 07:24:42 - INFO - stdout - {'loss': 0.7396, 'grad_norm': 0.7044709920883179, 'learning_rate': 6.201183721869735e-06, 'epoch': 1.91} +2025-05-11 07:24:42 - ERROR - stderr - 64%|██████▎ | 2377/3741 [13:58:49<8:19:42, 21.98s/it] +2025-05-11 07:25:05 - ERROR - stderr - 64%|██████▎ | 2378/3741 [13:59:12<8:26:48, 22.31s/it] +2025-05-11 07:25:06 - ERROR - stderr - +2025-05-11 07:25:06 - ERROR - stderr - +2025-05-11 07:25:06 - INFO - stdout - {'loss': 0.7369, 'grad_norm': 0.6789054870605469, 'learning_rate': 6.193174994146148e-06, 'epoch': 1.91} +2025-05-11 07:25:06 - ERROR - stderr - 64%|██████▎ | 2378/3741 [13:59:12<8:26:48, 22.31s/it] +2025-05-11 07:25:25 - ERROR - stderr - 64%|██████▎ | 2379/3741 [13:59:32<8:09:51, 21.58s/it] +2025-05-11 07:25:25 - ERROR - stderr - +2025-05-11 07:25:25 - ERROR - stderr - +2025-05-11 07:25:25 - INFO - stdout - {'loss': 0.7293, 'grad_norm': 0.6822087168693542, 'learning_rate': 6.185169120909598e-06, 'epoch': 1.91} +2025-05-11 07:25:25 - ERROR - stderr - 64%|██████▎ | 2379/3741 [13:59:32<8:09:51, 21.58s/it] +2025-05-11 07:25:48 - ERROR - stderr - 64%|██████▎ | 2380/3741 [13:59:54<8:15:09, 21.83s/it] +2025-05-11 07:25:48 - ERROR - stderr - +2025-05-11 07:25:48 - ERROR - stderr - +2025-05-11 07:25:48 - INFO - stdout - {'loss': 0.7156, 'grad_norm': 0.7218993306159973, 'learning_rate': 6.177166108163155e-06, 'epoch': 1.91} +2025-05-11 07:25:48 - ERROR - stderr - 64%|██████▎ | 2380/3741 [13:59:54<8:15:09, 21.83s/it] +2025-05-11 07:26:09 - ERROR - stderr - 64%|██████▎ | 2381/3741 [14:00:16<8:12:28, 21.73s/it] +2025-05-11 07:26:09 - ERROR - stderr - +2025-05-11 07:26:09 - ERROR - stderr - +2025-05-11 07:26:09 - INFO - stdout - {'loss': 0.7279, 'grad_norm': 0.6758652329444885, 'learning_rate': 6.169165961907762e-06, 'epoch': 1.91} +2025-05-11 07:26:09 - ERROR - stderr - 64%|██████▎ | 2381/3741 [14:00:16<8:12:28, 21.73s/it] +2025-05-11 07:26:31 - ERROR - stderr - 64%|██████▎ | 2382/3741 [14:00:38<8:14:36, 21.84s/it] +2025-05-11 07:26:31 - ERROR - stderr - +2025-05-11 07:26:31 - ERROR - stderr - +2025-05-11 07:26:31 - INFO - stdout - {'loss': 0.7328, 'grad_norm': 0.7355567216873169, 'learning_rate': 6.1611686881421875e-06, 'epoch': 1.91} +2025-05-11 07:26:31 - ERROR - stderr - 64%|██████▎ | 2382/3741 [14:00:38<8:14:36, 21.84s/it] +2025-05-11 07:26:53 - ERROR - stderr - 64%|██████▎ | 2383/3741 [14:00:59<8:13:35, 21.81s/it] +2025-05-11 07:26:53 - ERROR - stderr - +2025-05-11 07:26:53 - ERROR - stderr - +2025-05-11 07:26:53 - INFO - stdout - {'loss': 0.7094, 'grad_norm': 0.7142036557197571, 'learning_rate': 6.153174292863071e-06, 'epoch': 1.91} +2025-05-11 07:26:53 - ERROR - stderr - 64%|██████▎ | 2383/3741 [14:00:59<8:13:35, 21.81s/it] +2025-05-11 07:27:17 - ERROR - stderr - 64%|██████▎ | 2384/3741 [14:01:23<8:24:32, 22.31s/it] +2025-05-11 07:27:17 - ERROR - stderr - +2025-05-11 07:27:17 - ERROR - stderr - +2025-05-11 07:27:17 - INFO - stdout - {'loss': 0.7163, 'grad_norm': 0.7164692282676697, 'learning_rate': 6.145182782064879e-06, 'epoch': 1.91} +2025-05-11 07:27:17 - ERROR - stderr - 64%|██████▎ | 2384/3741 [14:01:23<8:24:32, 22.31s/it] +2025-05-11 07:27:36 - ERROR - stderr - 64%|██████▍ | 2385/3741 [14:01:43<8:06:21, 21.52s/it] +2025-05-11 07:27:36 - ERROR - stderr - +2025-05-11 07:27:36 - ERROR - stderr - +2025-05-11 07:27:36 - INFO - stdout - {'loss': 0.6753, 'grad_norm': 0.6600916981697083, 'learning_rate': 6.137194161739915e-06, 'epoch': 1.91} +2025-05-11 07:27:36 - ERROR - stderr - 64%|██████▍ | 2385/3741 [14:01:43<8:06:21, 21.52s/it] +2025-05-11 07:28:00 - ERROR - stderr - 64%|██████▍ | 2386/3741 [14:02:06<8:19:41, 22.13s/it] +2025-05-11 07:28:00 - ERROR - stderr - +2025-05-11 07:28:00 - ERROR - stderr - +2025-05-11 07:28:00 - INFO - stdout - {'loss': 0.7447, 'grad_norm': 0.7095491290092468, 'learning_rate': 6.129208437878324e-06, 'epoch': 1.91} +2025-05-11 07:28:00 - ERROR - stderr - 64%|██████▍ | 2386/3741 [14:02:06<8:19:41, 22.13s/it] +2025-05-11 07:28:20 - ERROR - stderr - 64%|██████▍ | 2387/3741 [14:02:26<8:06:54, 21.58s/it] +2025-05-11 07:28:20 - ERROR - stderr - +2025-05-11 07:28:20 - ERROR - stderr - +2025-05-11 07:28:20 - INFO - stdout - {'loss': 0.6973, 'grad_norm': 0.6852803230285645, 'learning_rate': 6.121225616468065e-06, 'epoch': 1.91} +2025-05-11 07:28:20 - ERROR - stderr - 64%|██████▍ | 2387/3741 [14:02:26<8:06:54, 21.58s/it] +2025-05-11 07:28:43 - ERROR - stderr - 64%|██████▍ | 2388/3741 [14:02:49<8:13:42, 21.89s/it] +2025-05-11 07:28:43 - ERROR - stderr - +2025-05-11 07:28:43 - ERROR - stderr - +2025-05-11 07:28:43 - INFO - stdout - {'loss': 0.7793, 'grad_norm': 0.6977118253707886, 'learning_rate': 6.113245703494941e-06, 'epoch': 1.91} +2025-05-11 07:28:43 - ERROR - stderr - 64%|██████▍ | 2388/3741 [14:02:49<8:13:42, 21.89s/it] +2025-05-11 07:29:02 - ERROR - stderr - 64%|██████▍ | 2389/3741 [14:03:09<7:57:18, 21.18s/it] +2025-05-11 07:29:02 - ERROR - stderr - +2025-05-11 07:29:02 - ERROR - stderr - +2025-05-11 07:29:02 - INFO - stdout - {'loss': 0.7237, 'grad_norm': 0.6602482199668884, 'learning_rate': 6.105268704942555e-06, 'epoch': 1.92} +2025-05-11 07:29:02 - ERROR - stderr - 64%|██████▍ | 2389/3741 [14:03:09<7:57:18, 21.18s/it] +2025-05-11 07:29:22 - ERROR - stderr - 64%|██████▍ | 2390/3741 [14:03:29<7:48:33, 20.81s/it] +2025-05-11 07:29:22 - ERROR - stderr - +2025-05-11 07:29:22 - ERROR - stderr - +2025-05-11 07:29:22 - INFO - stdout - {'loss': 0.7226, 'grad_norm': 0.7034119963645935, 'learning_rate': 6.097294626792334e-06, 'epoch': 1.92} +2025-05-11 07:29:22 - ERROR - stderr - 64%|██████▍ | 2390/3741 [14:03:29<7:48:33, 20.81s/it] +2025-05-11 07:29:42 - ERROR - stderr - 64%|██████▍ | 2391/3741 [14:03:48<7:41:59, 20.53s/it] +2025-05-11 07:29:42 - ERROR - stderr - +2025-05-11 07:29:42 - ERROR - stderr - +2025-05-11 07:29:42 - INFO - stdout - {'loss': 0.73, 'grad_norm': 0.6831420063972473, 'learning_rate': 6.0893234750235145e-06, 'epoch': 1.92} +2025-05-11 07:29:42 - ERROR - stderr - 64%|██████▍ | 2391/3741 [14:03:48<7:41:59, 20.53s/it] +2025-05-11 07:30:02 - ERROR - stderr - 64%|██████▍ | 2392/3741 [14:04:08<7:35:05, 20.24s/it] +2025-05-11 07:30:02 - ERROR - stderr - +2025-05-11 07:30:02 - ERROR - stderr - +2025-05-11 07:30:02 - INFO - stdout - {'loss': 0.7314, 'grad_norm': 0.7341967225074768, 'learning_rate': 6.0813552556131315e-06, 'epoch': 1.92} +2025-05-11 07:30:02 - ERROR - stderr - 64%|██████▍ | 2392/3741 [14:04:08<7:35:05, 20.24s/it] +2025-05-11 07:30:21 - ERROR - stderr - 64%|██████▍ | 2393/3741 [14:04:28<7:32:06, 20.12s/it] +2025-05-11 07:30:22 - ERROR - stderr - +2025-05-11 07:30:22 - ERROR - stderr - +2025-05-11 07:30:22 - INFO - stdout - {'loss': 0.7362, 'grad_norm': 0.7385361194610596, 'learning_rate': 6.073389974536037e-06, 'epoch': 1.92} +2025-05-11 07:30:22 - ERROR - stderr - 64%|██████▍ | 2393/3741 [14:04:28<7:32:06, 20.12s/it] +2025-05-11 07:30:42 - ERROR - stderr - 64%|██████▍ | 2394/3741 [14:04:48<7:33:52, 20.22s/it] +2025-05-11 07:30:42 - ERROR - stderr - +2025-05-11 07:30:42 - ERROR - stderr - +2025-05-11 07:30:42 - INFO - stdout - {'loss': 0.742, 'grad_norm': 0.6924091577529907, 'learning_rate': 6.065427637764865e-06, 'epoch': 1.92} +2025-05-11 07:30:42 - ERROR - stderr - 64%|██████▍ | 2394/3741 [14:04:48<7:33:52, 20.22s/it] +2025-05-11 07:31:02 - ERROR - stderr - 64%|██████▍ | 2395/3741 [14:05:08<7:31:41, 20.13s/it] +2025-05-11 07:31:02 - ERROR - stderr - +2025-05-11 07:31:02 - ERROR - stderr - +2025-05-11 07:31:02 - INFO - stdout - {'loss': 0.6972, 'grad_norm': 0.6444892883300781, 'learning_rate': 6.0574682512700444e-06, 'epoch': 1.92} +2025-05-11 07:31:02 - ERROR - stderr - 64%|██████▍ | 2395/3741 [14:05:08<7:31:41, 20.13s/it] +2025-05-11 07:31:23 - ERROR - stderr - 64%|██████▍ | 2396/3741 [14:05:29<7:37:04, 20.39s/it] +2025-05-11 07:31:23 - ERROR - stderr - +2025-05-11 07:31:23 - ERROR - stderr - +2025-05-11 07:31:23 - INFO - stdout - {'loss': 0.7426, 'grad_norm': 0.7088480591773987, 'learning_rate': 6.0495118210197975e-06, 'epoch': 1.92} +2025-05-11 07:31:23 - ERROR - stderr - 64%|██████▍ | 2396/3741 [14:05:29<7:37:04, 20.39s/it] +2025-05-11 07:31:45 - ERROR - stderr - 64%|██████▍ | 2397/3741 [14:05:51<7:45:55, 20.80s/it] +2025-05-11 07:31:45 - ERROR - stderr - +2025-05-11 07:31:45 - ERROR - stderr - +2025-05-11 07:31:45 - INFO - stdout - {'loss': 0.7335, 'grad_norm': 0.7047684788703918, 'learning_rate': 6.041558352980126e-06, 'epoch': 1.92} +2025-05-11 07:31:45 - ERROR - stderr - 64%|██████▍ | 2397/3741 [14:05:51<7:45:55, 20.80s/it] +2025-05-11 07:32:05 - ERROR - stderr - 64%|██████▍ | 2398/3741 [14:06:11<7:43:29, 20.71s/it] +2025-05-11 07:32:05 - ERROR - stderr - +2025-05-11 07:32:05 - ERROR - stderr - +2025-05-11 07:32:05 - INFO - stdout - {'loss': 0.802, 'grad_norm': 0.7246830463409424, 'learning_rate': 6.033607853114813e-06, 'epoch': 1.92} +2025-05-11 07:32:05 - ERROR - stderr - 64%|██████▍ | 2398/3741 [14:06:11<7:43:29, 20.71s/it] +2025-05-11 07:32:26 - ERROR - stderr - 64%|██████▍ | 2399/3741 [14:06:33<7:47:34, 20.90s/it] +2025-05-11 07:32:26 - ERROR - stderr - +2025-05-11 07:32:26 - ERROR - stderr - +2025-05-11 07:32:26 - INFO - stdout - {'loss': 0.7143, 'grad_norm': 0.6952186822891235, 'learning_rate': 6.025660327385412e-06, 'epoch': 1.92} +2025-05-11 07:32:26 - ERROR - stderr - 64%|██████▍ | 2399/3741 [14:06:33<7:47:34, 20.90s/it] +2025-05-11 07:32:48 - ERROR - stderr - 64%|██████▍ | 2400/3741 [14:06:54<7:48:11, 20.95s/it] +2025-05-11 07:32:48 - ERROR - stderr - +2025-05-11 07:32:48 - ERROR - stderr - +2025-05-11 07:32:48 - INFO - stdout - {'loss': 0.7112, 'grad_norm': 0.6386004090309143, 'learning_rate': 6.017715781751243e-06, 'epoch': 1.92} +2025-05-11 07:32:48 - ERROR - stderr - 64%|██████▍ | 2400/3741 [14:06:54<7:48:11, 20.95s/it] +2025-05-11 07:33:10 - ERROR - stderr - 64%|██████▍ | 2401/3741 [14:07:16<7:54:56, 21.27s/it] +2025-05-11 07:33:10 - ERROR - stderr - +2025-05-11 07:33:10 - ERROR - stderr - +2025-05-11 07:33:10 - INFO - stdout - {'loss': 0.7377, 'grad_norm': 0.6913342475891113, 'learning_rate': 6.009774222169409e-06, 'epoch': 1.93} +2025-05-11 07:33:10 - ERROR - stderr - 64%|██████▍ | 2401/3741 [14:07:16<7:54:56, 21.27s/it] +2025-05-11 07:33:31 - ERROR - stderr - 64%|██████▍ | 2402/3741 [14:07:37<7:53:39, 21.22s/it] +2025-05-11 07:33:31 - ERROR - stderr - +2025-05-11 07:33:31 - ERROR - stderr - +2025-05-11 07:33:31 - INFO - stdout - {'loss': 0.7444, 'grad_norm': 0.6988136768341064, 'learning_rate': 6.001835654594751e-06, 'epoch': 1.93} +2025-05-11 07:33:31 - ERROR - stderr - 64%|██████▍ | 2402/3741 [14:07:37<7:53:39, 21.22s/it] +2025-05-11 07:33:52 - ERROR - stderr - 64%|██████▍ | 2403/3741 [14:07:59<7:56:32, 21.37s/it] +2025-05-11 07:33:52 - ERROR - stderr - +2025-05-11 07:33:52 - ERROR - stderr - +2025-05-11 07:33:52 - INFO - stdout - {'loss': 0.7516, 'grad_norm': 0.7216395735740662, 'learning_rate': 5.993900084979884e-06, 'epoch': 1.93} +2025-05-11 07:33:52 - ERROR - stderr - 64%|██████▍ | 2403/3741 [14:07:59<7:56:32, 21.37s/it] +2025-05-11 07:34:13 - ERROR - stderr - 64%|██████▍ | 2404/3741 [14:08:19<7:49:22, 21.06s/it] +2025-05-11 07:34:13 - ERROR - stderr - +2025-05-11 07:34:13 - ERROR - stderr - +2025-05-11 07:34:13 - INFO - stdout - {'loss': 0.6978, 'grad_norm': 0.6848301887512207, 'learning_rate': 5.985967519275167e-06, 'epoch': 1.93} +2025-05-11 07:34:13 - ERROR - stderr - 64%|██████▍ | 2404/3741 [14:08:19<7:49:22, 21.06s/it] +2025-05-11 07:34:34 - ERROR - stderr - 64%|██████▍ | 2405/3741 [14:08:41<7:51:53, 21.19s/it] +2025-05-11 07:34:34 - ERROR - stderr - +2025-05-11 07:34:34 - ERROR - stderr - +2025-05-11 07:34:34 - INFO - stdout - {'loss': 0.7278, 'grad_norm': 0.6854767203330994, 'learning_rate': 5.978037963428702e-06, 'epoch': 1.93} +2025-05-11 07:34:34 - ERROR - stderr - 64%|██████▍ | 2405/3741 [14:08:41<7:51:53, 21.19s/it] +2025-05-11 07:34:58 - ERROR - stderr - 64%|██████▍ | 2406/3741 [14:09:04<8:09:02, 21.98s/it] +2025-05-11 07:34:58 - ERROR - stderr - +2025-05-11 07:34:58 - ERROR - stderr - +2025-05-11 07:34:58 - INFO - stdout - {'loss': 0.7319, 'grad_norm': 0.6917245984077454, 'learning_rate': 5.970111423386349e-06, 'epoch': 1.93} +2025-05-11 07:34:58 - ERROR - stderr - 64%|██████▍ | 2406/3741 [14:09:04<8:09:02, 21.98s/it] +2025-05-11 07:35:21 - ERROR - stderr - 64%|██████▍ | 2407/3741 [14:09:27<8:16:06, 22.31s/it] +2025-05-11 07:35:21 - ERROR - stderr - +2025-05-11 07:35:21 - ERROR - stderr - +2025-05-11 07:35:21 - INFO - stdout - {'loss': 0.7181, 'grad_norm': 0.7103894948959351, 'learning_rate': 5.962187905091692e-06, 'epoch': 1.93} +2025-05-11 07:35:21 - ERROR - stderr - 64%|██████▍ | 2407/3741 [14:09:27<8:16:06, 22.31s/it] +2025-05-11 07:35:42 - ERROR - stderr - 64%|██████▍ | 2408/3741 [14:09:49<8:09:10, 22.02s/it] +2025-05-11 07:35:42 - ERROR - stderr - +2025-05-11 07:35:42 - ERROR - stderr - +2025-05-11 07:35:42 - INFO - stdout - {'loss': 0.7213, 'grad_norm': 0.6659692525863647, 'learning_rate': 5.954267414486057e-06, 'epoch': 1.93} +2025-05-11 07:35:42 - ERROR - stderr - 64%|██████▍ | 2408/3741 [14:09:49<8:09:10, 22.02s/it] +2025-05-11 07:36:04 - ERROR - stderr - 64%|██████▍ | 2409/3741 [14:10:10<8:02:44, 21.75s/it] +2025-05-11 07:36:04 - ERROR - stderr - +2025-05-11 07:36:04 - ERROR - stderr - +2025-05-11 07:36:04 - INFO - stdout - {'loss': 0.7317, 'grad_norm': 0.7113282084465027, 'learning_rate': 5.946349957508499e-06, 'epoch': 1.93} +2025-05-11 07:36:04 - ERROR - stderr - 64%|██████▍ | 2409/3741 [14:10:10<8:02:44, 21.75s/it] +2025-05-11 07:36:25 - ERROR - stderr - 64%|██████▍ | 2410/3741 [14:10:31<8:01:15, 21.69s/it] +2025-05-11 07:36:25 - ERROR - stderr - +2025-05-11 07:36:25 - ERROR - stderr - +2025-05-11 07:36:25 - INFO - stdout - {'loss': 0.6955, 'grad_norm': 0.6863912343978882, 'learning_rate': 5.93843554009579e-06, 'epoch': 1.93} +2025-05-11 07:36:25 - ERROR - stderr - 64%|██████▍ | 2410/3741 [14:10:31<8:01:15, 21.69s/it] +2025-05-11 07:36:46 - ERROR - stderr - 64%|██████▍ | 2411/3741 [14:10:52<7:55:03, 21.43s/it] +2025-05-11 07:36:46 - ERROR - stderr - +2025-05-11 07:36:46 - ERROR - stderr - +2025-05-11 07:36:46 - INFO - stdout - {'loss': 0.7199, 'grad_norm': 0.6813226938247681, 'learning_rate': 5.930524168182441e-06, 'epoch': 1.93} +2025-05-11 07:36:46 - ERROR - stderr - 64%|██████▍ | 2411/3741 [14:10:52<7:55:03, 21.43s/it] +2025-05-11 07:37:08 - ERROR - stderr - 64%|██████▍ | 2412/3741 [14:11:14<7:57:17, 21.55s/it] +2025-05-11 07:37:08 - ERROR - stderr - +2025-05-11 07:37:08 - ERROR - stderr - +2025-05-11 07:37:08 - INFO - stdout - {'loss': 0.7189, 'grad_norm': 0.6929488778114319, 'learning_rate': 5.922615847700655e-06, 'epoch': 1.93} +2025-05-11 07:37:08 - ERROR - stderr - 64%|██████▍ | 2412/3741 [14:11:14<7:57:17, 21.55s/it] +2025-05-11 07:37:29 - ERROR - stderr - 65%|██████▍ | 2413/3741 [14:11:35<7:52:57, 21.37s/it] +2025-05-11 07:37:29 - ERROR - stderr - +2025-05-11 07:37:29 - ERROR - stderr - +2025-05-11 07:37:29 - INFO - stdout - {'loss': 0.7502, 'grad_norm': 0.6780677437782288, 'learning_rate': 5.91471058458037e-06, 'epoch': 1.94} +2025-05-11 07:37:29 - ERROR - stderr - 65%|���█████▍ | 2413/3741 [14:11:35<7:52:57, 21.37s/it] +2025-05-11 07:37:49 - ERROR - stderr - 65%|██████▍ | 2414/3741 [14:11:55<7:45:36, 21.05s/it] +2025-05-11 07:37:49 - ERROR - stderr - +2025-05-11 07:37:49 - ERROR - stderr - +2025-05-11 07:37:49 - INFO - stdout - {'loss': 0.6846, 'grad_norm': 0.6948460340499878, 'learning_rate': 5.90680838474922e-06, 'epoch': 1.94} +2025-05-11 07:37:49 - ERROR - stderr - 65%|██████▍ | 2414/3741 [14:11:55<7:45:36, 21.05s/it] +2025-05-11 07:38:11 - ERROR - stderr - 65%|██████▍ | 2415/3741 [14:12:17<7:48:44, 21.21s/it] +2025-05-11 07:38:11 - ERROR - stderr - +2025-05-11 07:38:11 - ERROR - stderr - +2025-05-11 07:38:11 - INFO - stdout - {'loss': 0.6991, 'grad_norm': 0.6975181698799133, 'learning_rate': 5.898909254132539e-06, 'epoch': 1.94} +2025-05-11 07:38:11 - ERROR - stderr - 65%|██████▍ | 2415/3741 [14:12:17<7:48:44, 21.21s/it] +2025-05-11 07:38:33 - ERROR - stderr - 65%|██████▍ | 2416/3741 [14:12:39<7:54:36, 21.49s/it] +2025-05-11 07:38:33 - ERROR - stderr - +2025-05-11 07:38:33 - ERROR - stderr - +2025-05-11 07:38:33 - INFO - stdout - {'loss': 0.7178, 'grad_norm': 0.7138963937759399, 'learning_rate': 5.891013198653368e-06, 'epoch': 1.94} +2025-05-11 07:38:33 - ERROR - stderr - 65%|██████▍ | 2416/3741 [14:12:39<7:54:36, 21.49s/it] +2025-05-11 07:38:57 - ERROR - stderr - 65%|██████▍ | 2417/3741 [14:13:04<8:14:11, 22.40s/it] +2025-05-11 07:38:57 - ERROR - stderr - +2025-05-11 07:38:57 - ERROR - stderr - +2025-05-11 07:38:57 - INFO - stdout - {'loss': 0.7485, 'grad_norm': 0.7381751537322998, 'learning_rate': 5.8831202242324345e-06, 'epoch': 1.94} +2025-05-11 07:38:57 - ERROR - stderr - 65%|██████▍ | 2417/3741 [14:13:04<8:14:11, 22.40s/it] +2025-05-11 07:39:24 - ERROR - stderr - 65%|██████▍ | 2418/3741 [14:13:30<8:39:30, 23.56s/it] +2025-05-11 07:39:24 - ERROR - stderr - +2025-05-11 07:39:24 - ERROR - stderr - +2025-05-11 07:39:24 - INFO - stdout - {'loss': 0.7372, 'grad_norm': 0.6692777872085571, 'learning_rate': 5.875230336788167e-06, 'epoch': 1.94} +2025-05-11 07:39:24 - ERROR - stderr - 65%|██████▍ | 2418/3741 [14:13:30<8:39:30, 23.56s/it] +2025-05-11 07:39:47 - ERROR - stderr - 65%|██████▍ | 2419/3741 [14:13:54<8:41:15, 23.66s/it] +2025-05-11 07:39:47 - ERROR - stderr - +2025-05-11 07:39:47 - ERROR - stderr - +2025-05-11 07:39:47 - INFO - stdout - {'loss': 0.7446, 'grad_norm': 0.699887216091156, 'learning_rate': 5.8673435422366656e-06, 'epoch': 1.94} +2025-05-11 07:39:47 - ERROR - stderr - 65%|██████▍ | 2419/3741 [14:13:54<8:41:15, 23.66s/it] +2025-05-11 07:40:09 - ERROR - stderr - 65%|██████▍ | 2420/3741 [14:14:16<8:30:22, 23.18s/it] +2025-05-11 07:40:10 - ERROR - stderr - +2025-05-11 07:40:10 - ERROR - stderr - +2025-05-11 07:40:10 - INFO - stdout - {'loss': 0.7047, 'grad_norm': 0.6614972352981567, 'learning_rate': 5.859459846491718e-06, 'epoch': 1.94} +2025-05-11 07:40:10 - ERROR - stderr - 65%|██████▍ | 2420/3741 [14:14:16<8:30:22, 23.18s/it] +2025-05-11 07:40:30 - ERROR - stderr - 65%|██████▍ | 2421/3741 [14:14:37<8:15:23, 22.52s/it] +2025-05-11 07:40:30 - ERROR - stderr - +2025-05-11 07:40:30 - ERROR - stderr - +2025-05-11 07:40:30 - INFO - stdout - {'loss': 0.7272, 'grad_norm': 0.6647498607635498, 'learning_rate': 5.85157925546479e-06, 'epoch': 1.94} +2025-05-11 07:40:30 - ERROR - stderr - 65%|██████▍ | 2421/3741 [14:14:37<8:15:23, 22.52s/it] +2025-05-11 07:40:51 - ERROR - stderr - 65%|██████▍ | 2422/3741 [14:14:58<8:05:05, 22.07s/it] +2025-05-11 07:40:51 - ERROR - stderr - +2025-05-11 07:40:51 - ERROR - stderr - +2025-05-11 07:40:51 - INFO - stdout - {'loss': 0.7061, 'grad_norm': 0.6981691122055054, 'learning_rate': 5.843701775065011e-06, 'epoch': 1.94} +2025-05-11 07:40:51 - ERROR - stderr - 65%|██████▍ | 2422/3741 [14:14:58<8:05:05, 22.07s/it] +2025-05-11 07:41:11 - ERROR - stderr - 65%|██████▍ | 2423/3741 [14:15:17<7:48:22, 21.32s/it] +2025-05-11 07:41:11 - ERROR - stderr - +2025-05-11 07:41:11 - ERROR - stderr - +2025-05-11 07:41:11 - INFO - stdout - {'loss': 0.7676, 'grad_norm': 0.7187725305557251, 'learning_rate': 5.835827411199194e-06, 'epoch': 1.94} +2025-05-11 07:41:11 - ERROR - stderr - 65%|██████▍ | 2423/3741 [14:15:17<7:48:22, 21.32s/it] +2025-05-11 07:41:32 - ERROR - stderr - 65%|██████▍ | 2424/3741 [14:15:39<7:48:02, 21.32s/it] +2025-05-11 07:41:32 - ERROR - stderr - +2025-05-11 07:41:32 - ERROR - stderr - +2025-05-11 07:41:32 - INFO - stdout - {'loss': 0.7354, 'grad_norm': 0.6973015069961548, 'learning_rate': 5.8279561697718025e-06, 'epoch': 1.94} +2025-05-11 07:41:32 - ERROR - stderr - 65%|██████▍ | 2424/3741 [14:15:39<7:48:02, 21.32s/it] +2025-05-11 07:41:52 - ERROR - stderr - 65%|██████▍ | 2425/3741 [14:15:59<7:38:30, 20.90s/it] +2025-05-11 07:41:52 - ERROR - stderr - +2025-05-11 07:41:52 - ERROR - stderr - +2025-05-11 07:41:52 - INFO - stdout - {'loss': 0.7591, 'grad_norm': 0.69856858253479, 'learning_rate': 5.8200880566849535e-06, 'epoch': 1.94} +2025-05-11 07:41:52 - ERROR - stderr - 65%|██████▍ | 2425/3741 [14:15:59<7:38:30, 20.90s/it] +2025-05-11 07:42:14 - ERROR - stderr - 65%|██████▍ | 2426/3741 [14:16:21<7:44:37, 21.20s/it] +2025-05-11 07:42:14 - ERROR - stderr - +2025-05-11 07:42:14 - ERROR - stderr - +2025-05-11 07:42:14 - INFO - stdout - {'loss': 0.7067, 'grad_norm': 0.674909770488739, 'learning_rate': 5.812223077838433e-06, 'epoch': 1.95} +2025-05-11 07:42:14 - ERROR - stderr - 65%|██████▍ | 2426/3741 [14:16:21<7:44:37, 21.20s/it] +2025-05-11 07:42:34 - ERROR - stderr - 65%|██████▍ | 2427/3741 [14:16:40<7:35:19, 20.79s/it] +2025-05-11 07:42:34 - ERROR - stderr - +2025-05-11 07:42:34 - ERROR - stderr - +2025-05-11 07:42:34 - INFO - stdout - {'loss': 0.726, 'grad_norm': 0.6738868951797485, 'learning_rate': 5.804361239129668e-06, 'epoch': 1.95} +2025-05-11 07:42:34 - ERROR - stderr - 65%|██████▍ | 2427/3741 [14:16:40<7:35:19, 20.79s/it] +2025-05-11 07:42:57 - ERROR - stderr - 65%|██████▍ | 2428/3741 [14:17:03<7:50:18, 21.49s/it] +2025-05-11 07:42:57 - ERROR - stderr - +2025-05-11 07:42:57 - ERROR - stderr - +2025-05-11 07:42:57 - INFO - stdout - {'loss': 0.7368, 'grad_norm': 0.6930453777313232, 'learning_rate': 5.7965025464537336e-06, 'epoch': 1.95} +2025-05-11 07:42:57 - ERROR - stderr - 65%|██████▍ | 2428/3741 [14:17:03<7:50:18, 21.49s/it] +2025-05-11 07:43:17 - ERROR - stderr - 65%|██████▍ | 2429/3741 [14:17:23<7:37:55, 20.94s/it] +2025-05-11 07:43:17 - ERROR - stderr - +2025-05-11 07:43:17 - ERROR - stderr - +2025-05-11 07:43:17 - INFO - stdout - {'loss': 0.7118, 'grad_norm': 0.6769317388534546, 'learning_rate': 5.788647005703349e-06, 'epoch': 1.95} +2025-05-11 07:43:17 - ERROR - stderr - 65%|██████▍ | 2429/3741 [14:17:23<7:37:55, 20.94s/it] +2025-05-11 07:43:39 - ERROR - stderr - 65%|██████▍ | 2430/3741 [14:17:45<7:46:26, 21.35s/it] +2025-05-11 07:43:39 - ERROR - stderr - +2025-05-11 07:43:39 - ERROR - stderr - +2025-05-11 07:43:39 - INFO - stdout - {'loss': 0.7179, 'grad_norm': 0.6883741021156311, 'learning_rate': 5.780794622768859e-06, 'epoch': 1.95} +2025-05-11 07:43:39 - ERROR - stderr - 65%|██████▍ | 2430/3741 [14:17:45<7:46:26, 21.35s/it] +2025-05-11 07:43:59 - ERROR - stderr - 65%|██████▍ | 2431/3741 [14:18:05<7:35:06, 20.84s/it] +2025-05-11 07:43:59 - ERROR - stderr - +2025-05-11 07:43:59 - ERROR - stderr - +2025-05-11 07:43:59 - INFO - stdout - {'loss': 0.7072, 'grad_norm': 0.6916329860687256, 'learning_rate': 5.77294540353825e-06, 'epoch': 1.95} +2025-05-11 07:43:59 - ERROR - stderr - 65%|██████▍ | 2431/3741 [14:18:05<7:35:06, 20.84s/it] +2025-05-11 07:44:21 - ERROR - stderr - 65%|██████▌ | 2432/3741 [14:18:27<7:41:58, 21.17s/it] +2025-05-11 07:44:21 - ERROR - stderr - +2025-05-11 07:44:21 - ERROR - stderr - +2025-05-11 07:44:21 - INFO - stdout - {'loss': 0.7044, 'grad_norm': 0.6634646654129028, 'learning_rate': 5.765099353897136e-06, 'epoch': 1.95} +2025-05-11 07:44:21 - ERROR - stderr - 65%|██████▌ | 2432/3741 [14:18:27<7:41:58, 21.17s/it] +2025-05-11 07:44:41 - ERROR - stderr - 65%|██████▌ | 2433/3741 [14:18:47<7:35:24, 20.89s/it] +2025-05-11 07:44:41 - ERROR - stderr - +2025-05-11 07:44:41 - ERROR - stderr - +2025-05-11 07:44:41 - INFO - stdout - {'loss': 0.7317, 'grad_norm': 0.6738753318786621, 'learning_rate': 5.7572564797287525e-06, 'epoch': 1.95} +2025-05-11 07:44:41 - ERROR - stderr - 65%|██████▌ | 2433/3741 [14:18:47<7:35:24, 20.89s/it] +2025-05-11 07:45:03 - ERROR - stderr - 65%|██████▌ | 2434/3741 [14:19:09<7:39:30, 21.09s/it] +2025-05-11 07:45:03 - ERROR - stderr - +2025-05-11 07:45:03 - ERROR - stderr - +2025-05-11 07:45:03 - INFO - stdout - {'loss': 0.7059, 'grad_norm': 0.6909292340278625, 'learning_rate': 5.749416786913954e-06, 'epoch': 1.95} +2025-05-11 07:45:03 - ERROR - stderr - 65%|██████▌ | 2434/3741 [14:19:09<7:39:30, 21.09s/it] +2025-05-11 07:45:22 - ERROR - stderr - 65%|██████▌ | 2435/3741 [14:19:28<7:29:44, 20.66s/it] +2025-05-11 07:45:22 - ERROR - stderr - +2025-05-11 07:45:22 - ERROR - stderr - +2025-05-11 07:45:22 - INFO - stdout - {'loss': 0.7367, 'grad_norm': 0.6980206966400146, 'learning_rate': 5.741580281331204e-06, 'epoch': 1.95} +2025-05-11 07:45:22 - ERROR - stderr - 65%|██████▌ | 2435/3741 [14:19:29<7:29:44, 20.66s/it] +2025-05-11 07:45:42 - ERROR - stderr - 65%|██████▌ | 2436/3741 [14:19:48<7:24:16, 20.43s/it] +2025-05-11 07:45:42 - ERROR - stderr - +2025-05-11 07:45:42 - ERROR - stderr - +2025-05-11 07:45:42 - INFO - stdout - {'loss': 0.6925, 'grad_norm': 0.7224229574203491, 'learning_rate': 5.733746968856585e-06, 'epoch': 1.95} +2025-05-11 07:45:42 - ERROR - stderr - 65%|██████▌ | 2436/3741 [14:19:48<7:24:16, 20.43s/it] +2025-05-11 07:46:02 - ERROR - stderr - 65%|██████▌ | 2437/3741 [14:20:08<7:20:32, 20.27s/it] +2025-05-11 07:46:02 - ERROR - stderr - +2025-05-11 07:46:02 - ERROR - stderr - +2025-05-11 07:46:02 - INFO - stdout - {'loss': 0.7288, 'grad_norm': 0.680873692035675, 'learning_rate': 5.7259168553637815e-06, 'epoch': 1.95} +2025-05-11 07:46:02 - ERROR - stderr - 65%|██████▌ | 2437/3741 [14:20:08<7:20:32, 20.27s/it] +2025-05-11 07:46:22 - ERROR - stderr - 65%|██████▌ | 2438/3741 [14:20:29<7:21:08, 20.31s/it] +2025-05-11 07:46:22 - ERROR - stderr - +2025-05-11 07:46:22 - ERROR - stderr - +2025-05-11 07:46:22 - INFO - stdout - {'loss': 0.6918, 'grad_norm': 0.6676924824714661, 'learning_rate': 5.718089946724078e-06, 'epoch': 1.96} +2025-05-11 07:46:22 - ERROR - stderr - 65%|██████▌ | 2438/3741 [14:20:29<7:21:08, 20.31s/it] +2025-05-11 07:46:43 - ERROR - stderr - 65%|██████▌ | 2439/3741 [14:20:49<7:22:36, 20.40s/it] +2025-05-11 07:46:43 - ERROR - stderr - +2025-05-11 07:46:43 - ERROR - stderr - +2025-05-11 07:46:43 - INFO - stdout - {'loss': 0.7167, 'grad_norm': 0.7096678614616394, 'learning_rate': 5.710266248806363e-06, 'epoch': 1.96} +2025-05-11 07:46:43 - ERROR - stderr - 65%|██████▌ | 2439/3741 [14:20:49<7:22:36, 20.40s/it] +2025-05-11 07:47:03 - ERROR - stderr - 65%|██████▌ | 2440/3741 [14:21:09<7:19:13, 20.26s/it] +2025-05-11 07:47:03 - ERROR - stderr - +2025-05-11 07:47:03 - ERROR - stderr - +2025-05-11 07:47:03 - INFO - stdout - {'loss': 0.6996, 'grad_norm': 0.6810303926467896, 'learning_rate': 5.702445767477103e-06, 'epoch': 1.96} +2025-05-11 07:47:03 - ERROR - stderr - 65%|██████▌ | 2440/3741 [14:21:09<7:19:13, 20.26s/it] +2025-05-11 07:47:25 - ERROR - stderr - 65%|██████▌ | 2441/3741 [14:21:31<7:32:00, 20.86s/it] +2025-05-11 07:47:25 - ERROR - stderr - +2025-05-11 07:47:25 - ERROR - stderr - +2025-05-11 07:47:25 - INFO - stdout - {'loss': 0.7609, 'grad_norm': 0.6997901201248169, 'learning_rate': 5.6946285086003636e-06, 'epoch': 1.96} +2025-05-11 07:47:25 - ERROR - stderr - 65%|██████▌ | 2441/3741 [14:21:32<7:32:00, 20.86s/it] +2025-05-11 07:47:47 - ERROR - stderr - 65%|██████▌ | 2442/3741 [14:21:53<7:37:46, 21.14s/it] +2025-05-11 07:47:47 - ERROR - stderr - +2025-05-11 07:47:47 - ERROR - stderr - +2025-05-11 07:47:47 - INFO - stdout - {'loss': 0.7472, 'grad_norm': 0.7114957571029663, 'learning_rate': 5.686814478037795e-06, 'epoch': 1.96} +2025-05-11 07:47:47 - ERROR - stderr - 65%|██████▌ | 2442/3741 [14:21:53<7:37:46, 21.14s/it] +2025-05-11 07:48:24 - ERROR - stderr - 65%|██████▌ | 2443/3741 [14:22:31<9:22:02, 25.98s/it] +2025-05-11 07:48:24 - ERROR - stderr - +2025-05-11 07:48:24 - ERROR - stderr - +2025-05-11 07:48:24 - INFO - stdout - {'loss': 0.6977, 'grad_norm': 0.7095164656639099, 'learning_rate': 5.679003681648625e-06, 'epoch': 1.96} +2025-05-11 07:48:24 - ERROR - stderr - 65%|██████▌ | 2443/3741 [14:22:31<9:22:02, 25.98s/it] +2025-05-11 07:49:06 - ERROR - stderr - 65%|██████▌ | 2444/3741 [14:23:13<11:05:51, 30.80s/it] +2025-05-11 07:49:06 - ERROR - stderr - +2025-05-11 07:49:06 - ERROR - stderr - +2025-05-11 07:49:06 - INFO - stdout - {'loss': 0.7513, 'grad_norm': 0.7158694267272949, 'learning_rate': 5.671196125289647e-06, 'epoch': 1.96} +2025-05-11 07:49:06 - ERROR - stderr - 65%|██████▌ | 2444/3741 [14:23:13<11:05:51, 30.80s/it] +2025-05-11 07:49:34 - ERROR - stderr - 65%|██████▌ | 2445/3741 [14:23:40<10:44:12, 29.82s/it] +2025-05-11 07:49:34 - ERROR - stderr - +2025-05-11 07:49:34 - ERROR - stderr - +2025-05-11 07:49:34 - INFO - stdout - {'loss': 0.7194, 'grad_norm': 0.6895703673362732, 'learning_rate': 5.663391814815238e-06, 'epoch': 1.96} +2025-05-11 07:49:34 - ERROR - stderr - 65%|██████▌ | 2445/3741 [14:23:40<10:44:12, 29.82s/it] +2025-05-11 07:50:09 - ERROR - stderr - 65%|██████▌ | 2446/3741 [14:24:16<11:19:50, 31.50s/it] +2025-05-11 07:50:09 - ERROR - stderr - +2025-05-11 07:50:09 - ERROR - stderr - +2025-05-11 07:50:09 - INFO - stdout - {'loss': 0.7336, 'grad_norm': 0.6775211095809937, 'learning_rate': 5.655590756077334e-06, 'epoch': 1.96} +2025-05-11 07:50:09 - ERROR - stderr - 65%|██████▌ | 2446/3741 [14:24:16<11:19:50, 31.50s/it] +2025-05-11 07:50:36 - ERROR - stderr - 65%|██████▌ | 2447/3741 [14:24:42<10:47:00, 30.00s/it] +2025-05-11 07:50:36 - ERROR - stderr - +2025-05-11 07:50:36 - ERROR - stderr - +2025-05-11 07:50:36 - INFO - stdout - {'loss': 0.7276, 'grad_norm': 0.7095913290977478, 'learning_rate': 5.647792954925435e-06, 'epoch': 1.96} +2025-05-11 07:50:36 - ERROR - stderr - 65%|██████▌ | 2447/3741 [14:24:42<10:47:00, 30.00s/it] +2025-05-11 07:51:12 - ERROR - stderr - 65%|██████▌ | 2448/3741 [14:25:19<11:28:47, 31.96s/it] +2025-05-11 07:51:12 - ERROR - stderr - +2025-05-11 07:51:12 - ERROR - stderr - +2025-05-11 07:51:12 - INFO - stdout - {'loss': 0.7091, 'grad_norm': 0.6791907548904419, 'learning_rate': 5.639998417206602e-06, 'epoch': 1.96} +2025-05-11 07:51:12 - ERROR - stderr - 65%|██████▌ | 2448/3741 [14:25:19<11:28:47, 31.96s/it] +2025-05-11 07:51:52 - ERROR - stderr - 65%|██████▌ | 2449/3741 [14:25:59<12:20:57, 34.41s/it] +2025-05-11 07:51:52 - ERROR - stderr - +2025-05-11 07:51:52 - ERROR - stderr - +2025-05-11 07:51:52 - INFO - stdout - {'loss': 0.7246, 'grad_norm': 0.7308151125907898, 'learning_rate': 5.632207148765438e-06, 'epoch': 1.96} +2025-05-11 07:51:52 - ERROR - stderr - 65%|██████▌ | 2449/3741 [14:25:59<12:20:57, 34.41s/it] +2025-05-11 07:52:25 - ERROR - stderr - 65%|██████▌ | 2450/3741 [14:26:31<12:06:35, 33.77s/it] +2025-05-11 07:52:25 - ERROR - stderr - +2025-05-11 07:52:25 - ERROR - stderr - +2025-05-11 07:52:25 - INFO - stdout - {'loss': 0.7325, 'grad_norm': 0.7150318026542664, 'learning_rate': 5.6244191554441045e-06, 'epoch': 1.96} +2025-05-11 07:52:25 - ERROR - stderr - 65%|██████▌ | 2450/3741 [14:26:31<12:06:35, 33.77s/it] +2025-05-11 07:53:00 - ERROR - stderr - 66%|██████▌ | 2451/3741 [14:27:07<12:18:48, 34.36s/it] +2025-05-11 07:53:00 - ERROR - stderr - +2025-05-11 07:53:00 - ERROR - stderr - +2025-05-11 07:53:00 - INFO - stdout - {'loss': 0.7073, 'grad_norm': 0.6908929347991943, 'learning_rate': 5.616634443082303e-06, 'epoch': 1.97} +2025-05-11 07:53:00 - ERROR - stderr - 66%|██████▌ | 2451/3741 [14:27:07<12:18:48, 34.36s/it] +2025-05-11 07:53:46 - ERROR - stderr - 66%|██████▌ | 2452/3741 [14:27:53<13:32:19, 37.81s/it] +2025-05-11 07:53:46 - ERROR - stderr - +2025-05-11 07:53:46 - ERROR - stderr - +2025-05-11 07:53:46 - INFO - stdout - {'loss': 0.6903, 'grad_norm': 0.7129378318786621, 'learning_rate': 5.608853017517277e-06, 'epoch': 1.97} +2025-05-11 07:53:46 - ERROR - stderr - 66%|██████▌ | 2452/3741 [14:27:53<13:32:19, 37.81s/it] +2025-05-11 07:54:08 - ERROR - stderr - 66%|██████▌ | 2453/3741 [14:28:14<11:47:03, 32.94s/it] +2025-05-11 07:54:08 - ERROR - stderr - +2025-05-11 07:54:08 - ERROR - stderr - +2025-05-11 07:54:08 - INFO - stdout - {'loss': 0.7296, 'grad_norm': 0.6859320998191833, 'learning_rate': 5.601074884583809e-06, 'epoch': 1.97} +2025-05-11 07:54:08 - ERROR - stderr - 66%|██████▌ | 2453/3741 [14:28:14<11:47:03, 32.94s/it] +2025-05-11 07:54:29 - ERROR - stderr - 66%|██████▌ | 2454/3741 [14:28:35<10:29:18, 29.34s/it] +2025-05-11 07:54:29 - ERROR - stderr - +2025-05-11 07:54:29 - ERROR - stderr - +2025-05-11 07:54:29 - INFO - stdout - {'loss': 0.7437, 'grad_norm': 0.6935213208198547, 'learning_rate': 5.593300050114199e-06, 'epoch': 1.97} +2025-05-11 07:54:29 - ERROR - stderr - 66%|██████▌ | 2454/3741 [14:28:35<10:29:18, 29.34s/it] +2025-05-11 07:54:51 - ERROR - stderr - 66%|██████▌ | 2455/3741 [14:28:57<9:42:48, 27.19s/it] +2025-05-11 07:54:51 - ERROR - stderr - +2025-05-11 07:54:51 - ERROR - stderr - +2025-05-11 07:54:51 - INFO - stdout - {'loss': 0.7118, 'grad_norm': 0.6933846473693848, 'learning_rate': 5.585528519938288e-06, 'epoch': 1.97} +2025-05-11 07:54:51 - ERROR - stderr - 66%|██████▌ | 2455/3741 [14:28:57<9:42:48, 27.19s/it] +2025-05-11 07:55:11 - ERROR - stderr - 66%|██████▌ | 2456/3741 [14:29:18<8:58:32, 25.15s/it] +2025-05-11 07:55:11 - ERROR - stderr - +2025-05-11 07:55:11 - ERROR - stderr - +2025-05-11 07:55:11 - INFO - stdout - {'loss': 0.7091, 'grad_norm': 0.6689132452011108, 'learning_rate': 5.5777602998834345e-06, 'epoch': 1.97} +2025-05-11 07:55:11 - ERROR - stderr - 66%|██████▌ | 2456/3741 [14:29:18<8:58:32, 25.15s/it] +2025-05-11 07:55:35 - ERROR - stderr - 66%|██████▌ | 2457/3741 [14:29:41<8:45:57, 24.58s/it] +2025-05-11 07:55:35 - ERROR - stderr - +2025-05-11 07:55:35 - ERROR - stderr - +2025-05-11 07:55:35 - INFO - stdout - {'loss': 0.6749, 'grad_norm': 0.6558547616004944, 'learning_rate': 5.569995395774508e-06, 'epoch': 1.97} +2025-05-11 07:55:35 - ERROR - stderr - 66%|██████▌ | 2457/3741 [14:29:41<8:45:57, 24.58s/it] +2025-05-11 07:55:55 - ERROR - stderr - 66%|██████▌ | 2458/3741 [14:30:02<8:20:23, 23.40s/it] +2025-05-11 07:55:55 - ERROR - stderr - +2025-05-11 07:55:55 - ERROR - stderr - +2025-05-11 07:55:55 - INFO - stdout - {'loss': 0.7316, 'grad_norm': 0.680107057094574, 'learning_rate': 5.562233813433909e-06, 'epoch': 1.97} +2025-05-11 07:55:55 - ERROR - stderr - 66%|██████▌ | 2458/3741 [14:30:02<8:20:23, 23.40s/it] +2025-05-11 07:56:17 - ERROR - stderr - 66%|██████▌ | 2459/3741 [14:30:24<8:10:59, 22.98s/it] +2025-05-11 07:56:17 - ERROR - stderr - +2025-05-11 07:56:17 - ERROR - stderr - +2025-05-11 07:56:17 - INFO - stdout - {'loss': 0.7284, 'grad_norm': 0.6814321875572205, 'learning_rate': 5.5544755586815265e-06, 'epoch': 1.97} +2025-05-11 07:56:17 - ERROR - stderr - 66%|██████▌ | 2459/3741 [14:30:24<8:10:59, 22.98s/it] +2025-05-11 07:56:37 - ERROR - stderr - 66%|██████▌ | 2460/3741 [14:30:44<7:51:41, 22.09s/it] +2025-05-11 07:56:37 - ERROR - stderr - +2025-05-11 07:56:37 - ERROR - stderr - +2025-05-11 07:56:37 - INFO - stdout - {'loss': 0.7091, 'grad_norm': 0.6948514580726624, 'learning_rate': 5.546720637334769e-06, 'epoch': 1.97} +2025-05-11 07:56:37 - ERROR - stderr - 66%|██████▌ | 2460/3741 [14:30:44<7:51:41, 22.09s/it] +2025-05-11 07:56:59 - ERROR - stderr - 66%|██████▌ | 2461/3741 [14:31:05<7:45:48, 21.83s/it] +2025-05-11 07:56:59 - ERROR - stderr - +2025-05-11 07:56:59 - ERROR - stderr - +2025-05-11 07:56:59 - INFO - stdout - {'loss': 0.7373, 'grad_norm': 0.6681773066520691, 'learning_rate': 5.538969055208543e-06, 'epoch': 1.97} +2025-05-11 07:56:59 - ERROR - stderr - 66%|██████▌ | 2461/3741 [14:31:05<7:45:48, 21.83s/it] +2025-05-11 07:57:19 - ERROR - stderr - 66%|██████▌ | 2462/3741 [14:31:25<7:37:32, 21.46s/it] +2025-05-11 07:57:19 - ERROR - stderr - +2025-05-11 07:57:19 - ERROR - stderr - +2025-05-11 07:57:19 - INFO - stdout - {'loss': 0.7387, 'grad_norm': 0.7033309936523438, 'learning_rate': 5.5312208181152376e-06, 'epoch': 1.97} +2025-05-11 07:57:19 - ERROR - stderr - 66%|██████▌ | 2462/3741 [14:31:25<7:37:32, 21.46s/it] +2025-05-11 07:57:41 - ERROR - stderr - 66%|██████▌ | 2463/3741 [14:31:47<7:38:35, 21.53s/it] +2025-05-11 07:57:41 - ERROR - stderr - +2025-05-11 07:57:41 - ERROR - stderr - +2025-05-11 07:57:41 - INFO - stdout - {'loss': 0.7446, 'grad_norm': 0.6964126229286194, 'learning_rate': 5.523475931864759e-06, 'epoch': 1.98} +2025-05-11 07:57:41 - ERROR - stderr - 66%|██████▌ | 2463/3741 [14:31:47<7:38:35, 21.53s/it] +2025-05-11 07:58:01 - ERROR - stderr - 66%|██████▌ | 2464/3741 [14:32:08<7:31:05, 21.19s/it] +2025-05-11 07:58:01 - ERROR - stderr - +2025-05-11 07:58:01 - ERROR - stderr - +2025-05-11 07:58:01 - INFO - stdout - {'loss': 0.6558, 'grad_norm': 0.6732887029647827, 'learning_rate': 5.515734402264478e-06, 'epoch': 1.98} +2025-05-11 07:58:01 - ERROR - stderr - 66%|██████▌ | 2464/3741 [14:32:08<7:31:05, 21.19s/it] +2025-05-11 07:58:23 - ERROR - stderr - 66%|██████▌ | 2465/3741 [14:32:30<7:37:43, 21.52s/it] +2025-05-11 07:58:24 - ERROR - stderr - +2025-05-11 07:58:24 - ERROR - stderr - +2025-05-11 07:58:24 - INFO - stdout - {'loss': 0.7085, 'grad_norm': 0.6745656728744507, 'learning_rate': 5.5079962351192585e-06, 'epoch': 1.98} +2025-05-11 07:58:24 - ERROR - stderr - 66%|██████▌ | 2465/3741 [14:32:30<7:37:43, 21.52s/it] +2025-05-11 07:58:45 - ERROR - stderr - 66%|██████▌ | 2466/3741 [14:32:51<7:36:41, 21.49s/it] +2025-05-11 07:58:45 - ERROR - stderr - +2025-05-11 07:58:45 - ERROR - stderr - +2025-05-11 07:58:45 - INFO - stdout - {'loss': 0.7126, 'grad_norm': 0.7190232276916504, 'learning_rate': 5.500261436231447e-06, 'epoch': 1.98} +2025-05-11 07:58:45 - ERROR - stderr - 66%|██████▌ | 2466/3741 [14:32:51<7:36:41, 21.49s/it] +2025-05-11 07:59:06 - ERROR - stderr - 66%|██████▌ | 2467/3741 [14:33:13<7:35:36, 21.46s/it] +2025-05-11 07:59:06 - ERROR - stderr - +2025-05-11 07:59:06 - ERROR - stderr - +2025-05-11 07:59:06 - INFO - stdout - {'loss': 0.7176, 'grad_norm': 0.6871313452720642, 'learning_rate': 5.4925300114008465e-06, 'epoch': 1.98} +2025-05-11 07:59:06 - ERROR - stderr - 66%|██████▌ | 2467/3741 [14:33:13<7:35:36, 21.46s/it] +2025-05-11 07:59:28 - ERROR - stderr - 66%|██████▌ | 2468/3741 [14:33:34<7:34:59, 21.44s/it] +2025-05-11 07:59:28 - ERROR - stderr - +2025-05-11 07:59:28 - ERROR - stderr - +2025-05-11 07:59:28 - INFO - stdout - {'loss': 0.7013, 'grad_norm': 0.6837944388389587, 'learning_rate': 5.4848019664247575e-06, 'epoch': 1.98} +2025-05-11 07:59:28 - ERROR - stderr - 66%|██████▌ | 2468/3741 [14:33:34<7:34:59, 21.44s/it] +2025-05-11 07:59:49 - ERROR - stderr - 66%|██████▌ | 2469/3741 [14:33:55<7:34:16, 21.43s/it] +2025-05-11 07:59:49 - ERROR - stderr - +2025-05-11 07:59:49 - ERROR - stderr - +2025-05-11 07:59:49 - INFO - stdout - {'loss': 0.733, 'grad_norm': 0.706548810005188, 'learning_rate': 5.4770773070979225e-06, 'epoch': 1.98} +2025-05-11 07:59:49 - ERROR - stderr - 66%|██████▌ | 2469/3741 [14:33:55<7:34:16, 21.43s/it] +2025-05-11 08:00:11 - ERROR - stderr - 66%|██████▌ | 2470/3741 [14:34:17<7:34:41, 21.46s/it] +2025-05-11 08:00:11 - ERROR - stderr - +2025-05-11 08:00:11 - ERROR - stderr - +2025-05-11 08:00:11 - INFO - stdout - {'loss': 0.7227, 'grad_norm': 0.682320773601532, 'learning_rate': 5.469356039212557e-06, 'epoch': 1.98} +2025-05-11 08:00:11 - ERROR - stderr - 66%|██████▌ | 2470/3741 [14:34:17<7:34:41, 21.46s/it] +2025-05-11 08:00:32 - ERROR - stderr - 66%|██████▌ | 2471/3741 [14:34:38<7:33:43, 21.44s/it] +2025-05-11 08:00:32 - ERROR - stderr - +2025-05-11 08:00:32 - ERROR - stderr - +2025-05-11 08:00:32 - INFO - stdout - {'loss': 0.7324, 'grad_norm': 0.7176364660263062, 'learning_rate': 5.461638168558332e-06, 'epoch': 1.98} +2025-05-11 08:00:32 - ERROR - stderr - 66%|██████▌ | 2471/3741 [14:34:38<7:33:43, 21.44s/it] +2025-05-11 08:00:52 - ERROR - stderr - 66%|██████▌ | 2472/3741 [14:34:58<7:24:05, 21.00s/it] +2025-05-11 08:00:52 - ERROR - stderr - +2025-05-11 08:00:52 - ERROR - stderr - +2025-05-11 08:00:52 - INFO - stdout - {'loss': 0.7153, 'grad_norm': 0.6608320474624634, 'learning_rate': 5.453923700922366e-06, 'epoch': 1.98} +2025-05-11 08:00:52 - ERROR - stderr - 66%|██████▌ | 2472/3741 [14:34:58<7:24:05, 21.00s/it] +2025-05-11 08:01:13 - ERROR - stderr - 66%|██████▌ | 2473/3741 [14:35:19<7:23:15, 20.97s/it] +2025-05-11 08:01:13 - ERROR - stderr - +2025-05-11 08:01:13 - ERROR - stderr - +2025-05-11 08:01:13 - INFO - stdout - {'loss': 0.7022, 'grad_norm': 0.6956177353858948, 'learning_rate': 5.446212642089228e-06, 'epoch': 1.98} +2025-05-11 08:01:13 - ERROR - stderr - 66%|██████▌ | 2473/3741 [14:35:19<7:23:15, 20.97s/it] +2025-05-11 08:01:33 - ERROR - stderr - 66%|██████▌ | 2474/3741 [14:35:39<7:14:36, 20.58s/it] +2025-05-11 08:01:33 - ERROR - stderr - +2025-05-11 08:01:33 - ERROR - stderr - +2025-05-11 08:01:33 - INFO - stdout - {'loss': 0.7185, 'grad_norm': 0.6896581053733826, 'learning_rate': 5.4385049978409385e-06, 'epoch': 1.98} +2025-05-11 08:01:33 - ERROR - stderr - 66%|██████▌ | 2474/3741 [14:35:39<7:14:36, 20.58s/it] +2025-05-11 08:01:54 - ERROR - stderr - 66%|██████▌ | 2475/3741 [14:36:00<7:20:26, 20.87s/it] +2025-05-11 08:01:54 - ERROR - stderr - +2025-05-11 08:01:54 - ERROR - stderr - +2025-05-11 08:01:54 - INFO - stdout - {'loss': 0.7393, 'grad_norm': 0.7071901559829712, 'learning_rate': 5.430800773956948e-06, 'epoch': 1.98} +2025-05-11 08:01:54 - ERROR - stderr - 66%|██████▌ | 2475/3741 [14:36:00<7:20:26, 20.87s/it] +2025-05-11 08:02:14 - ERROR - stderr - 66%|██████▌ | 2476/3741 [14:36:20<7:12:53, 20.53s/it] +2025-05-11 08:02:14 - ERROR - stderr - +2025-05-11 08:02:14 - ERROR - stderr - +2025-05-11 08:02:14 - INFO - stdout - {'loss': 0.7563, 'grad_norm': 0.7110061645507812, 'learning_rate': 5.42309997621415e-06, 'epoch': 1.99} +2025-05-11 08:02:14 - ERROR - stderr - 66%|██████▌ | 2476/3741 [14:36:20<7:12:53, 20.53s/it] +2025-05-11 08:02:35 - ERROR - stderr - 66%|██████▌ | 2477/3741 [14:36:41<7:17:29, 20.77s/it] +2025-05-11 08:02:35 - ERROR - stderr - +2025-05-11 08:02:35 - ERROR - stderr - +2025-05-11 08:02:35 - INFO - stdout - {'loss': 0.7583, 'grad_norm': 0.7318345904350281, 'learning_rate': 5.415402610386859e-06, 'epoch': 1.99} +2025-05-11 08:02:35 - ERROR - stderr - 66%|██████▌ | 2477/3741 [14:36:42<7:17:29, 20.77s/it] +2025-05-11 08:02:55 - ERROR - stderr - 66%|██████▌ | 2478/3741 [14:37:02<7:12:45, 20.56s/it] +2025-05-11 08:02:55 - ERROR - stderr - +2025-05-11 08:02:55 - ERROR - stderr - +2025-05-11 08:02:55 - INFO - stdout - {'loss': 0.6964, 'grad_norm': 0.6681869029998779, 'learning_rate': 5.407708682246825e-06, 'epoch': 1.99} +2025-05-11 08:02:55 - ERROR - stderr - 66%|██████▌ | 2478/3741 [14:37:02<7:12:45, 20.56s/it] +2025-05-11 08:03:18 - ERROR - stderr - 66%|██████▋ | 2479/3741 [14:37:24<7:24:41, 21.14s/it] +2025-05-11 08:03:18 - ERROR - stderr - +2025-05-11 08:03:18 - ERROR - stderr - +2025-05-11 08:03:18 - INFO - stdout - {'loss': 0.7301, 'grad_norm': 0.6897268891334534, 'learning_rate': 5.400018197563217e-06, 'epoch': 1.99} +2025-05-11 08:03:18 - ERROR - stderr - 66%|██████▋ | 2479/3741 [14:37:24<7:24:41, 21.14s/it] +2025-05-11 08:03:38 - ERROR - stderr - 66%|██████▋ | 2480/3741 [14:37:44<7:19:09, 20.90s/it] +2025-05-11 08:03:38 - ERROR - stderr - +2025-05-11 08:03:38 - ERROR - stderr - +2025-05-11 08:03:38 - INFO - stdout - {'loss': 0.6904, 'grad_norm': 0.6932487487792969, 'learning_rate': 5.392331162102622e-06, 'epoch': 1.99} +2025-05-11 08:03:38 - ERROR - stderr - 66%|██████▋ | 2480/3741 [14:37:44<7:19:09, 20.90s/it] +2025-05-11 08:04:01 - ERROR - stderr - 66%|██████▋ | 2481/3741 [14:38:07<7:29:20, 21.40s/it] +2025-05-11 08:04:01 - ERROR - stderr - +2025-05-11 08:04:01 - ERROR - stderr - +2025-05-11 08:04:01 - INFO - stdout - {'loss': 0.7364, 'grad_norm': 0.7004687786102295, 'learning_rate': 5.384647581629045e-06, 'epoch': 1.99} +2025-05-11 08:04:01 - ERROR - stderr - 66%|██████▋ | 2481/3741 [14:38:07<7:29:20, 21.40s/it] +2025-05-11 08:04:23 - ERROR - stderr - 66%|██████▋ | 2482/3741 [14:38:29<7:32:34, 21.57s/it] +2025-05-11 08:04:23 - ERROR - stderr - +2025-05-11 08:04:23 - ERROR - stderr - +2025-05-11 08:04:23 - INFO - stdout - {'loss': 0.7311, 'grad_norm': 0.6893764734268188, 'learning_rate': 5.37696746190389e-06, 'epoch': 1.99} +2025-05-11 08:04:23 - ERROR - stderr - 66%|██████▋ | 2482/3741 [14:38:29<7:32:34, 21.57s/it] +2025-05-11 08:04:44 - ERROR - stderr - 66%|██████▋ | 2483/3741 [14:38:51<7:33:07, 21.61s/it] +2025-05-11 08:04:44 - ERROR - stderr - +2025-05-11 08:04:44 - ERROR - stderr - +2025-05-11 08:04:44 - INFO - stdout - {'loss': 0.7163, 'grad_norm': 0.7490194439888, 'learning_rate': 5.369290808685975e-06, 'epoch': 1.99} +2025-05-11 08:04:44 - ERROR - stderr - 66%|██████▋ | 2483/3741 [14:38:51<7:33:07, 21.61s/it] +2025-05-11 08:05:06 - ERROR - stderr - 66%|██████▋ | 2484/3741 [14:39:13<7:35:56, 21.76s/it] +2025-05-11 08:05:06 - ERROR - stderr - +2025-05-11 08:05:06 - ERROR - stderr - +2025-05-11 08:05:06 - INFO - stdout - {'loss': 0.7446, 'grad_norm': 0.7016685009002686, 'learning_rate': 5.3616176277315164e-06, 'epoch': 1.99} +2025-05-11 08:05:06 - ERROR - stderr - 66%|██████▋ | 2484/3741 [14:39:13<7:35:56, 21.76s/it] +2025-05-11 08:05:29 - ERROR - stderr - 66%|██████▋ | 2485/3741 [14:39:35<7:39:29, 21.95s/it] +2025-05-11 08:05:29 - ERROR - stderr - +2025-05-11 08:05:29 - ERROR - stderr - +2025-05-11 08:05:29 - INFO - stdout - {'loss': 0.7777, 'grad_norm': 0.7102388739585876, 'learning_rate': 5.353947924794129e-06, 'epoch': 1.99} +2025-05-11 08:05:29 - ERROR - stderr - 66%|██████▋ | 2485/3741 [14:39:35<7:39:29, 21.95s/it] +2025-05-11 08:05:50 - ERROR - stderr - 66%|██████▋ | 2486/3741 [14:39:57<7:37:18, 21.86s/it] +2025-05-11 08:05:51 - ERROR - stderr - +2025-05-11 08:05:51 - ERROR - stderr - +2025-05-11 08:05:51 - INFO - stdout - {'loss': 0.7304, 'grad_norm': 0.707472026348114, 'learning_rate': 5.346281705624812e-06, 'epoch': 1.99} +2025-05-11 08:05:51 - ERROR - stderr - 66%|██████▋ | 2486/3741 [14:39:57<7:37:18, 21.86s/it] +2025-05-11 08:06:16 - ERROR - stderr - 66%|██████▋ | 2487/3741 [14:40:23<8:01:24, 23.03s/it] +2025-05-11 08:06:16 - ERROR - stderr - +2025-05-11 08:06:16 - ERROR - stderr - +2025-05-11 08:06:16 - INFO - stdout - {'loss': 0.7545, 'grad_norm': 0.6962066888809204, 'learning_rate': 5.33861897597196e-06, 'epoch': 1.99} +2025-05-11 08:06:16 - ERROR - stderr - 66%|██████▋ | 2487/3741 [14:40:23<8:01:24, 23.03s/it] +2025-05-11 08:06:37 - ERROR - stderr - 67%|██████▋ | 2488/3741 [14:40:43<7:46:47, 22.35s/it] +2025-05-11 08:06:37 - ERROR - stderr - +2025-05-11 08:06:37 - ERROR - stderr - +2025-05-11 08:06:37 - INFO - stdout - {'loss': 0.712, 'grad_norm': 0.684525191783905, 'learning_rate': 5.330959741581347e-06, 'epoch': 2.0} +2025-05-11 08:06:37 - ERROR - stderr - 67%|██████▋ | 2488/3741 [14:40:43<7:46:47, 22.35s/it] +2025-05-11 08:07:06 - ERROR - stderr - 67%|██████▋ | 2489/3741 [14:41:12<8:26:48, 24.29s/it] +2025-05-11 08:07:06 - ERROR - stderr - +2025-05-11 08:07:06 - ERROR - stderr - +2025-05-11 08:07:06 - INFO - stdout - {'loss': 0.7334, 'grad_norm': 0.6817164421081543, 'learning_rate': 5.323304008196133e-06, 'epoch': 2.0} +2025-05-11 08:07:06 - ERROR - stderr - 67%|██████▋ | 2489/3741 [14:41:12<8:26:48, 24.29s/it] +2025-05-11 08:07:29 - ERROR - stderr - 67%|██████▋ | 2490/3741 [14:41:36<8:20:42, 24.02s/it] +2025-05-11 08:07:29 - ERROR - stderr - +2025-05-11 08:07:29 - ERROR - stderr - +2025-05-11 08:07:29 - INFO - stdout - {'loss': 0.7334, 'grad_norm': 0.6918975114822388, 'learning_rate': 5.3156517815568455e-06, 'epoch': 2.0} +2025-05-11 08:07:29 - ERROR - stderr - 67%|██████▋ | 2490/3741 [14:41:36<8:20:42, 24.02s/it] +2025-05-11 08:07:49 - ERROR - stderr - 67%|██████▋ | 2491/3741 [14:41:55<7:52:50, 22.70s/it] +2025-05-11 08:07:49 - ERROR - stderr - +2025-05-11 08:07:49 - ERROR - stderr - +2025-05-11 08:07:49 - INFO - stdout - {'loss': 0.7255, 'grad_norm': 0.6976943612098694, 'learning_rate': 5.30800306740138e-06, 'epoch': 2.0} +2025-05-11 08:07:49 - ERROR - stderr - 67%|██████▋ | 2491/3741 [14:41:55<7:52:50, 22.70s/it] +2025-05-11 08:08:10 - ERROR - stderr - 67%|██████▋ | 2492/3741 [14:42:16<7:43:15, 22.25s/it] +2025-05-11 08:08:10 - ERROR - stderr - +2025-05-11 08:08:10 - ERROR - stderr - +2025-05-11 08:08:10 - INFO - stdout - {'loss': 0.7338, 'grad_norm': 0.7042475938796997, 'learning_rate': 5.300357871465007e-06, 'epoch': 2.0} +2025-05-11 08:08:10 - ERROR - stderr - 67%|██████▋ | 2492/3741 [14:42:16<7:43:15, 22.25s/it] +2025-05-11 08:08:31 - ERROR - stderr - 67%|██████▋ | 2493/3741 [14:42:37<7:32:01, 21.73s/it] +2025-05-11 08:08:31 - ERROR - stderr - +2025-05-11 08:08:31 - ERROR - stderr - +2025-05-11 08:08:31 - INFO - stdout - {'loss': 0.7195, 'grad_norm': 0.6709238290786743, 'learning_rate': 5.292716199480354e-06, 'epoch': 2.0} +2025-05-11 08:08:31 - ERROR - stderr - 67%|██████▋ | 2493/3741 [14:42:37<7:32:01, 21.73s/it] +2025-05-11 08:08:49 - ERROR - stderr - 67%|██████▋ | 2494/3741 [14:42:55<7:12:16, 20.80s/it] +2025-05-11 08:08:49 - ERROR - stderr - +2025-05-11 08:08:49 - ERROR - stderr - +2025-05-11 08:08:49 - INFO - stdout - {'loss': 0.6162, 'grad_norm': 0.648729145526886, 'learning_rate': 5.285078057177406e-06, 'epoch': 2.0} +2025-05-11 08:08:49 - ERROR - stderr - 67%|██████▋ | 2494/3741 [14:42:56<7:12:16, 20.80s/it] +2025-05-11 08:09:11 - ERROR - stderr - 67%|██████▋ | 2495/3741 [14:43:17<7:18:24, 21.11s/it] +2025-05-11 08:09:11 - ERROR - stderr - +2025-05-11 08:09:11 - ERROR - stderr - +2025-05-11 08:09:11 - INFO - stdout - {'loss': 0.5518, 'grad_norm': 0.6973950862884521, 'learning_rate': 5.277443450283508e-06, 'epoch': 2.0} +2025-05-11 08:09:11 - ERROR - stderr - 67%|██████▋ | 2495/3741 [14:43:17<7:18:24, 21.11s/it] +2025-05-11 08:09:31 - ERROR - stderr - 67%|██████▋ | 2496/3741 [14:43:37<7:09:45, 20.71s/it] +2025-05-11 08:09:31 - ERROR - stderr - +2025-05-11 08:09:31 - ERROR - stderr - +2025-05-11 08:09:31 - INFO - stdout - {'loss': 0.5418, 'grad_norm': 0.6707605123519897, 'learning_rate': 5.269812384523341e-06, 'epoch': 2.0} +2025-05-11 08:09:31 - ERROR - stderr - 67%|██████▋ | 2496/3741 [14:43:37<7:09:45, 20.71s/it] +2025-05-11 08:09:51 - ERROR - stderr - 67%|██████▋ | 2497/3741 [14:43:57<7:06:11, 20.56s/it] +2025-05-11 08:09:51 - ERROR - stderr - +2025-05-11 08:09:51 - ERROR - stderr - +2025-05-11 08:09:51 - INFO - stdout - {'loss': 0.5272, 'grad_norm': 0.6641839146614075, 'learning_rate': 5.262184865618938e-06, 'epoch': 2.0} +2025-05-11 08:09:51 - ERROR - stderr - 67%|██████▋ | 2497/3741 [14:43:57<7:06:11, 20.56s/it] +2025-05-11 08:10:12 - ERROR - stderr - 67%|██████▋ | 2498/3741 [14:44:18<7:07:19, 20.63s/it] +2025-05-11 08:10:12 - ERROR - stderr - +2025-05-11 08:10:12 - ERROR - stderr - +2025-05-11 08:10:12 - INFO - stdout - {'loss': 0.5537, 'grad_norm': 0.703292191028595, 'learning_rate': 5.254560899289679e-06, 'epoch': 2.0} +2025-05-11 08:10:12 - ERROR - stderr - 67%|██████▋ | 2498/3741 [14:44:18<7:07:19, 20.63s/it] +2025-05-11 08:10:34 - ERROR - stderr - 67%|██████▋ | 2499/3741 [14:44:40<7:15:46, 21.05s/it] +2025-05-11 08:10:34 - ERROR - stderr - +2025-05-11 08:10:34 - ERROR - stderr - +2025-05-11 08:10:34 - INFO - stdout - {'loss': 0.5197, 'grad_norm': 0.7050741314888, 'learning_rate': 5.246940491252263e-06, 'epoch': 2.0} +2025-05-11 08:10:34 - ERROR - stderr - 67%|██████▋ | 2499/3741 [14:44:40<7:15:46, 21.05s/it] +2025-05-11 08:10:54 - ERROR - stderr - 67%|██████▋ | 2500/3741 [14:45:01<7:11:25, 20.86s/it] +2025-05-11 08:10:54 - ERROR - stderr - +2025-05-11 08:10:54 - ERROR - stderr - +2025-05-11 08:10:54 - INFO - stdout - {'loss': 0.5039, 'grad_norm': 0.7010351419448853, 'learning_rate': 5.239323647220744e-06, 'epoch': 2.0} +2025-05-11 08:10:54 - ERROR - stderr - 67%|██████▋ | 2500/3741 [14:45:01<7:11:25, 20.86s/it] +2025-05-11 08:11:17 - ERROR - stderr - 67%|██████▋ | 2501/3741 [14:45:23<7:20:56, 21.34s/it] +2025-05-11 08:11:17 - ERROR - stderr - +2025-05-11 08:11:17 - ERROR - stderr - +2025-05-11 08:11:17 - INFO - stdout - {'loss': 0.5257, 'grad_norm': 0.7894969582557678, 'learning_rate': 5.231710372906482e-06, 'epoch': 2.01} +2025-05-11 08:11:17 - ERROR - stderr - 67%|██████▋ | 2501/3741 [14:45:23<7:20:56, 21.34s/it] +2025-05-11 08:11:40 - ERROR - stderr - 67%|██████▋ | 2502/3741 [14:45:46<7:33:03, 21.94s/it] +2025-05-11 08:11:40 - ERROR - stderr - +2025-05-11 08:11:40 - ERROR - stderr - +2025-05-11 08:11:40 - INFO - stdout - {'loss': 0.5038, 'grad_norm': 0.8365249633789062, 'learning_rate': 5.224100674018173e-06, 'epoch': 2.01} +2025-05-11 08:11:40 - ERROR - stderr - 67%|██████▋ | 2502/3741 [14:45:46<7:33:03, 21.94s/it] +2025-05-11 08:12:04 - ERROR - stderr - 67%|██████▋ | 2503/3741 [14:46:10<7:44:57, 22.53s/it] +2025-05-11 08:12:04 - ERROR - stderr - +2025-05-11 08:12:04 - ERROR - stderr - +2025-05-11 08:12:04 - INFO - stdout - {'loss': 0.5347, 'grad_norm': 0.9526363611221313, 'learning_rate': 5.216494556261831e-06, 'epoch': 2.01} +2025-05-11 08:12:04 - ERROR - stderr - 67%|██████▋ | 2503/3741 [14:46:10<7:44:57, 22.53s/it] +2025-05-11 08:12:26 - ERROR - stderr - 67%|██████▋ | 2504/3741 [14:46:32<7:42:23, 22.43s/it] +2025-05-11 08:12:26 - ERROR - stderr - +2025-05-11 08:12:26 - ERROR - stderr - +2025-05-11 08:12:26 - INFO - stdout - {'loss': 0.506, 'grad_norm': 0.9146489500999451, 'learning_rate': 5.208892025340772e-06, 'epoch': 2.01} +2025-05-11 08:12:26 - ERROR - stderr - 67%|██████▋ | 2504/3741 [14:46:32<7:42:23, 22.43s/it] +2025-05-11 08:12:49 - ERROR - stderr - 67%|██████▋ | 2505/3741 [14:46:55<7:43:43, 22.51s/it] +2025-05-11 08:12:49 - ERROR - stderr - +2025-05-11 08:12:49 - ERROR - stderr - +2025-05-11 08:12:49 - INFO - stdout - {'loss': 0.5, 'grad_norm': 0.8940325975418091, 'learning_rate': 5.201293086955646e-06, 'epoch': 2.01} +2025-05-11 08:12:49 - ERROR - stderr - 67%|██████▋ | 2505/3741 [14:46:55<7:43:43, 22.51s/it] +2025-05-11 08:13:10 - ERROR - stderr - 67%|██████▋ | 2506/3741 [14:47:16<7:35:08, 22.11s/it] +2025-05-11 08:13:10 - ERROR - stderr - +2025-05-11 08:13:10 - ERROR - stderr - +2025-05-11 08:13:10 - INFO - stdout - {'loss': 0.5178, 'grad_norm': 0.9119753837585449, 'learning_rate': 5.193697746804386e-06, 'epoch': 2.01} +2025-05-11 08:13:10 - ERROR - stderr - 67%|██████▋ | 2506/3741 [14:47:16<7:35:08, 22.11s/it] +2025-05-11 08:13:32 - ERROR - stderr - 67%|██████▋ | 2507/3741 [14:47:39<7:35:22, 22.14s/it] +2025-05-11 08:13:32 - ERROR - stderr - +2025-05-11 08:13:32 - ERROR - stderr - +2025-05-11 08:13:32 - INFO - stdout - {'loss': 0.5075, 'grad_norm': 0.8597251176834106, 'learning_rate': 5.186106010582239e-06, 'epoch': 2.01} +2025-05-11 08:13:32 - ERROR - stderr - 67%|██████▋ | 2507/3741 [14:47:39<7:35:22, 22.14s/it] +2025-05-11 08:13:54 - ERROR - stderr - 67%|██████▋ | 2508/3741 [14:48:00<7:33:15, 22.06s/it] +2025-05-11 08:13:54 - ERROR - stderr - +2025-05-11 08:13:54 - ERROR - stderr - +2025-05-11 08:13:54 - INFO - stdout - {'loss': 0.4928, 'grad_norm': 0.7838432192802429, 'learning_rate': 5.178517883981753e-06, 'epoch': 2.01} +2025-05-11 08:13:54 - ERROR - stderr - 67%|██████▋ | 2508/3741 [14:48:00<7:33:15, 22.06s/it] +2025-05-11 08:14:16 - ERROR - stderr - 67%|██████▋ | 2509/3741 [14:48:23<7:33:32, 22.09s/it] +2025-05-11 08:14:16 - ERROR - stderr - +2025-05-11 08:14:16 - ERROR - stderr - +2025-05-11 08:14:16 - INFO - stdout - {'loss': 0.5191, 'grad_norm': 0.7873410582542419, 'learning_rate': 5.170933372692752e-06, 'epoch': 2.01} +2025-05-11 08:14:16 - ERROR - stderr - 67%|██████▋ | 2509/3741 [14:48:23<7:33:32, 22.09s/it] +2025-05-11 08:14:36 - ERROR - stderr - 67%|██████▋ | 2510/3741 [14:48:43<7:21:32, 21.52s/it] +2025-05-11 08:14:36 - ERROR - stderr - +2025-05-11 08:14:36 - ERROR - stderr - +2025-05-11 08:14:36 - INFO - stdout - {'loss': 0.5039, 'grad_norm': 0.7258116006851196, 'learning_rate': 5.163352482402375e-06, 'epoch': 2.01} +2025-05-11 08:14:36 - ERROR - stderr - 67%|██████▋ | 2510/3741 [14:48:43<7:21:32, 21.52s/it] +2025-05-11 08:15:00 - ERROR - stderr - 67%|██████▋ | 2511/3741 [14:49:06<7:31:48, 22.04s/it] +2025-05-11 08:15:00 - ERROR - stderr - +2025-05-11 08:15:00 - ERROR - stderr - +2025-05-11 08:15:00 - INFO - stdout - {'loss': 0.5224, 'grad_norm': 0.8325080871582031, 'learning_rate': 5.15577521879502e-06, 'epoch': 2.01} +2025-05-11 08:15:00 - ERROR - stderr - 67%|██████▋ | 2511/3741 [14:49:06<7:31:48, 22.04s/it] +2025-05-11 08:15:19 - ERROR - stderr - 67%|██████▋ | 2512/3741 [14:49:26<7:17:29, 21.36s/it] +2025-05-11 08:15:19 - ERROR - stderr - +2025-05-11 08:15:19 - ERROR - stderr - +2025-05-11 08:15:19 - INFO - stdout - {'loss': 0.4951, 'grad_norm': 0.7581323385238647, 'learning_rate': 5.148201587552384e-06, 'epoch': 2.01} +2025-05-11 08:15:19 - ERROR - stderr - 67%|██████▋ | 2512/3741 [14:49:26<7:17:29, 21.36s/it] +2025-05-11 08:15:44 - ERROR - stderr - 67%|██████▋ | 2513/3741 [14:49:50<7:33:53, 22.18s/it] +2025-05-11 08:15:44 - ERROR - stderr - +2025-05-11 08:15:44 - ERROR - stderr - +2025-05-11 08:15:44 - INFO - stdout - {'loss': 0.5489, 'grad_norm': 0.7743967175483704, 'learning_rate': 5.140631594353434e-06, 'epoch': 2.02} +2025-05-11 08:15:44 - ERROR - stderr - 67%|██████▋ | 2513/3741 [14:49:50<7:33:53, 22.18s/it] +2025-05-11 08:16:03 - ERROR - stderr - 67%|██████▋ | 2514/3741 [14:50:09<7:17:52, 21.41s/it] +2025-05-11 08:16:03 - ERROR - stderr - +2025-05-11 08:16:03 - ERROR - stderr - +2025-05-11 08:16:03 - INFO - stdout - {'loss': 0.4962, 'grad_norm': 0.7546889185905457, 'learning_rate': 5.133065244874404e-06, 'epoch': 2.02} +2025-05-11 08:16:03 - ERROR - stderr - 67%|██████▋ | 2514/3741 [14:50:10<7:17:52, 21.41s/it] +2025-05-11 08:16:28 - ERROR - stderr - 67%|██████▋ | 2515/3741 [14:50:34<7:37:01, 22.37s/it] +2025-05-11 08:16:28 - ERROR - stderr - +2025-05-11 08:16:28 - ERROR - stderr - +2025-05-11 08:16:28 - INFO - stdout - {'loss': 0.4818, 'grad_norm': 0.725445568561554, 'learning_rate': 5.1255025447888005e-06, 'epoch': 2.02} +2025-05-11 08:16:28 - ERROR - stderr - 67%|██████▋ | 2515/3741 [14:50:34<7:37:01, 22.37s/it] +2025-05-11 08:16:49 - ERROR - stderr - 67%|██████▋ | 2516/3741 [14:50:55<7:27:03, 21.90s/it] +2025-05-11 08:16:49 - ERROR - stderr - +2025-05-11 08:16:49 - ERROR - stderr - +2025-05-11 08:16:49 - INFO - stdout - {'loss': 0.4848, 'grad_norm': 0.7801692485809326, 'learning_rate': 5.117943499767402e-06, 'epoch': 2.02} +2025-05-11 08:16:49 - ERROR - stderr - 67%|██████▋ | 2516/3741 [14:50:55<7:27:03, 21.90s/it] +2025-05-11 08:17:09 - ERROR - stderr - 67%|██████▋ | 2517/3741 [14:51:15<7:18:31, 21.50s/it] +2025-05-11 08:17:09 - ERROR - stderr - +2025-05-11 08:17:09 - ERROR - stderr - +2025-05-11 08:17:09 - INFO - stdout - {'loss': 0.4813, 'grad_norm': 0.7961569428443909, 'learning_rate': 5.110388115478222e-06, 'epoch': 2.02} +2025-05-11 08:17:09 - ERROR - stderr - 67%|██████▋ | 2517/3741 [14:51:15<7:18:31, 21.50s/it] +2025-05-11 08:17:30 - ERROR - stderr - 67%|██████▋ | 2518/3741 [14:51:36<7:12:12, 21.20s/it] +2025-05-11 08:17:30 - ERROR - stderr - +2025-05-11 08:17:30 - ERROR - stderr - +2025-05-11 08:17:30 - INFO - stdout - {'loss': 0.4887, 'grad_norm': 0.840562641620636, 'learning_rate': 5.102836397586564e-06, 'epoch': 2.02} +2025-05-11 08:17:30 - ERROR - stderr - 67%|██████▋ | 2518/3741 [14:51:36<7:12:12, 21.20s/it] +2025-05-11 08:17:50 - ERROR - stderr - 67%|██████▋ | 2519/3741 [14:51:57<7:07:50, 21.01s/it] +2025-05-11 08:17:50 - ERROR - stderr - +2025-05-11 08:17:50 - ERROR - stderr - +2025-05-11 08:17:50 - INFO - stdout - {'loss': 0.4874, 'grad_norm': 0.8071300983428955, 'learning_rate': 5.09528835175495e-06, 'epoch': 2.02} +2025-05-11 08:17:50 - ERROR - stderr - 67%|██████▋ | 2519/3741 [14:51:57<7:07:50, 21.01s/it] +2025-05-11 08:18:11 - ERROR - stderr - 67%|██████▋ | 2520/3741 [14:52:17<7:05:23, 20.90s/it] +2025-05-11 08:18:11 - ERROR - stderr - +2025-05-11 08:18:11 - ERROR - stderr - +2025-05-11 08:18:11 - INFO - stdout - {'loss': 0.4947, 'grad_norm': 0.8382665514945984, 'learning_rate': 5.087743983643165e-06, 'epoch': 2.02} +2025-05-11 08:18:11 - ERROR - stderr - 67%|██████▋ | 2520/3741 [14:52:17<7:05:23, 20.90s/it] +2025-05-11 08:18:32 - ERROR - stderr - 67%|██████▋ | 2521/3741 [14:52:38<7:05:07, 20.91s/it] +2025-05-11 08:18:32 - ERROR - stderr - +2025-05-11 08:18:32 - ERROR - stderr - +2025-05-11 08:18:32 - INFO - stdout - {'loss': 0.4644, 'grad_norm': 0.7823915481567383, 'learning_rate': 5.080203298908239e-06, 'epoch': 2.02} +2025-05-11 08:18:32 - ERROR - stderr - 67%|██████▋ | 2521/3741 [14:52:38<7:05:07, 20.91s/it] +2025-05-11 08:18:54 - ERROR - stderr - 67%|██████▋ | 2522/3741 [14:53:00<7:12:38, 21.29s/it] +2025-05-11 08:18:54 - ERROR - stderr - +2025-05-11 08:18:54 - ERROR - stderr - +2025-05-11 08:18:54 - INFO - stdout - {'loss': 0.4924, 'grad_norm': 0.8397455215454102, 'learning_rate': 5.072666303204421e-06, 'epoch': 2.02} +2025-05-11 08:18:54 - ERROR - stderr - 67%|██████▋ | 2522/3741 [14:53:00<7:12:38, 21.29s/it] +2025-05-11 08:19:14 - ERROR - stderr - 67%|██████▋ | 2523/3741 [14:53:20<7:05:04, 20.94s/it] +2025-05-11 08:19:14 - ERROR - stderr - +2025-05-11 08:19:14 - ERROR - stderr - +2025-05-11 08:19:14 - INFO - stdout - {'loss': 0.5191, 'grad_norm': 0.8278082609176636, 'learning_rate': 5.065133002183223e-06, 'epoch': 2.02} +2025-05-11 08:19:14 - ERROR - stderr - 67%|██████▋ | 2523/3741 [14:53:20<7:05:04, 20.94s/it] +2025-05-11 08:19:36 - ERROR - stderr - 67%|██████▋ | 2524/3741 [14:53:43<7:12:59, 21.35s/it] +2025-05-11 08:19:36 - ERROR - stderr - +2025-05-11 08:19:36 - ERROR - stderr - +2025-05-11 08:19:36 - INFO - stdout - {'loss': 0.4957, 'grad_norm': 0.8194684982299805, 'learning_rate': 5.057603401493358e-06, 'epoch': 2.02} +2025-05-11 08:19:36 - ERROR - stderr - 67%|██████▋ | 2524/3741 [14:53:43<7:12:59, 21.35s/it] +2025-05-11 08:19:57 - ERROR - stderr - 67%|██████▋ | 2525/3741 [14:54:03<7:08:40, 21.15s/it] +2025-05-11 08:19:57 - ERROR - stderr - +2025-05-11 08:19:57 - ERROR - stderr - +2025-05-11 08:19:57 - INFO - stdout - {'loss': 0.4892, 'grad_norm': 0.7626014947891235, 'learning_rate': 5.050077506780783e-06, 'epoch': 2.02} +2025-05-11 08:19:57 - ERROR - stderr - 67%|██████▋ | 2525/3741 [14:54:03<7:08:40, 21.15s/it] +2025-05-11 08:20:20 - ERROR - stderr - 68%|██████▊ | 2526/3741 [14:54:27<7:20:11, 21.74s/it] +2025-05-11 08:20:20 - ERROR - stderr - +2025-05-11 08:20:20 - ERROR - stderr - +2025-05-11 08:20:20 - INFO - stdout - {'loss': 0.5108, 'grad_norm': 0.7766503691673279, 'learning_rate': 5.042555323688673e-06, 'epoch': 2.03} +2025-05-11 08:20:20 - ERROR - stderr - 68%|██████▊ | 2526/3741 [14:54:27<7:20:11, 21.74s/it] +2025-05-11 08:20:41 - ERROR - stderr - 68%|██████▊ | 2527/3741 [14:54:48<7:15:46, 21.54s/it] +2025-05-11 08:20:41 - ERROR - stderr - +2025-05-11 08:20:41 - ERROR - stderr - +2025-05-11 08:20:41 - INFO - stdout - {'loss': 0.4472, 'grad_norm': 0.7892016768455505, 'learning_rate': 5.035036857857405e-06, 'epoch': 2.03} +2025-05-11 08:20:41 - ERROR - stderr - 68%|██████▊ | 2527/3741 [14:54:48<7:15:46, 21.54s/it] +2025-05-11 08:21:04 - ERROR - stderr - 68%|██████▊ | 2528/3741 [14:55:10<7:22:32, 21.89s/it] +2025-05-11 08:21:04 - ERROR - stderr - +2025-05-11 08:21:04 - ERROR - stderr - +2025-05-11 08:21:04 - INFO - stdout - {'loss': 0.4936, 'grad_norm': 0.7951651811599731, 'learning_rate': 5.027522114924597e-06, 'epoch': 2.03} +2025-05-11 08:21:04 - ERROR - stderr - 68%|██████▊ | 2528/3741 [14:55:10<7:22:32, 21.89s/it] +2025-05-11 08:21:24 - ERROR - stderr - 68%|██████▊ | 2529/3741 [14:55:30<7:09:45, 21.28s/it] +2025-05-11 08:21:24 - ERROR - stderr - +2025-05-11 08:21:24 - ERROR - stderr - +2025-05-11 08:21:24 - INFO - stdout - {'loss': 0.5195, 'grad_norm': 0.7646651864051819, 'learning_rate': 5.020011100525047e-06, 'epoch': 2.03} +2025-05-11 08:21:24 - ERROR - stderr - 68%|██████▊ | 2529/3741 [14:55:30<7:09:45, 21.28s/it] +2025-05-11 08:21:47 - ERROR - stderr - 68%|██████▊ | 2530/3741 [14:55:53<7:20:13, 21.81s/it] +2025-05-11 08:21:47 - ERROR - stderr - +2025-05-11 08:21:47 - ERROR - stderr - +2025-05-11 08:21:47 - INFO - stdout - {'loss': 0.4591, 'grad_norm': 0.7651566863059998, 'learning_rate': 5.0125038202907735e-06, 'epoch': 2.03} +2025-05-11 08:21:47 - ERROR - stderr - 68%|██████▊ | 2530/3741 [14:55:53<7:20:13, 21.81s/it] +2025-05-11 08:22:07 - ERROR - stderr - 68%|██████▊ | 2531/3741 [14:56:13<7:08:32, 21.25s/it] +2025-05-11 08:22:07 - ERROR - stderr - +2025-05-11 08:22:07 - ERROR - stderr - +2025-05-11 08:22:07 - INFO - stdout - {'loss': 0.5049, 'grad_norm': 0.814940333366394, 'learning_rate': 5.0050002798509956e-06, 'epoch': 2.03} +2025-05-11 08:22:07 - ERROR - stderr - 68%|██████▊ | 2531/3741 [14:56:13<7:08:32, 21.25s/it] +2025-05-11 08:22:30 - ERROR - stderr - 68%|██████▊ | 2532/3741 [14:56:36<7:19:42, 21.82s/it] +2025-05-11 08:22:30 - ERROR - stderr - +2025-05-11 08:22:30 - ERROR - stderr - +2025-05-11 08:22:30 - INFO - stdout - {'loss': 0.5016, 'grad_norm': 0.8493900895118713, 'learning_rate': 4.997500484832114e-06, 'epoch': 2.03} +2025-05-11 08:22:30 - ERROR - stderr - 68%|██████▊ | 2532/3741 [14:56:36<7:19:42, 21.82s/it] +2025-05-11 08:22:50 - ERROR - stderr - 68%|██████▊ | 2533/3741 [14:56:56<7:05:52, 21.15s/it] +2025-05-11 08:22:50 - ERROR - stderr - +2025-05-11 08:22:50 - ERROR - stderr - +2025-05-11 08:22:50 - INFO - stdout - {'loss': 0.4844, 'grad_norm': 0.8357752561569214, 'learning_rate': 4.990004440857735e-06, 'epoch': 2.03} +2025-05-11 08:22:50 - ERROR - stderr - 68%|██████▊ | 2533/3741 [14:56:56<7:05:52, 21.15s/it] +2025-05-11 08:23:12 - ERROR - stderr - 68%|██████▊ | 2534/3741 [14:57:19<7:15:16, 21.64s/it] +2025-05-11 08:23:12 - ERROR - stderr - +2025-05-11 08:23:12 - ERROR - stderr - +2025-05-11 08:23:12 - INFO - stdout - {'loss': 0.5135, 'grad_norm': 0.8459290862083435, 'learning_rate': 4.9825121535486475e-06, 'epoch': 2.03} +2025-05-11 08:23:12 - ERROR - stderr - 68%|██████▊ | 2534/3741 [14:57:19<7:15:16, 21.64s/it] +2025-05-11 08:23:33 - ERROR - stderr - 68%|██████▊ | 2535/3741 [14:57:40<7:10:12, 21.40s/it] +2025-05-11 08:23:33 - ERROR - stderr - +2025-05-11 08:23:33 - ERROR - stderr - +2025-05-11 08:23:33 - INFO - stdout - {'loss': 0.482, 'grad_norm': 0.7996906042098999, 'learning_rate': 4.975023628522825e-06, 'epoch': 2.03} +2025-05-11 08:23:33 - ERROR - stderr - 68%|██████▊ | 2535/3741 [14:57:40<7:10:12, 21.40s/it] +2025-05-11 08:23:56 - ERROR - stderr - 68%|██████▊ | 2536/3741 [14:58:02<7:16:48, 21.75s/it] +2025-05-11 08:23:56 - ERROR - stderr - +2025-05-11 08:23:56 - ERROR - stderr - +2025-05-11 08:23:56 - INFO - stdout - {'loss': 0.4959, 'grad_norm': 0.8636319041252136, 'learning_rate': 4.967538871395421e-06, 'epoch': 2.03} +2025-05-11 08:23:56 - ERROR - stderr - 68%|██████▊ | 2536/3741 [14:58:02<7:16:48, 21.75s/it] +2025-05-11 08:24:17 - ERROR - stderr - 68%|██████▊ | 2537/3741 [14:58:23<7:14:28, 21.65s/it] +2025-05-11 08:24:17 - ERROR - stderr - +2025-05-11 08:24:17 - ERROR - stderr - +2025-05-11 08:24:17 - INFO - stdout - {'loss': 0.4843, 'grad_norm': 0.813450276851654, 'learning_rate': 4.960057887778754e-06, 'epoch': 2.03} +2025-05-11 08:24:17 - ERROR - stderr - 68%|██████▊ | 2537/3741 [14:58:24<7:14:28, 21.65s/it] +2025-05-11 08:24:41 - ERROR - stderr - 68%|██████▊ | 2538/3741 [14:58:47<7:26:20, 22.26s/it] +2025-05-11 08:24:41 - ERROR - stderr - +2025-05-11 08:24:41 - ERROR - stderr - +2025-05-11 08:24:41 - INFO - stdout - {'loss': 0.497, 'grad_norm': 0.807138979434967, 'learning_rate': 4.952580683282324e-06, 'epoch': 2.04} +2025-05-11 08:24:41 - ERROR - stderr - 68%|██████▊ | 2538/3741 [14:58:47<7:26:20, 22.26s/it] +2025-05-11 08:25:11 - ERROR - stderr - 68%|██████▊ | 2539/3741 [14:59:17<8:11:43, 24.55s/it] +2025-05-11 08:25:11 - ERROR - stderr - +2025-05-11 08:25:11 - ERROR - stderr - +2025-05-11 08:25:11 - INFO - stdout - {'loss': 0.4893, 'grad_norm': 0.8144460916519165, 'learning_rate': 4.945107263512794e-06, 'epoch': 2.04} +2025-05-11 08:25:11 - ERROR - stderr - 68%|██████▊ | 2539/3741 [14:59:17<8:11:43, 24.55s/it] +2025-05-11 08:25:38 - ERROR - stderr - 68%|██████▊ | 2540/3741 [14:59:45<8:29:09, 25.44s/it] +2025-05-11 08:25:38 - ERROR - stderr - +2025-05-11 08:25:38 - ERROR - stderr - +2025-05-11 08:25:38 - INFO - stdout - {'loss': 0.5016, 'grad_norm': 0.8009450435638428, 'learning_rate': 4.937637634073988e-06, 'epoch': 2.04} +2025-05-11 08:25:38 - ERROR - stderr - 68%|██████▊ | 2540/3741 [14:59:45<8:29:09, 25.44s/it] +2025-05-11 08:26:01 - ERROR - stderr - 68%|██████▊ | 2541/3741 [15:00:08<8:14:50, 24.74s/it] +2025-05-11 08:26:01 - ERROR - stderr - +2025-05-11 08:26:01 - ERROR - stderr - +2025-05-11 08:26:01 - INFO - stdout - {'loss': 0.4966, 'grad_norm': 0.7973288893699646, 'learning_rate': 4.930171800566893e-06, 'epoch': 2.04} +2025-05-11 08:26:01 - ERROR - stderr - 68%|██████▊ | 2541/3741 [15:00:08<8:14:50, 24.74s/it] +2025-05-11 08:26:27 - ERROR - stderr - 68%|██████▊ | 2542/3741 [15:00:34<8:22:07, 25.13s/it] +2025-05-11 08:26:27 - ERROR - stderr - +2025-05-11 08:26:27 - ERROR - stderr - +2025-05-11 08:26:27 - INFO - stdout - {'loss': 0.5013, 'grad_norm': 0.773374617099762, 'learning_rate': 4.922709768589638e-06, 'epoch': 2.04} +2025-05-11 08:26:27 - ERROR - stderr - 68%|██████▊ | 2542/3741 [15:00:34<8:22:07, 25.13s/it] +2025-05-11 08:26:51 - ERROR - stderr - 68%|██████▊ | 2543/3741 [15:00:58<8:13:48, 24.73s/it] +2025-05-11 08:26:51 - ERROR - stderr - +2025-05-11 08:26:51 - ERROR - stderr - +2025-05-11 08:26:51 - INFO - stdout - {'loss': 0.4926, 'grad_norm': 0.8175215125083923, 'learning_rate': 4.915251543737512e-06, 'epoch': 2.04} +2025-05-11 08:26:51 - ERROR - stderr - 68%|██████▊ | 2543/3741 [15:00:58<8:13:48, 24.73s/it] +2025-05-11 08:27:11 - ERROR - stderr - 68%|██████▊ | 2544/3741 [15:01:18<7:45:18, 23.32s/it] +2025-05-11 08:27:11 - ERROR - stderr - +2025-05-11 08:27:11 - ERROR - stderr - +2025-05-11 08:27:11 - INFO - stdout - {'loss': 0.5069, 'grad_norm': 0.8121562600135803, 'learning_rate': 4.907797131602945e-06, 'epoch': 2.04} +2025-05-11 08:27:11 - ERROR - stderr - 68%|██████▊ | 2544/3741 [15:01:18<7:45:18, 23.32s/it] +2025-05-11 08:27:35 - ERROR - stderr - 68%|██████▊ | 2545/3741 [15:01:41<7:47:49, 23.47s/it] +2025-05-11 08:27:35 - ERROR - stderr - +2025-05-11 08:27:35 - ERROR - stderr - +2025-05-11 08:27:35 - INFO - stdout - {'loss': 0.4978, 'grad_norm': 0.7891466021537781, 'learning_rate': 4.900346537775513e-06, 'epoch': 2.04} +2025-05-11 08:27:35 - ERROR - stderr - 68%|██████▊ | 2545/3741 [15:01:41<7:47:49, 23.47s/it] +2025-05-11 08:27:58 - ERROR - stderr - 68%|██████▊ | 2546/3741 [15:02:05<7:46:39, 23.43s/it] +2025-05-11 08:27:58 - ERROR - stderr - +2025-05-11 08:27:58 - ERROR - stderr - +2025-05-11 08:27:58 - INFO - stdout - {'loss': 0.4882, 'grad_norm': 0.7856062650680542, 'learning_rate': 4.89289976784192e-06, 'epoch': 2.04} +2025-05-11 08:27:58 - ERROR - stderr - 68%|██████▊ | 2546/3741 [15:02:05<7:46:39, 23.43s/it] +2025-05-11 08:28:21 - ERROR - stderr - 68%|██████▊ | 2547/3741 [15:02:28<7:43:04, 23.27s/it] +2025-05-11 08:28:21 - ERROR - stderr - +2025-05-11 08:28:21 - ERROR - stderr - +2025-05-11 08:28:21 - INFO - stdout - {'loss': 0.4836, 'grad_norm': 0.7932535409927368, 'learning_rate': 4.885456827386008e-06, 'epoch': 2.04} +2025-05-11 08:28:21 - ERROR - stderr - 68%|██████▊ | 2547/3741 [15:02:28<7:43:04, 23.27s/it] +2025-05-11 08:28:41 - ERROR - stderr - 68%|██████▊ | 2548/3741 [15:02:47<7:22:20, 22.25s/it] +2025-05-11 08:28:41 - ERROR - stderr - +2025-05-11 08:28:41 - ERROR - stderr - +2025-05-11 08:28:41 - INFO - stdout - {'loss': 0.4609, 'grad_norm': 0.7187968492507935, 'learning_rate': 4.87801772198875e-06, 'epoch': 2.04} +2025-05-11 08:28:41 - ERROR - stderr - 68%|██████▊ | 2548/3741 [15:02:47<7:22:20, 22.25s/it] +2025-05-11 08:29:01 - ERROR - stderr - 68%|██████▊ | 2549/3741 [15:03:07<7:07:39, 21.53s/it] +2025-05-11 08:29:01 - ERROR - stderr - +2025-05-11 08:29:01 - ERROR - stderr - +2025-05-11 08:29:01 - INFO - stdout - {'loss': 0.5197, 'grad_norm': 0.8403437733650208, 'learning_rate': 4.870582457228239e-06, 'epoch': 2.04} +2025-05-11 08:29:01 - ERROR - stderr - 68%|██████▊ | 2549/3741 [15:03:07<7:07:39, 21.53s/it] +2025-05-11 08:29:21 - ERROR - stderr - 68%|██████▊ | 2550/3741 [15:03:27<6:59:10, 21.12s/it] +2025-05-11 08:29:21 - ERROR - stderr - +2025-05-11 08:29:21 - ERROR - stderr - +2025-05-11 08:29:21 - INFO - stdout - {'loss': 0.5118, 'grad_norm': 0.8300922513008118, 'learning_rate': 4.863151038679694e-06, 'epoch': 2.04} +2025-05-11 08:29:21 - ERROR - stderr - 68%|██████▊ | 2550/3741 [15:03:27<6:59:10, 21.12s/it] +2025-05-11 08:29:41 - ERROR - stderr - 68%|██████▊ | 2551/3741 [15:03:47<6:51:04, 20.73s/it] +2025-05-11 08:29:41 - ERROR - stderr - +2025-05-11 08:29:41 - ERROR - stderr - +2025-05-11 08:29:41 - INFO - stdout - {'loss': 0.4769, 'grad_norm': 0.8011190891265869, 'learning_rate': 4.855723471915438e-06, 'epoch': 2.05} +2025-05-11 08:29:41 - ERROR - stderr - 68%|██████▊ | 2551/3741 [15:03:47<6:51:04, 20.73s/it] +2025-05-11 08:30:04 - ERROR - stderr - 68%|██████▊ | 2552/3741 [15:04:10<7:01:51, 21.29s/it] +2025-05-11 08:30:04 - ERROR - stderr - +2025-05-11 08:30:04 - ERROR - stderr - +2025-05-11 08:30:04 - INFO - stdout - {'loss': 0.5017, 'grad_norm': 0.8211809992790222, 'learning_rate': 4.848299762504918e-06, 'epoch': 2.05} +2025-05-11 08:30:04 - ERROR - stderr - 68%|██████▊ | 2552/3741 [15:04:10<7:01:51, 21.29s/it] +2025-05-11 08:30:24 - ERROR - stderr - 68%|██████▊ | 2553/3741 [15:04:30<6:55:24, 20.98s/it] +2025-05-11 08:30:24 - ERROR - stderr - +2025-05-11 08:30:24 - ERROR - stderr - +2025-05-11 08:30:24 - INFO - stdout - {'loss': 0.4776, 'grad_norm': 0.7941953539848328, 'learning_rate': 4.840879916014683e-06, 'epoch': 2.05} +2025-05-11 08:30:24 - ERROR - stderr - 68%|██████▊ | 2553/3741 [15:04:30<6:55:24, 20.98s/it] +2025-05-11 08:30:48 - ERROR - stderr - 68%|██████▊ | 2554/3741 [15:04:54<7:12:18, 21.85s/it] +2025-05-11 08:30:48 - ERROR - stderr - +2025-05-11 08:30:48 - ERROR - stderr - +2025-05-11 08:30:48 - INFO - stdout - {'loss': 0.4887, 'grad_norm': 0.8611568212509155, 'learning_rate': 4.833463938008387e-06, 'epoch': 2.05} +2025-05-11 08:30:48 - ERROR - stderr - 68%|██████▊ | 2554/3741 [15:04:54<7:12:18, 21.85s/it] +2025-05-11 08:31:10 - ERROR - stderr - 68%|██████▊ | 2555/3741 [15:05:16<7:13:49, 21.95s/it] +2025-05-11 08:31:10 - ERROR - stderr - +2025-05-11 08:31:10 - ERROR - stderr - +2025-05-11 08:31:10 - INFO - stdout - {'loss': 0.508, 'grad_norm': 0.8546658754348755, 'learning_rate': 4.826051834046787e-06, 'epoch': 2.05} +2025-05-11 08:31:10 - ERROR - stderr - 68%|██████▊ | 2555/3741 [15:05:16<7:13:49, 21.95s/it] +2025-05-11 08:31:33 - ERROR - stderr - 68%|██████▊ | 2556/3741 [15:05:40<7:23:12, 22.44s/it] +2025-05-11 08:31:33 - ERROR - stderr - +2025-05-11 08:31:33 - ERROR - stderr - +2025-05-11 08:31:33 - INFO - stdout - {'loss': 0.5016, 'grad_norm': 0.8082013726234436, 'learning_rate': 4.818643609687724e-06, 'epoch': 2.05} +2025-05-11 08:31:33 - ERROR - stderr - 68%|██████▊ | 2556/3741 [15:05:40<7:23:12, 22.44s/it] +2025-05-11 08:31:55 - ERROR - stderr - 68%|██████▊ | 2557/3741 [15:06:01<7:18:21, 22.21s/it] +2025-05-11 08:31:55 - ERROR - stderr - +2025-05-11 08:31:55 - ERROR - stderr - +2025-05-11 08:31:55 - INFO - stdout - {'loss': 0.4957, 'grad_norm': 0.868209183216095, 'learning_rate': 4.811239270486139e-06, 'epoch': 2.05} +2025-05-11 08:31:55 - ERROR - stderr - 68%|██████▊ | 2557/3741 [15:06:01<7:18:21, 22.21s/it] +2025-05-11 08:32:19 - ERROR - stderr - 68%|██████▊ | 2558/3741 [15:06:25<7:27:11, 22.68s/it] +2025-05-11 08:32:19 - ERROR - stderr - +2025-05-11 08:32:19 - ERROR - stderr - +2025-05-11 08:32:19 - INFO - stdout - {'loss': 0.4874, 'grad_norm': 0.8040471076965332, 'learning_rate': 4.803838821994062e-06, 'epoch': 2.05} +2025-05-11 08:32:19 - ERROR - stderr - 68%|██████▊ | 2558/3741 [15:06:25<7:27:11, 22.68s/it] +2025-05-11 08:32:40 - ERROR - stderr - 68%|██████▊ | 2559/3741 [15:06:46<7:15:41, 22.12s/it] +2025-05-11 08:32:40 - ERROR - stderr - +2025-05-11 08:32:40 - ERROR - stderr - +2025-05-11 08:32:40 - INFO - stdout - {'loss': 0.4845, 'grad_norm': 0.7826542854309082, 'learning_rate': 4.796442269760592e-06, 'epoch': 2.05} +2025-05-11 08:32:40 - ERROR - stderr - 68%|██████▊ | 2559/3741 [15:06:46<7:15:41, 22.12s/it] +2025-05-11 08:33:00 - ERROR - stderr - 68%|██████▊ | 2560/3741 [15:07:06<7:03:10, 21.50s/it] +2025-05-11 08:33:00 - ERROR - stderr - +2025-05-11 08:33:00 - ERROR - stderr - +2025-05-11 08:33:00 - INFO - stdout - {'loss': 0.5049, 'grad_norm': 0.8145564794540405, 'learning_rate': 4.789049619331928e-06, 'epoch': 2.05} +2025-05-11 08:33:00 - ERROR - stderr - 68%|██████▊ | 2560/3741 [15:07:06<7:03:10, 21.50s/it] +2025-05-11 08:33:20 - ERROR - stderr - 68%|██████▊ | 2561/3741 [15:07:26<6:53:42, 21.04s/it] +2025-05-11 08:33:20 - ERROR - stderr - +2025-05-11 08:33:20 - ERROR - stderr - +2025-05-11 08:33:20 - INFO - stdout - {'loss': 0.4969, 'grad_norm': 0.8400808572769165, 'learning_rate': 4.781660876251322e-06, 'epoch': 2.05} +2025-05-11 08:33:20 - ERROR - stderr - 68%|██████▊ | 2561/3741 [15:07:26<6:53:42, 21.04s/it] +2025-05-11 08:33:40 - ERROR - stderr - 68%|██████▊ | 2562/3741 [15:07:46<6:46:17, 20.68s/it] +2025-05-11 08:33:40 - ERROR - stderr - +2025-05-11 08:33:40 - ERROR - stderr - +2025-05-11 08:33:40 - INFO - stdout - {'loss': 0.458, 'grad_norm': 0.8168050050735474, 'learning_rate': 4.774276046059107e-06, 'epoch': 2.05} +2025-05-11 08:33:40 - ERROR - stderr - 68%|██████▊ | 2562/3741 [15:07:46<6:46:17, 20.68s/it] +2025-05-11 08:34:03 - ERROR - stderr - 69%|██████▊ | 2563/3741 [15:08:09<7:01:10, 21.45s/it] +2025-05-11 08:34:03 - ERROR - stderr - +2025-05-11 08:34:03 - ERROR - stderr - +2025-05-11 08:34:03 - INFO - stdout - {'loss': 0.5026, 'grad_norm': 0.8102244734764099, 'learning_rate': 4.766895134292685e-06, 'epoch': 2.06} +2025-05-11 08:34:03 - ERROR - stderr - 69%|██████▊ | 2563/3741 [15:08:09<7:01:10, 21.45s/it] +2025-05-11 08:34:23 - ERROR - stderr - 69%|██████▊ | 2564/3741 [15:08:30<6:55:15, 21.17s/it] +2025-05-11 08:34:23 - ERROR - stderr - +2025-05-11 08:34:23 - ERROR - stderr - +2025-05-11 08:34:23 - INFO - stdout - {'loss': 0.5098, 'grad_norm': 0.8787121772766113, 'learning_rate': 4.759518146486504e-06, 'epoch': 2.06} +2025-05-11 08:34:23 - ERROR - stderr - 69%|██████▊ | 2564/3741 [15:08:30<6:55:15, 21.17s/it] +2025-05-11 08:34:46 - ERROR - stderr - 69%|██████▊ | 2565/3741 [15:08:52<7:03:56, 21.63s/it] +2025-05-11 08:34:46 - ERROR - stderr - +2025-05-11 08:34:46 - ERROR - stderr - +2025-05-11 08:34:46 - INFO - stdout - {'loss': 0.5131, 'grad_norm': 1.1912168264389038, 'learning_rate': 4.752145088172094e-06, 'epoch': 2.06} +2025-05-11 08:34:46 - ERROR - stderr - 69%|██████▊ | 2565/3741 [15:08:52<7:03:56, 21.63s/it] +2025-05-11 08:35:06 - ERROR - stderr - 69%|██████▊ | 2566/3741 [15:09:12<6:53:32, 21.12s/it] +2025-05-11 08:35:06 - ERROR - stderr - +2025-05-11 08:35:06 - ERROR - stderr - +2025-05-11 08:35:06 - INFO - stdout - {'loss': 0.4842, 'grad_norm': 0.7951311469078064, 'learning_rate': 4.744775964878017e-06, 'epoch': 2.06} +2025-05-11 08:35:06 - ERROR - stderr - 69%|██████▊ | 2566/3741 [15:09:12<6:53:32, 21.12s/it] +2025-05-11 08:35:28 - ERROR - stderr - 69%|██████▊ | 2567/3741 [15:09:35<7:00:01, 21.47s/it] +2025-05-11 08:35:28 - ERROR - stderr - +2025-05-11 08:35:28 - ERROR - stderr - +2025-05-11 08:35:28 - INFO - stdout - {'loss': 0.4933, 'grad_norm': 0.8363946080207825, 'learning_rate': 4.737410782129894e-06, 'epoch': 2.06} +2025-05-11 08:35:28 - ERROR - stderr - 69%|██████▊ | 2567/3741 [15:09:35<7:00:01, 21.47s/it] +2025-05-11 08:35:48 - ERROR - stderr - 69%|██████▊ | 2568/3741 [15:09:54<6:50:36, 21.00s/it] +2025-05-11 08:35:48 - ERROR - stderr - +2025-05-11 08:35:48 - ERROR - stderr - +2025-05-11 08:35:48 - INFO - stdout - {'loss': 0.4983, 'grad_norm': 0.8067214488983154, 'learning_rate': 4.730049545450394e-06, 'epoch': 2.06} +2025-05-11 08:35:48 - ERROR - stderr - 69%|██████▊ | 2568/3741 [15:09:55<6:50:36, 21.00s/it] +2025-05-11 08:35:49 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 08:35:49 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5854 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 08:36:11 - ERROR - stderr - 69%|██████▊ | 2569/3741 [15:10:17<6:58:00, 21.40s/it] +2025-05-11 08:36:11 - ERROR - stderr - +2025-05-11 08:36:11 - ERROR - stderr - +2025-05-11 08:36:11 - INFO - stdout - {'loss': 0.4895, 'grad_norm': 0.8054936528205872, 'learning_rate': 4.722692260359211e-06, 'epoch': 2.06} +2025-05-11 08:36:11 - ERROR - stderr - 69%|██████▊ | 2569/3741 [15:10:17<6:58:00, 21.40s/it] +2025-05-11 08:36:35 - ERROR - stderr - 69%|██████▊ | 2570/3741 [15:10:41<7:14:41, 22.27s/it] +2025-05-11 08:36:35 - ERROR - stderr - +2025-05-11 08:36:35 - ERROR - stderr - +2025-05-11 08:36:35 - INFO - stdout - {'loss': 0.4964, 'grad_norm': 0.8232284188270569, 'learning_rate': 4.715338932373107e-06, 'epoch': 2.06} +2025-05-11 08:36:35 - ERROR - stderr - 69%|██████▊ | 2570/3741 [15:10:41<7:14:41, 22.27s/it] +2025-05-11 08:36:57 - ERROR - stderr - 69%|██████▊ | 2571/3741 [15:11:04<7:16:38, 22.39s/it] +2025-05-11 08:36:58 - ERROR - stderr - +2025-05-11 08:36:58 - ERROR - stderr - +2025-05-11 08:36:58 - INFO - stdout - {'loss': 0.482, 'grad_norm': 0.8356310725212097, 'learning_rate': 4.707989567005845e-06, 'epoch': 2.06} +2025-05-11 08:36:58 - ERROR - stderr - 69%|██████▊ | 2571/3741 [15:11:04<7:16:38, 22.39s/it] +2025-05-11 08:37:18 - ERROR - stderr - 69%|██████▉ | 2572/3741 [15:11:24<7:03:26, 21.73s/it] +2025-05-11 08:37:18 - ERROR - stderr - +2025-05-11 08:37:18 - ERROR - stderr - +2025-05-11 08:37:18 - INFO - stdout - {'loss': 0.4894, 'grad_norm': 0.8142298460006714, 'learning_rate': 4.700644169768223e-06, 'epoch': 2.06} +2025-05-11 08:37:18 - ERROR - stderr - 69%|██████▉ | 2572/3741 [15:11:24<7:03:26, 21.73s/it] +2025-05-11 08:37:41 - ERROR - stderr - 69%|██████▉ | 2573/3741 [15:11:47<7:09:57, 22.09s/it] +2025-05-11 08:37:41 - ERROR - stderr - +2025-05-11 08:37:41 - ERROR - stderr - +2025-05-11 08:37:41 - INFO - stdout - {'loss': 0.4808, 'grad_norm': 0.8215280771255493, 'learning_rate': 4.693302746168088e-06, 'epoch': 2.06} +2025-05-11 08:37:41 - ERROR - stderr - 69%|██████▉ | 2573/3741 [15:11:47<7:09:57, 22.09s/it] +2025-05-11 08:38:00 - ERROR - stderr - 69%|██████▉ | 2574/3741 [15:12:07<6:56:38, 21.42s/it] +2025-05-11 08:38:00 - ERROR - stderr - +2025-05-11 08:38:00 - ERROR - stderr - +2025-05-11 08:38:00 - INFO - stdout - {'loss': 0.4986, 'grad_norm': 0.8594109416007996, 'learning_rate': 4.685965301710276e-06, 'epoch': 2.06} +2025-05-11 08:38:00 - ERROR - stderr - 69%|██████▉ | 2574/3741 [15:12:07<6:56:38, 21.42s/it] +2025-05-11 08:38:23 - ERROR - stderr - 69%|██████▉ | 2575/3741 [15:12:30<7:04:43, 21.86s/it] +2025-05-11 08:38:23 - ERROR - stderr - +2025-05-11 08:38:23 - ERROR - stderr - +2025-05-11 08:38:23 - INFO - stdout - {'loss': 0.4765, 'grad_norm': 0.7795203328132629, 'learning_rate': 4.678631841896657e-06, 'epoch': 2.06} +2025-05-11 08:38:23 - ERROR - stderr - 69%|██████▉ | 2575/3741 [15:12:30<7:04:43, 21.86s/it] +2025-05-11 08:38:43 - ERROR - stderr - 69%|██████▉ | 2576/3741 [15:12:50<6:54:31, 21.35s/it] +2025-05-11 08:38:44 - ERROR - stderr - +2025-05-11 08:38:44 - ERROR - stderr - +2025-05-11 08:38:44 - INFO - stdout - {'loss': 0.4933, 'grad_norm': 0.8334662318229675, 'learning_rate': 4.6713023722261106e-06, 'epoch': 2.07} +2025-05-11 08:38:44 - ERROR - stderr - 69%|██████▉ | 2576/3741 [15:12:50<6:54:31, 21.35s/it] +2025-05-11 08:39:08 - ERROR - stderr - 69%|██████▉ | 2577/3741 [15:13:14<7:09:39, 22.15s/it] +2025-05-11 08:39:08 - ERROR - stderr - +2025-05-11 08:39:08 - ERROR - stderr - +2025-05-11 08:39:08 - INFO - stdout - {'loss': 0.5096, 'grad_norm': 0.8101879954338074, 'learning_rate': 4.663976898194516e-06, 'epoch': 2.07} +2025-05-11 08:39:08 - ERROR - stderr - 69%|██████▉ | 2577/3741 [15:13:14<7:09:39, 22.15s/it] +2025-05-11 08:39:27 - ERROR - stderr - 69%|██████▉ | 2578/3741 [15:13:34<6:56:17, 21.48s/it] +2025-05-11 08:39:27 - ERROR - stderr - +2025-05-11 08:39:27 - ERROR - stderr - +2025-05-11 08:39:27 - INFO - stdout - {'loss': 0.4825, 'grad_norm': 0.8048333525657654, 'learning_rate': 4.656655425294774e-06, 'epoch': 2.07} +2025-05-11 08:39:27 - ERROR - stderr - 69%|██████▉ | 2578/3741 [15:13:34<6:56:17, 21.48s/it] +2025-05-11 08:39:51 - ERROR - stderr - 69%|██████▉ | 2579/3741 [15:13:57<7:06:25, 22.02s/it] +2025-05-11 08:39:51 - ERROR - stderr - +2025-05-11 08:39:51 - ERROR - stderr - +2025-05-11 08:39:51 - INFO - stdout - {'loss': 0.4801, 'grad_norm': 0.8437180519104004, 'learning_rate': 4.649337959016764e-06, 'epoch': 2.07} +2025-05-11 08:39:51 - ERROR - stderr - 69%|██████▉ | 2579/3741 [15:13:57<7:06:25, 22.02s/it] +2025-05-11 08:40:11 - ERROR - stderr - 69%|██████▉ | 2580/3741 [15:14:17<6:56:57, 21.55s/it] +2025-05-11 08:40:11 - ERROR - stderr - +2025-05-11 08:40:11 - ERROR - stderr - +2025-05-11 08:40:11 - INFO - stdout - {'loss': 0.4864, 'grad_norm': 0.8027264475822449, 'learning_rate': 4.6420245048473766e-06, 'epoch': 2.07} +2025-05-11 08:40:11 - ERROR - stderr - 69%|██████▉ | 2580/3741 [15:14:17<6:56:57, 21.55s/it] +2025-05-11 08:40:35 - ERROR - stderr - 69%|██████▉ | 2581/3741 [15:14:42<7:12:41, 22.38s/it] +2025-05-11 08:40:35 - ERROR - stderr - +2025-05-11 08:40:35 - ERROR - stderr - +2025-05-11 08:40:35 - INFO - stdout - {'loss': 0.494, 'grad_norm': 0.8142487406730652, 'learning_rate': 4.634715068270491e-06, 'epoch': 2.07} +2025-05-11 08:40:35 - ERROR - stderr - 69%|██████▉ | 2581/3741 [15:14:42<7:12:41, 22.38s/it] +2025-05-11 08:40:56 - ERROR - stderr - 69%|██████▉ | 2582/3741 [15:15:02<7:01:58, 21.84s/it] +2025-05-11 08:40:56 - ERROR - stderr - +2025-05-11 08:40:56 - ERROR - stderr - +2025-05-11 08:40:56 - INFO - stdout - {'loss': 0.4805, 'grad_norm': 0.8042910695075989, 'learning_rate': 4.6274096547669625e-06, 'epoch': 2.07} +2025-05-11 08:40:56 - ERROR - stderr - 69%|██████▉ | 2582/3741 [15:15:02<7:01:58, 21.84s/it] +2025-05-11 08:41:20 - ERROR - stderr - 69%|██████▉ | 2583/3741 [15:15:27<7:16:26, 22.61s/it] +2025-05-11 08:41:20 - ERROR - stderr - +2025-05-11 08:41:20 - ERROR - stderr - +2025-05-11 08:41:20 - INFO - stdout - {'loss': 0.5094, 'grad_norm': 0.8223109245300293, 'learning_rate': 4.62010826981465e-06, 'epoch': 2.07} +2025-05-11 08:41:20 - ERROR - stderr - 69%|██████▉ | 2583/3741 [15:15:27<7:16:26, 22.61s/it] +2025-05-11 08:41:40 - ERROR - stderr - 69%|██████▉ | 2584/3741 [15:15:47<7:00:03, 21.78s/it] +2025-05-11 08:41:40 - ERROR - stderr - +2025-05-11 08:41:40 - ERROR - stderr - +2025-05-11 08:41:40 - INFO - stdout - {'loss': 0.4881, 'grad_norm': 0.8160894513130188, 'learning_rate': 4.612810918888374e-06, 'epoch': 2.07} +2025-05-11 08:41:40 - ERROR - stderr - 69%|██████▉ | 2584/3741 [15:15:47<7:00:03, 21.78s/it] +2025-05-11 08:42:03 - ERROR - stderr - 69%|██████▉ | 2585/3741 [15:16:09<7:04:24, 22.03s/it] +2025-05-11 08:42:03 - ERROR - stderr - +2025-05-11 08:42:03 - ERROR - stderr - +2025-05-11 08:42:03 - INFO - stdout - {'loss': 0.488, 'grad_norm': 0.8002573847770691, 'learning_rate': 4.605517607459938e-06, 'epoch': 2.07} +2025-05-11 08:42:03 - ERROR - stderr - 69%|██████▉ | 2585/3741 [15:16:09<7:04:24, 22.03s/it] +2025-05-11 08:42:23 - ERROR - stderr - 69%|██████▉ | 2586/3741 [15:16:29<6:52:47, 21.44s/it] +2025-05-11 08:42:23 - ERROR - stderr - +2025-05-11 08:42:23 - ERROR - stderr - +2025-05-11 08:42:23 - INFO - stdout - {'loss': 0.4976, 'grad_norm': 0.7782284021377563, 'learning_rate': 4.598228340998118e-06, 'epoch': 2.07} +2025-05-11 08:42:23 - ERROR - stderr - 69%|██████▉ | 2586/3741 [15:16:29<6:52:47, 21.44s/it] +2025-05-11 08:42:44 - ERROR - stderr - 69%|██████▉ | 2587/3741 [15:16:50<6:47:46, 21.20s/it] +2025-05-11 08:42:44 - ERROR - stderr - +2025-05-11 08:42:44 - ERROR - stderr - +2025-05-11 08:42:44 - INFO - stdout - {'loss': 0.4935, 'grad_norm': 0.7785077691078186, 'learning_rate': 4.590943124968651e-06, 'epoch': 2.07} +2025-05-11 08:42:44 - ERROR - stderr - 69%|██████▉ | 2587/3741 [15:16:50<6:47:46, 21.20s/it] +2025-05-11 08:43:03 - ERROR - stderr - 69%|██████▉ | 2588/3741 [15:17:10<6:39:27, 20.79s/it] +2025-05-11 08:43:03 - ERROR - stderr - +2025-05-11 08:43:03 - ERROR - stderr - +2025-05-11 08:43:03 - INFO - stdout - {'loss': 0.4781, 'grad_norm': 0.8213626742362976, 'learning_rate': 4.583661964834238e-06, 'epoch': 2.08} +2025-05-11 08:43:03 - ERROR - stderr - 69%|██████▉ | 2588/3741 [15:17:10<6:39:27, 20.79s/it] +2025-05-11 08:43:23 - ERROR - stderr - 69%|██████▉ | 2589/3741 [15:17:29<6:32:46, 20.46s/it] +2025-05-11 08:43:23 - ERROR - stderr - +2025-05-11 08:43:23 - ERROR - stderr - +2025-05-11 08:43:23 - INFO - stdout - {'loss': 0.4833, 'grad_norm': 0.8277866244316101, 'learning_rate': 4.576384866054546e-06, 'epoch': 2.08} +2025-05-11 08:43:23 - ERROR - stderr - 69%|██████▉ | 2589/3741 [15:17:29<6:32:46, 20.46s/it] +2025-05-11 08:43:44 - ERROR - stderr - 69%|██████▉ | 2590/3741 [15:17:50<6:32:54, 20.48s/it] +2025-05-11 08:43:44 - ERROR - stderr - +2025-05-11 08:43:44 - ERROR - stderr - +2025-05-11 08:43:44 - INFO - stdout - {'loss': 0.4927, 'grad_norm': 0.8673251867294312, 'learning_rate': 4.5691118340861885e-06, 'epoch': 2.08} +2025-05-11 08:43:44 - ERROR - stderr - 69%|██████▉ | 2590/3741 [15:17:50<6:32:54, 20.48s/it] +2025-05-11 08:44:03 - ERROR - stderr - 69%|██████▉ | 2591/3741 [15:18:10<6:28:29, 20.27s/it] +2025-05-11 08:44:03 - ERROR - stderr - +2025-05-11 08:44:03 - ERROR - stderr - +2025-05-11 08:44:03 - INFO - stdout - {'loss': 0.504, 'grad_norm': 0.8178399205207825, 'learning_rate': 4.561842874382737e-06, 'epoch': 2.08} +2025-05-11 08:44:03 - ERROR - stderr - 69%|██████▉ | 2591/3741 [15:18:10<6:28:29, 20.27s/it] +2025-05-11 08:44:25 - ERROR - stderr - 69%|██████▉ | 2592/3741 [15:18:31<6:33:15, 20.54s/it] +2025-05-11 08:44:25 - ERROR - stderr - +2025-05-11 08:44:25 - ERROR - stderr - +2025-05-11 08:44:25 - INFO - stdout - {'loss': 0.4728, 'grad_norm': 0.8196151852607727, 'learning_rate': 4.554577992394697e-06, 'epoch': 2.08} +2025-05-11 08:44:25 - ERROR - stderr - 69%|██████▉ | 2592/3741 [15:18:31<6:33:15, 20.54s/it] +2025-05-11 08:44:45 - ERROR - stderr - 69%|██████▉ | 2593/3741 [15:18:51<6:31:04, 20.44s/it] +2025-05-11 08:44:45 - ERROR - stderr - +2025-05-11 08:44:45 - ERROR - stderr - +2025-05-11 08:44:45 - INFO - stdout - {'loss': 0.5019, 'grad_norm': 0.8163505792617798, 'learning_rate': 4.54731719356953e-06, 'epoch': 2.08} +2025-05-11 08:44:45 - ERROR - stderr - 69%|██████▉ | 2593/3741 [15:18:51<6:31:04, 20.44s/it] +2025-05-11 08:45:08 - ERROR - stderr - 69%|██████▉ | 2594/3741 [15:19:14<6:44:20, 21.15s/it] +2025-05-11 08:45:08 - ERROR - stderr - +2025-05-11 08:45:08 - ERROR - stderr - +2025-05-11 08:45:08 - INFO - stdout - {'loss': 0.4947, 'grad_norm': 0.8096843957901001, 'learning_rate': 4.540060483351628e-06, 'epoch': 2.08} +2025-05-11 08:45:08 - ERROR - stderr - 69%|██████▉ | 2594/3741 [15:19:14<6:44:20, 21.15s/it] +2025-05-11 08:45:28 - ERROR - stderr - 69%|██████▉ | 2595/3741 [15:19:34<6:39:50, 20.93s/it] +2025-05-11 08:45:28 - ERROR - stderr - +2025-05-11 08:45:28 - ERROR - stderr - +2025-05-11 08:45:28 - INFO - stdout - {'loss': 0.4999, 'grad_norm': 0.846545398235321, 'learning_rate': 4.532807867182322e-06, 'epoch': 2.08} +2025-05-11 08:45:28 - ERROR - stderr - 69%|██████▉ | 2595/3741 [15:19:34<6:39:50, 20.93s/it] +2025-05-11 08:45:52 - ERROR - stderr - 69%|██████▉ | 2596/3741 [15:19:58<6:56:32, 21.83s/it] +2025-05-11 08:45:52 - ERROR - stderr - +2025-05-11 08:45:52 - ERROR - stderr - +2025-05-11 08:45:52 - INFO - stdout - {'loss': 0.503, 'grad_norm': 0.8637265563011169, 'learning_rate': 4.525559350499872e-06, 'epoch': 2.08} +2025-05-11 08:45:52 - ERROR - stderr - 69%|██████▉ | 2596/3741 [15:19:58<6:56:32, 21.83s/it] +2025-05-11 08:46:12 - ERROR - stderr - 69%|██████▉ | 2597/3741 [15:20:18<6:45:49, 21.28s/it] +2025-05-11 08:46:12 - ERROR - stderr - +2025-05-11 08:46:12 - ERROR - stderr - +2025-05-11 08:46:12 - INFO - stdout - {'loss': 0.488, 'grad_norm': 0.795164942741394, 'learning_rate': 4.5183149387394566e-06, 'epoch': 2.08} +2025-05-11 08:46:12 - ERROR - stderr - 69%|██████▉ | 2597/3741 [15:20:18<6:45:49, 21.28s/it] +2025-05-11 08:46:12 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5986 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 08:46:12 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5986 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 08:46:32 - ERROR - stderr - 69%|██████▉ | 2598/3741 [15:20:38<6:37:28, 20.86s/it] +2025-05-11 08:46:32 - ERROR - stderr - +2025-05-11 08:46:32 - ERROR - stderr - +2025-05-11 08:46:32 - INFO - stdout - {'loss': 0.4855, 'grad_norm': 0.8308284282684326, 'learning_rate': 4.511074637333185e-06, 'epoch': 2.08} +2025-05-11 08:46:32 - ERROR - stderr - 69%|██████▉ | 2598/3741 [15:20:38<6:37:28, 20.86s/it] +2025-05-11 08:46:56 - ERROR - stderr - 69%|██████▉ | 2599/3741 [15:21:03<6:56:57, 21.91s/it] +2025-05-11 08:46:56 - ERROR - stderr - +2025-05-11 08:46:56 - ERROR - stderr - +2025-05-11 08:46:56 - INFO - stdout - {'loss': 0.483, 'grad_norm': 0.8101129531860352, 'learning_rate': 4.503838451710082e-06, 'epoch': 2.08} +2025-05-11 08:46:56 - ERROR - stderr - 69%|██████▉ | 2599/3741 [15:21:03<6:56:57, 21.91s/it] +2025-05-11 08:47:16 - ERROR - stderr - 70%|██████▉ | 2600/3741 [15:21:22<6:44:11, 21.25s/it] +2025-05-11 08:47:16 - ERROR - stderr - +2025-05-11 08:47:16 - ERROR - stderr - +2025-05-11 08:47:16 - INFO - stdout - {'loss': 0.4921, 'grad_norm': 0.8016064167022705, 'learning_rate': 4.49660638729609e-06, 'epoch': 2.09} +2025-05-11 08:47:16 - ERROR - stderr - 70%|██████▉ | 2600/3741 [15:21:22<6:44:11, 21.25s/it] +2025-05-11 08:47:37 - ERROR - stderr - 70%|██████▉ | 2601/3741 [15:21:44<6:44:01, 21.26s/it] +2025-05-11 08:47:37 - ERROR - stderr - +2025-05-11 08:47:37 - ERROR - stderr - +2025-05-11 08:47:37 - INFO - stdout - {'loss': 0.4983, 'grad_norm': 0.8189466595649719, 'learning_rate': 4.489378449514051e-06, 'epoch': 2.09} +2025-05-11 08:47:37 - ERROR - stderr - 70%|██████▉ | 2601/3741 [15:21:44<6:44:01, 21.26s/it] +2025-05-11 08:47:57 - ERROR - stderr - 70%|██████▉ | 2602/3741 [15:22:03<6:35:44, 20.85s/it] +2025-05-11 08:47:57 - ERROR - stderr - +2025-05-11 08:47:57 - ERROR - stderr - +2025-05-11 08:47:57 - INFO - stdout - {'loss': 0.506, 'grad_norm': 0.8600638508796692, 'learning_rate': 4.482154643783722e-06, 'epoch': 2.09} +2025-05-11 08:47:57 - ERROR - stderr - 70%|██████▉ | 2602/3741 [15:22:03<6:35:44, 20.85s/it] +2025-05-11 08:48:20 - ERROR - stderr - 70%|██████▉ | 2603/3741 [15:22:27<6:48:58, 21.56s/it] +2025-05-11 08:48:20 - ERROR - stderr - +2025-05-11 08:48:20 - ERROR - stderr - +2025-05-11 08:48:20 - INFO - stdout - {'loss': 0.4746, 'grad_norm': 0.7984684705734253, 'learning_rate': 4.4749349755217575e-06, 'epoch': 2.09} +2025-05-11 08:48:20 - ERROR - stderr - 70%|██████▉ | 2603/3741 [15:22:27<6:48:58, 21.56s/it] +2025-05-11 08:48:41 - ERROR - stderr - 70%|██████▉ | 2604/3741 [15:22:47<6:40:43, 21.15s/it] +2025-05-11 08:48:41 - ERROR - stderr - +2025-05-11 08:48:41 - ERROR - stderr - +2025-05-11 08:48:41 - INFO - stdout - {'loss': 0.4957, 'grad_norm': 0.8474909067153931, 'learning_rate': 4.467719450141711e-06, 'epoch': 2.09} +2025-05-11 08:48:41 - ERROR - stderr - 70%|██████▉ | 2604/3741 [15:22:47<6:40:43, 21.15s/it] +2025-05-11 08:49:04 - ERROR - stderr - 70%|██████▉ | 2605/3741 [15:23:10<6:51:03, 21.71s/it] +2025-05-11 08:49:04 - ERROR - stderr - +2025-05-11 08:49:04 - ERROR - stderr - +2025-05-11 08:49:04 - INFO - stdout - {'loss': 0.4828, 'grad_norm': 0.8484524488449097, 'learning_rate': 4.460508073054033e-06, 'epoch': 2.09} +2025-05-11 08:49:04 - ERROR - stderr - 70%|██████▉ | 2605/3741 [15:23:10<6:51:03, 21.71s/it] +2025-05-11 08:49:24 - ERROR - stderr - 70%|██████▉ | 2606/3741 [15:23:30<6:41:09, 21.21s/it] +2025-05-11 08:49:24 - ERROR - stderr - +2025-05-11 08:49:24 - ERROR - stderr - +2025-05-11 08:49:24 - INFO - stdout - {'loss': 0.4942, 'grad_norm': 0.8325912952423096, 'learning_rate': 4.453300849666053e-06, 'epoch': 2.09} +2025-05-11 08:49:24 - ERROR - stderr - 70%|██████▉ | 2606/3741 [15:23:30<6:41:09, 21.21s/it] +2025-05-11 08:49:47 - ERROR - stderr - 70%|██████▉ | 2607/3741 [15:23:53<6:53:42, 21.89s/it] +2025-05-11 08:49:47 - ERROR - stderr - +2025-05-11 08:49:47 - ERROR - stderr - +2025-05-11 08:49:47 - INFO - stdout - {'loss': 0.4944, 'grad_norm': 0.8752564787864685, 'learning_rate': 4.446097785381995e-06, 'epoch': 2.09} +2025-05-11 08:49:47 - ERROR - stderr - 70%|██████▉ | 2607/3741 [15:23:53<6:53:42, 21.89s/it] +2025-05-11 08:50:07 - ERROR - stderr - 70%|██████▉ | 2608/3741 [15:24:13<6:40:29, 21.21s/it] +2025-05-11 08:50:07 - ERROR - stderr - +2025-05-11 08:50:07 - ERROR - stderr - +2025-05-11 08:50:07 - INFO - stdout - {'loss': 0.5203, 'grad_norm': 0.8515805006027222, 'learning_rate': 4.438898885602962e-06, 'epoch': 2.09} +2025-05-11 08:50:07 - ERROR - stderr - 70%|██████▉ | 2608/3741 [15:24:13<6:40:29, 21.21s/it] +2025-05-11 08:50:29 - ERROR - stderr - 70%|██████▉ | 2609/3741 [15:24:35<6:43:46, 21.40s/it] +2025-05-11 08:50:29 - ERROR - stderr - +2025-05-11 08:50:29 - ERROR - stderr - +2025-05-11 08:50:29 - INFO - stdout - {'loss': 0.5042, 'grad_norm': 0.8291308283805847, 'learning_rate': 4.431704155726936e-06, 'epoch': 2.09} +2025-05-11 08:50:29 - ERROR - stderr - 70%|██████▉ | 2609/3741 [15:24:35<6:43:46, 21.40s/it] +2025-05-11 08:50:48 - ERROR - stderr - 70%|██████▉ | 2610/3741 [15:24:54<6:32:24, 20.82s/it] +2025-05-11 08:50:48 - ERROR - stderr - +2025-05-11 08:50:48 - ERROR - stderr - +2025-05-11 08:50:48 - INFO - stdout - {'loss': 0.4855, 'grad_norm': 0.7905226349830627, 'learning_rate': 4.424513601148772e-06, 'epoch': 2.09} +2025-05-11 08:50:48 - ERROR - stderr - 70%|██████▉ | 2610/3741 [15:24:54<6:32:24, 20.82s/it] +2025-05-11 08:51:08 - ERROR - stderr - 70%|██████▉ | 2611/3741 [15:25:14<6:25:49, 20.49s/it] +2025-05-11 08:51:08 - ERROR - stderr - +2025-05-11 08:51:08 - ERROR - stderr - +2025-05-11 08:51:08 - INFO - stdout - {'loss': 0.5087, 'grad_norm': 0.7867658734321594, 'learning_rate': 4.417327227260183e-06, 'epoch': 2.09} +2025-05-11 08:51:08 - ERROR - stderr - 70%|██████▉ | 2611/3741 [15:25:14<6:25:49, 20.49s/it] +2025-05-11 08:51:28 - ERROR - stderr - 70%|██████▉ | 2612/3741 [15:25:35<6:25:37, 20.49s/it] +2025-05-11 08:51:28 - ERROR - stderr - +2025-05-11 08:51:28 - ERROR - stderr - +2025-05-11 08:51:28 - INFO - stdout - {'loss': 0.493, 'grad_norm': 0.7623449563980103, 'learning_rate': 4.410145039449771e-06, 'epoch': 2.09} +2025-05-11 08:51:28 - ERROR - stderr - 70%|██████▉ | 2612/3741 [15:25:35<6:25:37, 20.49s/it] +2025-05-11 08:51:48 - ERROR - stderr - 70%|██████▉ | 2613/3741 [15:25:54<6:20:17, 20.23s/it] +2025-05-11 08:51:48 - ERROR - stderr - +2025-05-11 08:51:48 - ERROR - stderr - +2025-05-11 08:51:48 - INFO - stdout - {'loss': 0.4888, 'grad_norm': 0.816936194896698, 'learning_rate': 4.402967043102974e-06, 'epoch': 2.1} +2025-05-11 08:51:48 - ERROR - stderr - 70%|██████▉ | 2613/3741 [15:25:54<6:20:17, 20.23s/it] +2025-05-11 08:52:10 - ERROR - stderr - 70%|██████▉ | 2614/3741 [15:26:17<6:33:11, 20.93s/it] +2025-05-11 08:52:10 - ERROR - stderr - +2025-05-11 08:52:10 - ERROR - stderr - +2025-05-11 08:52:10 - INFO - stdout - {'loss': 0.4777, 'grad_norm': 0.7807714343070984, 'learning_rate': 4.395793243602102e-06, 'epoch': 2.1} +2025-05-11 08:52:10 - ERROR - stderr - 70%|██████▉ | 2614/3741 [15:26:17<6:33:11, 20.93s/it] +2025-05-11 08:52:30 - ERROR - stderr - 70%|██████▉ | 2615/3741 [15:26:36<6:25:48, 20.56s/it] +2025-05-11 08:52:30 - ERROR - stderr - +2025-05-11 08:52:30 - ERROR - stderr - +2025-05-11 08:52:30 - INFO - stdout - {'loss': 0.4997, 'grad_norm': 0.7865536212921143, 'learning_rate': 4.388623646326318e-06, 'epoch': 2.1} +2025-05-11 08:52:30 - ERROR - stderr - 70%|██████▉ | 2615/3741 [15:26:36<6:25:48, 20.56s/it] +2025-05-11 08:52:54 - ERROR - stderr - 70%|██████▉ | 2616/3741 [15:27:00<6:44:25, 21.57s/it] +2025-05-11 08:52:54 - ERROR - stderr - +2025-05-11 08:52:54 - ERROR - stderr - +2025-05-11 08:52:54 - INFO - stdout - {'loss': 0.504, 'grad_norm': 0.8231841325759888, 'learning_rate': 4.381458256651622e-06, 'epoch': 2.1} +2025-05-11 08:52:54 - ERROR - stderr - 70%|██████▉ | 2616/3741 [15:27:00<6:44:25, 21.57s/it] +2025-05-11 08:53:14 - ERROR - stderr - 70%|██████▉ | 2617/3741 [15:27:20<6:35:10, 21.10s/it] +2025-05-11 08:53:14 - ERROR - stderr - +2025-05-11 08:53:14 - ERROR - stderr - +2025-05-11 08:53:14 - INFO - stdout - {'loss': 0.4859, 'grad_norm': 0.8537681698799133, 'learning_rate': 4.374297079950872e-06, 'epoch': 2.1} +2025-05-11 08:53:14 - ERROR - stderr - 70%|██████▉ | 2617/3741 [15:27:20<6:35:10, 21.10s/it] +2025-05-11 08:53:39 - ERROR - stderr - 70%|██████▉ | 2618/3741 [15:27:45<6:55:33, 22.20s/it] +2025-05-11 08:53:39 - ERROR - stderr - +2025-05-11 08:53:39 - ERROR - stderr - +2025-05-11 08:53:39 - INFO - stdout - {'loss': 0.5137, 'grad_norm': 0.89604651927948, 'learning_rate': 4.367140121593764e-06, 'epoch': 2.1} +2025-05-11 08:53:39 - ERROR - stderr - 70%|██████▉ | 2618/3741 [15:27:45<6:55:33, 22.20s/it] +2025-05-11 08:53:59 - ERROR - stderr - 70%|███████ | 2619/3741 [15:28:05<6:41:59, 21.50s/it] +2025-05-11 08:53:59 - ERROR - stderr - +2025-05-11 08:53:59 - ERROR - stderr - +2025-05-11 08:53:59 - INFO - stdout - {'loss': 0.4979, 'grad_norm': 0.838858962059021, 'learning_rate': 4.359987386946822e-06, 'epoch': 2.1} +2025-05-11 08:53:59 - ERROR - stderr - 70%|███████ | 2619/3741 [15:28:05<6:41:59, 21.50s/it] +2025-05-11 08:54:22 - ERROR - stderr - 70%|███████ | 2620/3741 [15:28:28<6:49:34, 21.92s/it] +2025-05-11 08:54:22 - ERROR - stderr - +2025-05-11 08:54:22 - ERROR - stderr - +2025-05-11 08:54:22 - INFO - stdout - {'loss': 0.4767, 'grad_norm': 0.8409374952316284, 'learning_rate': 4.352838881373421e-06, 'epoch': 2.1} +2025-05-11 08:54:22 - ERROR - stderr - 70%|███████ | 2620/3741 [15:28:28<6:49:34, 21.92s/it] +2025-05-11 08:54:42 - ERROR - stderr - 70%|███████ | 2621/3741 [15:28:48<6:38:22, 21.34s/it] +2025-05-11 08:54:42 - ERROR - stderr - +2025-05-11 08:54:42 - ERROR - stderr - +2025-05-11 08:54:42 - INFO - stdout - {'loss': 0.4772, 'grad_norm': 0.7959094643592834, 'learning_rate': 4.345694610233744e-06, 'epoch': 2.1} +2025-05-11 08:54:42 - ERROR - stderr - 70%|███████ | 2621/3741 [15:28:48<6:38:22, 21.34s/it] +2025-05-11 08:55:04 - ERROR - stderr - 70%|███████ | 2622/3741 [15:29:10<6:43:18, 21.63s/it] +2025-05-11 08:55:04 - ERROR - stderr - +2025-05-11 08:55:04 - ERROR - stderr - +2025-05-11 08:55:04 - INFO - stdout - {'loss': 0.4998, 'grad_norm': 0.8662393689155579, 'learning_rate': 4.338554578884813e-06, 'epoch': 2.1} +2025-05-11 08:55:04 - ERROR - stderr - 70%|███████ | 2622/3741 [15:29:10<6:43:18, 21.63s/it] +2025-05-11 08:55:23 - ERROR - stderr - 70%|███████ | 2623/3741 [15:29:30<6:31:47, 21.03s/it] +2025-05-11 08:55:23 - ERROR - stderr - +2025-05-11 08:55:23 - ERROR - stderr - +2025-05-11 08:55:23 - INFO - stdout - {'loss': 0.4968, 'grad_norm': 0.8256474733352661, 'learning_rate': 4.331418792680468e-06, 'epoch': 2.1} +2025-05-11 08:55:23 - ERROR - stderr - 70%|███████ | 2623/3741 [15:29:30<6:31:47, 21.03s/it] +2025-05-11 08:55:43 - ERROR - stderr - 70%|███████ | 2624/3741 [15:29:50<6:24:17, 20.64s/it] +2025-05-11 08:55:43 - ERROR - stderr - +2025-05-11 08:55:43 - ERROR - stderr - +2025-05-11 08:55:43 - INFO - stdout - {'loss': 0.5243, 'grad_norm': 0.8457236289978027, 'learning_rate': 4.324287256971358e-06, 'epoch': 2.1} +2025-05-11 08:55:43 - ERROR - stderr - 70%|███████ | 2624/3741 [15:29:50<6:24:17, 20.64s/it] +2025-05-11 08:56:03 - ERROR - stderr - 70%|███████ | 2625/3741 [15:30:09<6:18:55, 20.37s/it] +2025-05-11 08:56:03 - ERROR - stderr - +2025-05-11 08:56:03 - ERROR - stderr - +2025-05-11 08:56:03 - INFO - stdout - {'loss': 0.4426, 'grad_norm': 0.7527933716773987, 'learning_rate': 4.3171599771049625e-06, 'epoch': 2.11} +2025-05-11 08:56:03 - ERROR - stderr - 70%|███████ | 2625/3741 [15:30:09<6:18:55, 20.37s/it] +2025-05-11 08:56:23 - ERROR - stderr - 70%|███████ | 2626/3741 [15:30:29<6:15:11, 20.19s/it] +2025-05-11 08:56:23 - ERROR - stderr - +2025-05-11 08:56:23 - ERROR - stderr - +2025-05-11 08:56:23 - INFO - stdout - {'loss': 0.5017, 'grad_norm': 0.8174936175346375, 'learning_rate': 4.3100369584255475e-06, 'epoch': 2.11} +2025-05-11 08:56:23 - ERROR - stderr - 70%|███████ | 2626/3741 [15:30:29<6:15:11, 20.19s/it] +2025-05-11 08:56:44 - ERROR - stderr - 70%|███████ | 2627/3741 [15:30:50<6:18:54, 20.41s/it] +2025-05-11 08:56:44 - ERROR - stderr - +2025-05-11 08:56:44 - ERROR - stderr - +2025-05-11 08:56:44 - INFO - stdout - {'loss': 0.4952, 'grad_norm': 0.8383927941322327, 'learning_rate': 4.302918206274202e-06, 'epoch': 2.11} +2025-05-11 08:56:44 - ERROR - stderr - 70%|███████ | 2627/3741 [15:30:50<6:18:54, 20.41s/it] +2025-05-11 08:57:04 - ERROR - stderr - 70%|███████ | 2628/3741 [15:31:10<6:16:08, 20.28s/it] +2025-05-11 08:57:04 - ERROR - stderr - +2025-05-11 08:57:04 - ERROR - stderr - +2025-05-11 08:57:04 - INFO - stdout - {'loss': 0.4923, 'grad_norm': 0.8061890602111816, 'learning_rate': 4.295803725988807e-06, 'epoch': 2.11} +2025-05-11 08:57:04 - ERROR - stderr - 70%|███████ | 2628/3741 [15:31:10<6:16:08, 20.28s/it] +2025-05-11 08:57:26 - ERROR - stderr - 70%|███████ | 2629/3741 [15:31:33<6:28:51, 20.98s/it] +2025-05-11 08:57:26 - ERROR - stderr - +2025-05-11 08:57:26 - ERROR - stderr - +2025-05-11 08:57:26 - INFO - stdout - {'loss': 0.5019, 'grad_norm': 0.8143693804740906, 'learning_rate': 4.2886935229040375e-06, 'epoch': 2.11} +2025-05-11 08:57:26 - ERROR - stderr - 70%|███████ | 2629/3741 [15:31:33<6:28:51, 20.98s/it] +2025-05-11 08:57:27 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5949 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 08:57:27 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5949 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 08:57:46 - ERROR - stderr - 70%|███████ | 2630/3741 [15:31:52<6:20:35, 20.55s/it] +2025-05-11 08:57:46 - ERROR - stderr - +2025-05-11 08:57:46 - ERROR - stderr - +2025-05-11 08:57:46 - INFO - stdout - {'loss': 0.4858, 'grad_norm': 0.8190400004386902, 'learning_rate': 4.281587602351376e-06, 'epoch': 2.11} +2025-05-11 08:57:46 - ERROR - stderr - 70%|███████ | 2630/3741 [15:31:52<6:20:35, 20.55s/it] +2025-05-11 08:58:13 - ERROR - stderr - 70%|███████ | 2631/3741 [15:32:19<6:54:39, 22.41s/it] +2025-05-11 08:58:13 - ERROR - stderr - +2025-05-11 08:58:13 - ERROR - stderr - +2025-05-11 08:58:13 - INFO - stdout - {'loss': 0.4635, 'grad_norm': 0.7694993615150452, 'learning_rate': 4.274485969659074e-06, 'epoch': 2.11} +2025-05-11 08:58:13 - ERROR - stderr - 70%|███████ | 2631/3741 [15:32:19<6:54:39, 22.41s/it] +2025-05-11 08:58:32 - ERROR - stderr - 70%|███████ | 2632/3741 [15:32:39<6:39:36, 21.62s/it] +2025-05-11 08:58:32 - ERROR - stderr - +2025-05-11 08:58:32 - ERROR - stderr - +2025-05-11 08:58:32 - INFO - stdout - {'loss': 0.506, 'grad_norm': 0.840126633644104, 'learning_rate': 4.267388630152182e-06, 'epoch': 2.11} +2025-05-11 08:58:32 - ERROR - stderr - 70%|███████ | 2632/3741 [15:32:39<6:39:36, 21.62s/it] +2025-05-11 08:58:55 - ERROR - stderr - 70%|███████ | 2633/3741 [15:33:02<6:47:04, 22.04s/it] +2025-05-11 08:58:55 - ERROR - stderr - +2025-05-11 08:58:55 - ERROR - stderr - +2025-05-11 08:58:55 - INFO - stdout - {'loss': 0.4815, 'grad_norm': 0.8422167301177979, 'learning_rate': 4.26029558915253e-06, 'epoch': 2.11} +2025-05-11 08:58:55 - ERROR - stderr - 70%|███████ | 2633/3741 [15:33:02<6:47:04, 22.04s/it] +2025-05-11 08:59:15 - ERROR - stderr - 70%|███████ | 2634/3741 [15:33:22<6:36:05, 21.47s/it] +2025-05-11 08:59:15 - ERROR - stderr - +2025-05-11 08:59:15 - ERROR - stderr - +2025-05-11 08:59:15 - INFO - stdout - {'loss': 0.4826, 'grad_norm': 0.83141028881073, 'learning_rate': 4.2532068519787124e-06, 'epoch': 2.11} +2025-05-11 08:59:15 - ERROR - stderr - 70%|███████ | 2634/3741 [15:33:22<6:36:05, 21.47s/it] +2025-05-11 08:59:39 - ERROR - stderr - 70%|███████ | 2635/3741 [15:33:45<6:46:13, 22.04s/it] +2025-05-11 08:59:39 - ERROR - stderr - +2025-05-11 08:59:39 - ERROR - stderr - +2025-05-11 08:59:39 - INFO - stdout - {'loss': 0.4945, 'grad_norm': 0.8261462450027466, 'learning_rate': 4.246122423946114e-06, 'epoch': 2.11} +2025-05-11 08:59:39 - ERROR - stderr - 70%|███████ | 2635/3741 [15:33:45<6:46:13, 22.04s/it] +2025-05-11 08:59:58 - ERROR - stderr - 70%|███████ | 2636/3741 [15:34:05<6:32:03, 21.29s/it] +2025-05-11 08:59:58 - ERROR - stderr - +2025-05-11 08:59:58 - ERROR - stderr - +2025-05-11 08:59:58 - INFO - stdout - {'loss': 0.4753, 'grad_norm': 0.8225822448730469, 'learning_rate': 4.239042310366875e-06, 'epoch': 2.11} +2025-05-11 08:59:58 - ERROR - stderr - 70%|███████ | 2636/3741 [15:34:05<6:32:03, 21.29s/it] +2025-05-11 09:00:22 - ERROR - stderr - 70%|███████ | 2637/3741 [15:34:28<6:42:04, 21.85s/it] +2025-05-11 09:00:22 - ERROR - stderr - +2025-05-11 09:00:22 - ERROR - stderr - +2025-05-11 09:00:22 - INFO - stdout - {'loss': 0.4763, 'grad_norm': 0.8107204437255859, 'learning_rate': 4.23196651654991e-06, 'epoch': 2.11} +2025-05-11 09:00:22 - ERROR - stderr - 70%|███████ | 2637/3741 [15:34:28<6:42:04, 21.85s/it] +2025-05-11 09:00:42 - ERROR - stderr - 71%|███████ | 2638/3741 [15:34:48<6:31:36, 21.30s/it] +2025-05-11 09:00:42 - ERROR - stderr - +2025-05-11 09:00:42 - ERROR - stderr - +2025-05-11 09:00:42 - INFO - stdout - {'loss': 0.4931, 'grad_norm': 0.8356348276138306, 'learning_rate': 4.224895047800892e-06, 'epoch': 2.12} +2025-05-11 09:00:42 - ERROR - stderr - 71%|███████ | 2638/3741 [15:34:48<6:31:36, 21.30s/it] +2025-05-11 09:01:02 - ERROR - stderr - 71%|███████ | 2639/3741 [15:35:08<6:26:21, 21.04s/it] +2025-05-11 09:01:02 - ERROR - stderr - +2025-05-11 09:01:02 - ERROR - stderr - +2025-05-11 09:01:02 - INFO - stdout - {'loss': 0.4685, 'grad_norm': 0.803632915019989, 'learning_rate': 4.217827909422241e-06, 'epoch': 2.12} +2025-05-11 09:01:02 - ERROR - stderr - 71%|███████ | 2639/3741 [15:35:08<6:26:21, 21.04s/it] +2025-05-11 09:01:22 - ERROR - stderr - 71%|███████ | 2640/3741 [15:35:28<6:19:02, 20.66s/it] +2025-05-11 09:01:22 - ERROR - stderr - +2025-05-11 09:01:22 - ERROR - stderr - +2025-05-11 09:01:22 - INFO - stdout - {'loss': 0.4863, 'grad_norm': 0.8820094466209412, 'learning_rate': 4.210765106713143e-06, 'epoch': 2.12} +2025-05-11 09:01:22 - ERROR - stderr - 71%|███████ | 2640/3741 [15:35:28<6:19:02, 20.66s/it] +2025-05-11 09:01:43 - ERROR - stderr - 71%|███████ | 2641/3741 [15:35:49<6:19:35, 20.70s/it] +2025-05-11 09:01:43 - ERROR - stderr - +2025-05-11 09:01:43 - ERROR - stderr - +2025-05-11 09:01:43 - INFO - stdout - {'loss': 0.496, 'grad_norm': 0.8546995520591736, 'learning_rate': 4.2037066449695275e-06, 'epoch': 2.12} +2025-05-11 09:01:43 - ERROR - stderr - 71%|███████ | 2641/3741 [15:35:49<6:19:35, 20.70s/it] +2025-05-11 09:02:02 - ERROR - stderr - 71%|███████ | 2642/3741 [15:36:09<6:14:44, 20.46s/it] +2025-05-11 09:02:02 - ERROR - stderr - +2025-05-11 09:02:02 - ERROR - stderr - +2025-05-11 09:02:02 - INFO - stdout - {'loss': 0.5053, 'grad_norm': 0.8304917216300964, 'learning_rate': 4.196652529484068e-06, 'epoch': 2.12} +2025-05-11 09:02:02 - ERROR - stderr - 71%|███████ | 2642/3741 [15:36:09<6:14:44, 20.46s/it] +2025-05-11 09:02:22 - ERROR - stderr - 71%|███████ | 2643/3741 [15:36:28<6:10:08, 20.23s/it] +2025-05-11 09:02:22 - ERROR - stderr - +2025-05-11 09:02:22 - ERROR - stderr - +2025-05-11 09:02:22 - INFO - stdout - {'loss': 0.5033, 'grad_norm': 0.8051602244377136, 'learning_rate': 4.189602765546188e-06, 'epoch': 2.12} +2025-05-11 09:02:22 - ERROR - stderr - 71%|███████ | 2643/3741 [15:36:28<6:10:08, 20.23s/it] +2025-05-11 09:02:43 - ERROR - stderr - 71%|███████ | 2644/3741 [15:36:49<6:14:05, 20.46s/it] +2025-05-11 09:02:43 - ERROR - stderr - +2025-05-11 09:02:43 - ERROR - stderr - +2025-05-11 09:02:43 - INFO - stdout - {'loss': 0.5049, 'grad_norm': 0.8486653566360474, 'learning_rate': 4.18255735844203e-06, 'epoch': 2.12} +2025-05-11 09:02:43 - ERROR - stderr - 71%|███████ | 2644/3741 [15:36:49<6:14:05, 20.46s/it] +2025-05-11 09:03:03 - ERROR - stderr - 71%|███████ | 2645/3741 [15:37:09<6:09:04, 20.20s/it] +2025-05-11 09:03:03 - ERROR - stderr - +2025-05-11 09:03:03 - ERROR - stderr - +2025-05-11 09:03:03 - INFO - stdout - {'loss': 0.5047, 'grad_norm': 0.8334927558898926, 'learning_rate': 4.175516313454485e-06, 'epoch': 2.12} +2025-05-11 09:03:03 - ERROR - stderr - 71%|███████ | 2645/3741 [15:37:09<6:09:04, 20.20s/it] +2025-05-11 09:03:25 - ERROR - stderr - 71%|███████ | 2646/3741 [15:37:32<6:21:03, 20.88s/it] +2025-05-11 09:03:25 - ERROR - stderr - +2025-05-11 09:03:25 - ERROR - stderr - +2025-05-11 09:03:25 - INFO - stdout - {'loss': 0.5031, 'grad_norm': 0.8217371106147766, 'learning_rate': 4.168479635863167e-06, 'epoch': 2.12} +2025-05-11 09:03:25 - ERROR - stderr - 71%|███████ | 2646/3741 [15:37:32<6:21:03, 20.88s/it] +2025-05-11 09:03:46 - ERROR - stderr - 71%|███████ | 2647/3741 [15:37:52<6:18:19, 20.75s/it] +2025-05-11 09:03:46 - ERROR - stderr - +2025-05-11 09:03:46 - ERROR - stderr - +2025-05-11 09:03:46 - INFO - stdout - {'loss': 0.5255, 'grad_norm': 0.8687995076179504, 'learning_rate': 4.161447330944422e-06, 'epoch': 2.12} +2025-05-11 09:03:46 - ERROR - stderr - 71%|███████ | 2647/3741 [15:37:52<6:18:19, 20.75s/it] +2025-05-11 09:04:09 - ERROR - stderr - 71%|███████ | 2648/3741 [15:38:15<6:30:50, 21.45s/it] +2025-05-11 09:04:09 - ERROR - stderr - +2025-05-11 09:04:09 - ERROR - stderr - +2025-05-11 09:04:09 - INFO - stdout - {'loss': 0.4944, 'grad_norm': 0.838424026966095, 'learning_rate': 4.154419403971305e-06, 'epoch': 2.12} +2025-05-11 09:04:09 - ERROR - stderr - 71%|███████ | 2648/3741 [15:38:15<6:30:50, 21.45s/it] +2025-05-11 09:04:28 - ERROR - stderr - 71%|███████ | 2649/3741 [15:38:35<6:20:31, 20.91s/it] +2025-05-11 09:04:28 - ERROR - stderr - +2025-05-11 09:04:28 - ERROR - stderr - +2025-05-11 09:04:28 - INFO - stdout - {'loss': 0.4791, 'grad_norm': 0.7598464488983154, 'learning_rate': 4.1473958602135956e-06, 'epoch': 2.12} +2025-05-11 09:04:28 - ERROR - stderr - 71%|███████ | 2649/3741 [15:38:35<6:20:31, 20.91s/it] +2025-05-11 09:04:52 - ERROR - stderr - 71%|███████ | 2650/3741 [15:38:58<6:34:04, 21.67s/it] +2025-05-11 09:04:52 - ERROR - stderr - +2025-05-11 09:04:52 - ERROR - stderr - +2025-05-11 09:04:52 - INFO - stdout - {'loss': 0.5074, 'grad_norm': 0.9013649821281433, 'learning_rate': 4.140376704937789e-06, 'epoch': 2.13} +2025-05-11 09:04:52 - ERROR - stderr - 71%|███████ | 2650/3741 [15:38:58<6:34:04, 21.67s/it] +2025-05-11 09:05:12 - ERROR - stderr - 71%|███████ | 2651/3741 [15:39:18<6:24:46, 21.18s/it] +2025-05-11 09:05:12 - ERROR - stderr - +2025-05-11 09:05:12 - ERROR - stderr - +2025-05-11 09:05:12 - INFO - stdout - {'loss': 0.4883, 'grad_norm': 0.8267843723297119, 'learning_rate': 4.133361943407085e-06, 'epoch': 2.13} +2025-05-11 09:05:12 - ERROR - stderr - 71%|███████ | 2651/3741 [15:39:18<6:24:46, 21.18s/it] +2025-05-11 09:05:36 - ERROR - stderr - 71%|███████ | 2652/3741 [15:39:42<6:39:45, 22.03s/it] +2025-05-11 09:05:36 - ERROR - stderr - +2025-05-11 09:05:36 - ERROR - stderr - +2025-05-11 09:05:36 - INFO - stdout - {'loss': 0.4759, 'grad_norm': 0.8689286112785339, 'learning_rate': 4.126351580881395e-06, 'epoch': 2.13} +2025-05-11 09:05:36 - ERROR - stderr - 71%|███████ | 2652/3741 [15:39:42<6:39:45, 22.03s/it] +2025-05-11 09:05:56 - ERROR - stderr - 71%|███████ | 2653/3741 [15:40:02<6:28:35, 21.43s/it] +2025-05-11 09:05:56 - ERROR - stderr - +2025-05-11 09:05:56 - ERROR - stderr - +2025-05-11 09:05:56 - INFO - stdout - {'loss': 0.5206, 'grad_norm': 0.912152886390686, 'learning_rate': 4.11934562261732e-06, 'epoch': 2.13} +2025-05-11 09:05:56 - ERROR - stderr - 71%|███████ | 2653/3741 [15:40:02<6:28:35, 21.43s/it] +2025-05-11 09:06:16 - ERROR - stderr - 71%|███████ | 2654/3741 [15:40:23<6:22:02, 21.09s/it] +2025-05-11 09:06:16 - ERROR - stderr - +2025-05-11 09:06:16 - ERROR - stderr - +2025-05-11 09:06:16 - INFO - stdout - {'loss': 0.4897, 'grad_norm': 0.832675576210022, 'learning_rate': 4.112344073868171e-06, 'epoch': 2.13} +2025-05-11 09:06:16 - ERROR - stderr - 71%|███████ | 2654/3741 [15:40:23<6:22:02, 21.09s/it] +2025-05-11 09:06:36 - ERROR - stderr - 71%|███████ | 2655/3741 [15:40:43<6:16:53, 20.82s/it] +2025-05-11 09:06:36 - ERROR - stderr - +2025-05-11 09:06:36 - ERROR - stderr - +2025-05-11 09:06:36 - INFO - stdout - {'loss': 0.5046, 'grad_norm': 0.8386573791503906, 'learning_rate': 4.105346939883946e-06, 'epoch': 2.13} +2025-05-11 09:06:36 - ERROR - stderr - 71%|███████ | 2655/3741 [15:40:43<6:16:53, 20.82s/it] +2025-05-11 09:06:56 - ERROR - stderr - 71%|███████ | 2656/3741 [15:41:03<6:12:29, 20.60s/it] +2025-05-11 09:06:56 - ERROR - stderr - +2025-05-11 09:06:56 - ERROR - stderr - +2025-05-11 09:06:56 - INFO - stdout - {'loss': 0.5104, 'grad_norm': 0.8459692597389221, 'learning_rate': 4.098354225911336e-06, 'epoch': 2.13} +2025-05-11 09:06:56 - ERROR - stderr - 71%|███████ | 2656/3741 [15:41:03<6:12:29, 20.60s/it] +2025-05-11 09:07:16 - ERROR - stderr - 71%|███████ | 2657/3741 [15:41:23<6:08:31, 20.40s/it] +2025-05-11 09:07:16 - ERROR - stderr - +2025-05-11 09:07:16 - ERROR - stderr - +2025-05-11 09:07:16 - INFO - stdout - {'loss': 0.4764, 'grad_norm': 0.8297333121299744, 'learning_rate': 4.091365937193719e-06, 'epoch': 2.13} +2025-05-11 09:07:16 - ERROR - stderr - 71%|███████ | 2657/3741 [15:41:23<6:08:31, 20.40s/it] +2025-05-11 09:07:36 - ERROR - stderr - 71%|███████ | 2658/3741 [15:41:43<6:06:02, 20.28s/it] +2025-05-11 09:07:36 - ERROR - stderr - +2025-05-11 09:07:36 - ERROR - stderr - +2025-05-11 09:07:36 - INFO - stdout - {'loss': 0.4874, 'grad_norm': 0.8380979895591736, 'learning_rate': 4.084382078971143e-06, 'epoch': 2.13} +2025-05-11 09:07:36 - ERROR - stderr - 71%|███████ | 2658/3741 [15:41:43<6:06:02, 20.28s/it] +2025-05-11 09:07:58 - ERROR - stderr - 71%|███████ | 2659/3741 [15:42:04<6:12:55, 20.68s/it] +2025-05-11 09:07:58 - ERROR - stderr - +2025-05-11 09:07:58 - ERROR - stderr - +2025-05-11 09:07:58 - INFO - stdout - {'loss': 0.5001, 'grad_norm': 0.8121061325073242, 'learning_rate': 4.0774026564803494e-06, 'epoch': 2.13} +2025-05-11 09:07:58 - ERROR - stderr - 71%|███████ | 2659/3741 [15:42:04<6:12:55, 20.68s/it] +2025-05-11 09:08:18 - ERROR - stderr - 71%|███████ | 2660/3741 [15:42:24<6:08:43, 20.47s/it] +2025-05-11 09:08:18 - ERROR - stderr - +2025-05-11 09:08:18 - ERROR - stderr - +2025-05-11 09:08:18 - INFO - stdout - {'loss': 0.4694, 'grad_norm': 0.8189265727996826, 'learning_rate': 4.070427674954748e-06, 'epoch': 2.13} +2025-05-11 09:08:18 - ERROR - stderr - 71%|���██████ | 2660/3741 [15:42:24<6:08:43, 20.47s/it] +2025-05-11 09:08:41 - ERROR - stderr - 71%|███████ | 2661/3741 [15:42:47<6:20:47, 21.16s/it] +2025-05-11 09:08:41 - ERROR - stderr - +2025-05-11 09:08:41 - ERROR - stderr - +2025-05-11 09:08:41 - INFO - stdout - {'loss': 0.4957, 'grad_norm': 0.8043553829193115, 'learning_rate': 4.063457139624407e-06, 'epoch': 2.13} +2025-05-11 09:08:41 - ERROR - stderr - 71%|███████ | 2661/3741 [15:42:47<6:20:47, 21.16s/it] +2025-05-11 09:09:02 - ERROR - stderr - 71%|███████ | 2662/3741 [15:43:08<6:18:44, 21.06s/it] +2025-05-11 09:09:02 - ERROR - stderr - +2025-05-11 09:09:02 - ERROR - stderr - +2025-05-11 09:09:02 - INFO - stdout - {'loss': 0.4764, 'grad_norm': 0.7954015731811523, 'learning_rate': 4.056491055716088e-06, 'epoch': 2.13} +2025-05-11 09:09:02 - ERROR - stderr - 71%|███████ | 2662/3741 [15:43:08<6:18:44, 21.06s/it] +2025-05-11 09:09:26 - ERROR - stderr - 71%|███████ | 2663/3741 [15:43:32<6:34:23, 21.95s/it] +2025-05-11 09:09:26 - ERROR - stderr - +2025-05-11 09:09:26 - ERROR - stderr - +2025-05-11 09:09:26 - INFO - stdout - {'loss': 0.4811, 'grad_norm': 0.7770733833312988, 'learning_rate': 4.049529428453184e-06, 'epoch': 2.14} +2025-05-11 09:09:26 - ERROR - stderr - 71%|███████ | 2663/3741 [15:43:32<6:34:23, 21.95s/it] +2025-05-11 09:09:46 - ERROR - stderr - 71%|███████ | 2664/3741 [15:43:52<6:25:25, 21.47s/it] +2025-05-11 09:09:46 - ERROR - stderr - +2025-05-11 09:09:46 - ERROR - stderr - +2025-05-11 09:09:46 - INFO - stdout - {'loss': 0.4998, 'grad_norm': 0.8199390172958374, 'learning_rate': 4.042572263055765e-06, 'epoch': 2.14} +2025-05-11 09:09:46 - ERROR - stderr - 71%|███████ | 2664/3741 [15:43:52<6:25:25, 21.47s/it] +2025-05-11 09:10:10 - ERROR - stderr - 71%|███████ | 2665/3741 [15:44:16<6:38:53, 22.24s/it] +2025-05-11 09:10:10 - ERROR - stderr - +2025-05-11 09:10:10 - ERROR - stderr - +2025-05-11 09:10:10 - INFO - stdout - {'loss': 0.4694, 'grad_norm': 0.8421967029571533, 'learning_rate': 4.035619564740555e-06, 'epoch': 2.14} +2025-05-11 09:10:10 - ERROR - stderr - 71%|███████ | 2665/3741 [15:44:16<6:38:53, 22.24s/it] +2025-05-11 09:10:30 - ERROR - stderr - 71%|███████▏ | 2666/3741 [15:44:37<6:28:30, 21.68s/it] +2025-05-11 09:10:30 - ERROR - stderr - +2025-05-11 09:10:30 - ERROR - stderr - +2025-05-11 09:10:30 - INFO - stdout - {'loss': 0.4854, 'grad_norm': 0.8318009376525879, 'learning_rate': 4.028671338720912e-06, 'epoch': 2.14} +2025-05-11 09:10:30 - ERROR - stderr - 71%|███████▏ | 2666/3741 [15:44:37<6:28:30, 21.68s/it] +2025-05-11 09:10:54 - ERROR - stderr - 71%|███████▏ | 2667/3741 [15:45:00<6:37:43, 22.22s/it] +2025-05-11 09:10:54 - ERROR - stderr - +2025-05-11 09:10:54 - ERROR - stderr - +2025-05-11 09:10:54 - INFO - stdout - {'loss': 0.4877, 'grad_norm': 0.8843598365783691, 'learning_rate': 4.021727590206868e-06, 'epoch': 2.14} +2025-05-11 09:10:54 - ERROR - stderr - 71%|███████▏ | 2667/3741 [15:45:00<6:37:43, 22.22s/it] +2025-05-11 09:11:15 - ERROR - stderr - 71%|███████▏ | 2668/3741 [15:45:21<6:29:14, 21.77s/it] +2025-05-11 09:11:15 - ERROR - stderr - +2025-05-11 09:11:15 - ERROR - stderr - +2025-05-11 09:11:15 - INFO - stdout - {'loss': 0.5328, 'grad_norm': 0.8518946170806885, 'learning_rate': 4.01478832440507e-06, 'epoch': 2.14} +2025-05-11 09:11:15 - ERROR - stderr - 71%|███████▏ | 2668/3741 [15:45:21<6:29:14, 21.77s/it] +2025-05-11 09:11:38 - ERROR - stderr - 71%|███████▏ | 2669/3741 [15:45:44<6:37:20, 22.24s/it] +2025-05-11 09:11:38 - ERROR - stderr - +2025-05-11 09:11:38 - ERROR - stderr - +2025-05-11 09:11:38 - INFO - stdout - {'loss': 0.5143, 'grad_norm': 0.8255460858345032, 'learning_rate': 4.00785354651882e-06, 'epoch': 2.14} +2025-05-11 09:11:38 - ERROR - stderr - 71%|███████▏ | 2669/3741 [15:45:44<6:37:20, 22.24s/it] +2025-05-11 09:11:58 - ERROR - stderr - 71%|███████▏ | 2670/3741 [15:46:04<6:26:13, 21.64s/it] +2025-05-11 09:11:58 - ERROR - stderr - +2025-05-11 09:11:58 - ERROR - stderr - +2025-05-11 09:11:58 - INFO - stdout - {'loss': 0.5018, 'grad_norm': 0.8585113286972046, 'learning_rate': 4.000923261748055e-06, 'epoch': 2.14} +2025-05-11 09:11:58 - ERROR - stderr - 71%|███████▏ | 2670/3741 [15:46:04<6:26:13, 21.64s/it] +2025-05-11 09:12:18 - ERROR - stderr - 71%|███████▏ | 2671/3741 [15:46:25<6:18:45, 21.24s/it] +2025-05-11 09:12:18 - ERROR - stderr - +2025-05-11 09:12:18 - ERROR - stderr - +2025-05-11 09:12:18 - INFO - stdout - {'loss': 0.4755, 'grad_norm': 0.8432846665382385, 'learning_rate': 3.9939974752893275e-06, 'epoch': 2.14} +2025-05-11 09:12:18 - ERROR - stderr - 71%|███████▏ | 2671/3741 [15:46:25<6:18:45, 21.24s/it] +2025-05-11 09:12:39 - ERROR - stderr - 71%|███████▏ | 2672/3741 [15:46:45<6:14:34, 21.02s/it] +2025-05-11 09:12:39 - ERROR - stderr - +2025-05-11 09:12:39 - ERROR - stderr - +2025-05-11 09:12:39 - INFO - stdout - {'loss': 0.4902, 'grad_norm': 0.9172224402427673, 'learning_rate': 3.9870761923358405e-06, 'epoch': 2.14} +2025-05-11 09:12:39 - ERROR - stderr - 71%|███████▏ | 2672/3741 [15:46:45<6:14:34, 21.02s/it] +2025-05-11 09:12:59 - ERROR - stderr - 71%|███████▏ | 2673/3741 [15:47:06<6:10:24, 20.81s/it] +2025-05-11 09:12:59 - ERROR - stderr - +2025-05-11 09:12:59 - ERROR - stderr - +2025-05-11 09:12:59 - INFO - stdout - {'loss': 0.5009, 'grad_norm': 0.8651891946792603, 'learning_rate': 3.980159418077403e-06, 'epoch': 2.14} +2025-05-11 09:12:59 - ERROR - stderr - 71%|███████▏ | 2673/3741 [15:47:06<6:10:24, 20.81s/it] +2025-05-11 09:13:20 - ERROR - stderr - 71%|███████▏ | 2674/3741 [15:47:26<6:09:51, 20.80s/it] +2025-05-11 09:13:20 - ERROR - stderr - +2025-05-11 09:13:20 - ERROR - stderr - +2025-05-11 09:13:20 - INFO - stdout - {'loss': 0.5091, 'grad_norm': 0.8147042393684387, 'learning_rate': 3.97324715770044e-06, 'epoch': 2.14} +2025-05-11 09:13:20 - ERROR - stderr - 71%|███████▏ | 2674/3741 [15:47:26<6:09:51, 20.80s/it] +2025-05-11 09:13:40 - ERROR - stderr - 72%|███████▏ | 2675/3741 [15:47:47<6:06:35, 20.63s/it] +2025-05-11 09:13:40 - ERROR - stderr - +2025-05-11 09:13:40 - ERROR - stderr - +2025-05-11 09:13:40 - INFO - stdout - {'loss': 0.4879, 'grad_norm': 0.8397491574287415, 'learning_rate': 3.966339416388013e-06, 'epoch': 2.15} +2025-05-11 09:13:40 - ERROR - stderr - 72%|███████▏ | 2675/3741 [15:47:47<6:06:35, 20.63s/it] +2025-05-11 09:14:03 - ERROR - stderr - 72%|███████▏ | 2676/3741 [15:48:09<6:14:44, 21.11s/it] +2025-05-11 09:14:03 - ERROR - stderr - +2025-05-11 09:14:03 - ERROR - stderr - +2025-05-11 09:14:03 - INFO - stdout - {'loss': 0.5036, 'grad_norm': 0.8129686713218689, 'learning_rate': 3.959436199319771e-06, 'epoch': 2.15} +2025-05-11 09:14:03 - ERROR - stderr - 72%|███████▏ | 2676/3741 [15:48:09<6:14:44, 21.11s/it] +2025-05-11 09:14:23 - ERROR - stderr - 72%|███████▏ | 2677/3741 [15:48:29<6:08:25, 20.78s/it] +2025-05-11 09:14:23 - ERROR - stderr - +2025-05-11 09:14:23 - ERROR - stderr - +2025-05-11 09:14:23 - INFO - stdout - {'loss': 0.4864, 'grad_norm': 0.8532066345214844, 'learning_rate': 3.952537511671988e-06, 'epoch': 2.15} +2025-05-11 09:14:23 - ERROR - stderr - 72%|███████▏ | 2677/3741 [15:48:29<6:08:25, 20.78s/it] +2025-05-11 09:14:46 - ERROR - stderr - 72%|███████▏ | 2678/3741 [15:48:52<6:21:26, 21.53s/it] +2025-05-11 09:14:46 - ERROR - stderr - +2025-05-11 09:14:46 - ERROR - stderr - +2025-05-11 09:14:46 - INFO - stdout - {'loss': 0.4962, 'grad_norm': 0.817719578742981, 'learning_rate': 3.9456433586175335e-06, 'epoch': 2.15} +2025-05-11 09:14:46 - ERROR - stderr - 72%|███████▏ | 2678/3741 [15:48:52<6:21:26, 21.53s/it] +2025-05-11 09:15:06 - ERROR - stderr - 72%|███████▏ | 2679/3741 [15:49:12<6:12:25, 21.04s/it] +2025-05-11 09:15:06 - ERROR - stderr - +2025-05-11 09:15:06 - ERROR - stderr - +2025-05-11 09:15:06 - INFO - stdout - {'loss': 0.4987, 'grad_norm': 0.8393282890319824, 'learning_rate': 3.938753745325872e-06, 'epoch': 2.15} +2025-05-11 09:15:06 - ERROR - stderr - 72%|███████▏ | 2679/3741 [15:49:12<6:12:25, 21.04s/it] +2025-05-11 09:15:29 - ERROR - stderr - 72%|███████▏ | 2680/3741 [15:49:36<6:25:42, 21.81s/it] +2025-05-11 09:15:29 - ERROR - stderr - +2025-05-11 09:15:29 - ERROR - stderr - +2025-05-11 09:15:29 - INFO - stdout - {'loss': 0.5131, 'grad_norm': 0.9043245315551758, 'learning_rate': 3.931868676963082e-06, 'epoch': 2.15} +2025-05-11 09:15:29 - ERROR - stderr - 72%|███████▏ | 2680/3741 [15:49:36<6:25:42, 21.81s/it] +2025-05-11 09:15:49 - ERROR - stderr - 72%|███████▏ | 2681/3741 [15:49:56<6:16:25, 21.31s/it] +2025-05-11 09:15:49 - ERROR - stderr - +2025-05-11 09:15:49 - ERROR - stderr - +2025-05-11 09:15:49 - INFO - stdout - {'loss': 0.5018, 'grad_norm': 0.876274049282074, 'learning_rate': 3.924988158691812e-06, 'epoch': 2.15} +2025-05-11 09:15:49 - ERROR - stderr - 72%|███████▏ | 2681/3741 [15:49:56<6:16:25, 21.31s/it] +2025-05-11 09:16:09 - ERROR - stderr - 72%|███████▏ | 2682/3741 [15:50:16<6:08:41, 20.89s/it] +2025-05-11 09:16:09 - ERROR - stderr - +2025-05-11 09:16:09 - ERROR - stderr - +2025-05-11 09:16:09 - INFO - stdout - {'loss': 0.5111, 'grad_norm': 0.8482949137687683, 'learning_rate': 3.918112195671313e-06, 'epoch': 2.15} +2025-05-11 09:16:09 - ERROR - stderr - 72%|███████▏ | 2682/3741 [15:50:16<6:08:41, 20.89s/it] +2025-05-11 09:16:29 - ERROR - stderr - 72%|███████▏ | 2683/3741 [15:50:35<6:02:31, 20.56s/it] +2025-05-11 09:16:29 - ERROR - stderr - +2025-05-11 09:16:29 - ERROR - stderr - +2025-05-11 09:16:29 - INFO - stdout - {'loss': 0.5079, 'grad_norm': 0.8316020965576172, 'learning_rate': 3.9112407930574195e-06, 'epoch': 2.15} +2025-05-11 09:16:29 - ERROR - stderr - 72%|███████▏ | 2683/3741 [15:50:35<6:02:31, 20.56s/it] +2025-05-11 09:16:49 - ERROR - stderr - 72%|███████▏ | 2684/3741 [15:50:55<5:59:05, 20.38s/it] +2025-05-11 09:16:49 - ERROR - stderr - +2025-05-11 09:16:49 - ERROR - stderr - +2025-05-11 09:16:49 - INFO - stdout - {'loss': 0.4938, 'grad_norm': 0.8304176330566406, 'learning_rate': 3.904373956002532e-06, 'epoch': 2.15} +2025-05-11 09:16:49 - ERROR - stderr - 72%|███████▏ | 2684/3741 [15:50:55<5:59:05, 20.38s/it] +2025-05-11 09:17:12 - ERROR - stderr - 72%|███████▏ | 2685/3741 [15:51:18<6:10:37, 21.06s/it] +2025-05-11 09:17:12 - ERROR - stderr - +2025-05-11 09:17:12 - ERROR - stderr - +2025-05-11 09:17:12 - INFO - stdout - {'loss': 0.4879, 'grad_norm': 0.8400371670722961, 'learning_rate': 3.897511689655653e-06, 'epoch': 2.15} +2025-05-11 09:17:12 - ERROR - stderr - 72%|███████▏ | 2685/3741 [15:51:18<6:10:37, 21.06s/it] +2025-05-11 09:17:32 - ERROR - stderr - 72%|███████▏ | 2686/3741 [15:51:39<6:07:20, 20.89s/it] +2025-05-11 09:17:32 - ERROR - stderr - +2025-05-11 09:17:32 - ERROR - stderr - +2025-05-11 09:17:32 - INFO - stdout - {'loss': 0.4841, 'grad_norm': 0.8171892762184143, 'learning_rate': 3.890653999162333e-06, 'epoch': 2.15} +2025-05-11 09:17:32 - ERROR - stderr - 72%|███████▏ | 2686/3741 [15:51:39<6:07:20, 20.89s/it] +2025-05-11 09:17:56 - ERROR - stderr - 72%|███████▏ | 2687/3741 [15:52:02<6:20:55, 21.68s/it] +2025-05-11 09:17:56 - ERROR - stderr - +2025-05-11 09:17:56 - ERROR - stderr - +2025-05-11 09:17:56 - INFO - stdout - {'loss': 0.5202, 'grad_norm': 0.8671571612358093, 'learning_rate': 3.8838008896647075e-06, 'epoch': 2.15} +2025-05-11 09:17:56 - ERROR - stderr - 72%|███████▏ | 2687/3741 [15:52:02<6:20:55, 21.68s/it] +2025-05-11 09:18:16 - ERROR - stderr - 72%|███████▏ | 2688/3741 [15:52:22<6:12:32, 21.23s/it] +2025-05-11 09:18:16 - ERROR - stderr - +2025-05-11 09:18:16 - ERROR - stderr - +2025-05-11 09:18:16 - INFO - stdout - {'loss': 0.5099, 'grad_norm': 0.8403294682502747, 'learning_rate': 3.876952366301472e-06, 'epoch': 2.16} +2025-05-11 09:18:16 - ERROR - stderr - 72%|███████▏ | 2688/3741 [15:52:22<6:12:32, 21.23s/it] +2025-05-11 09:18:39 - ERROR - stderr - 72%|███████▏ | 2689/3741 [15:52:45<6:21:46, 21.77s/it] +2025-05-11 09:18:39 - ERROR - stderr - +2025-05-11 09:18:39 - ERROR - stderr - +2025-05-11 09:18:39 - INFO - stdout - {'loss': 0.508, 'grad_norm': 0.8804596066474915, 'learning_rate': 3.870108434207877e-06, 'epoch': 2.16} +2025-05-11 09:18:39 - ERROR - stderr - 72%|███████▏ | 2689/3741 [15:52:45<6:21:46, 21.77s/it] +2025-05-11 09:18:59 - ERROR - stderr - 72%|███████▏ | 2690/3741 [15:53:05<6:11:31, 21.21s/it] +2025-05-11 09:18:59 - ERROR - stderr - +2025-05-11 09:18:59 - ERROR - stderr - +2025-05-11 09:18:59 - INFO - stdout - {'loss': 0.5076, 'grad_norm': 0.8431464433670044, 'learning_rate': 3.863269098515738e-06, 'epoch': 2.16} +2025-05-11 09:18:59 - ERROR - stderr - 72%|███████▏ | 2690/3741 [15:53:05<6:11:31, 21.21s/it] +2025-05-11 09:19:22 - ERROR - stderr - 72%|███████▏ | 2691/3741 [15:53:28<6:19:05, 21.66s/it] +2025-05-11 09:19:22 - ERROR - stderr - +2025-05-11 09:19:22 - ERROR - stderr - +2025-05-11 09:19:22 - INFO - stdout - {'loss': 0.4894, 'grad_norm': 0.8322014212608337, 'learning_rate': 3.856434364353424e-06, 'epoch': 2.16} +2025-05-11 09:19:22 - ERROR - stderr - 72%|███████▏ | 2691/3741 [15:53:28<6:19:05, 21.66s/it] +2025-05-11 09:19:41 - ERROR - stderr - 72%|███████▏ | 2692/3741 [15:53:48<6:09:03, 21.11s/it] +2025-05-11 09:19:41 - ERROR - stderr - +2025-05-11 09:19:41 - ERROR - stderr - +2025-05-11 09:19:41 - INFO - stdout - {'loss': 0.489, 'grad_norm': 0.845043420791626, 'learning_rate': 3.84960423684585e-06, 'epoch': 2.16} +2025-05-11 09:19:41 - ERROR - stderr - 72%|███████▏ | 2692/3741 [15:53:48<6:09:03, 21.11s/it] +2025-05-11 09:20:01 - ERROR - stderr - 72%|███████▏ | 2693/3741 [15:54:07<6:01:09, 20.68s/it] +2025-05-11 09:20:01 - ERROR - stderr - +2025-05-11 09:20:01 - ERROR - stderr - +2025-05-11 09:20:01 - INFO - stdout - {'loss': 0.4815, 'grad_norm': 0.8513469696044922, 'learning_rate': 3.842778721114482e-06, 'epoch': 2.16} +2025-05-11 09:20:01 - ERROR - stderr - 72%|███████▏ | 2693/3741 [15:54:07<6:01:09, 20.68s/it] +2025-05-11 09:20:21 - ERROR - stderr - 72%|███████▏ | 2694/3741 [15:54:27<5:55:00, 20.34s/it] +2025-05-11 09:20:21 - ERROR - stderr - +2025-05-11 09:20:21 - ERROR - stderr - +2025-05-11 09:20:21 - INFO - stdout - {'loss': 0.4791, 'grad_norm': 0.8116580247879028, 'learning_rate': 3.835957822277317e-06, 'epoch': 2.16} +2025-05-11 09:20:21 - ERROR - stderr - 72%|███████▏ | 2694/3741 [15:54:27<5:55:00, 20.34s/it] +2025-05-11 09:20:40 - ERROR - stderr - 72%|███████▏ | 2695/3741 [15:54:47<5:51:02, 20.14s/it] +2025-05-11 09:20:40 - ERROR - stderr - +2025-05-11 09:20:40 - ERROR - stderr - +2025-05-11 09:20:40 - INFO - stdout - {'loss': 0.5097, 'grad_norm': 0.8411436676979065, 'learning_rate': 3.829141545448901e-06, 'epoch': 2.16} +2025-05-11 09:20:40 - ERROR - stderr - 72%|███████▏ | 2695/3741 [15:54:47<5:51:02, 20.14s/it] +2025-05-11 09:21:02 - ERROR - stderr - 72%|███████▏ | 2696/3741 [15:55:09<6:00:01, 20.67s/it] +2025-05-11 09:21:02 - ERROR - stderr - +2025-05-11 09:21:02 - ERROR - stderr - +2025-05-11 09:21:02 - INFO - stdout - {'loss': 0.4959, 'grad_norm': 0.8362702131271362, 'learning_rate': 3.82232989574031e-06, 'epoch': 2.16} +2025-05-11 09:21:02 - ERROR - stderr - 72%|███████▏ | 2696/3741 [15:55:09<6:00:01, 20.67s/it] +2025-05-11 09:21:22 - ERROR - stderr - 72%|███████▏ | 2697/3741 [15:55:28<5:54:16, 20.36s/it] +2025-05-11 09:21:22 - ERROR - stderr - +2025-05-11 09:21:22 - ERROR - stderr - +2025-05-11 09:21:22 - INFO - stdout - {'loss': 0.5093, 'grad_norm': 0.8088338375091553, 'learning_rate': 3.815522878259153e-06, 'epoch': 2.16} +2025-05-11 09:21:22 - ERROR - stderr - 72%|███████▏ | 2697/3741 [15:55:28<5:54:16, 20.36s/it] +2025-05-11 09:21:45 - ERROR - stderr - 72%|███████▏ | 2698/3741 [15:55:51<6:07:28, 21.14s/it] +2025-05-11 09:21:45 - ERROR - stderr - +2025-05-11 09:21:45 - ERROR - stderr - +2025-05-11 09:21:45 - INFO - stdout - {'loss': 0.4834, 'grad_norm': 0.8484572768211365, 'learning_rate': 3.8087204981095625e-06, 'epoch': 2.16} +2025-05-11 09:21:45 - ERROR - stderr - 72%|███████▏ | 2698/3741 [15:55:51<6:07:28, 21.14s/it] +2025-05-11 09:22:04 - ERROR - stderr - 72%|███████▏ | 2699/3741 [15:56:11<5:58:03, 20.62s/it] +2025-05-11 09:22:04 - ERROR - stderr - +2025-05-11 09:22:04 - ERROR - stderr - +2025-05-11 09:22:04 - INFO - stdout - {'loss': 0.4883, 'grad_norm': 0.8214154839515686, 'learning_rate': 3.8019227603921927e-06, 'epoch': 2.16} +2025-05-11 09:22:04 - ERROR - stderr - 72%|███████▏ | 2699/3741 [15:56:11<5:58:03, 20.62s/it] +2025-05-11 09:22:28 - ERROR - stderr - 72%|███████▏ | 2700/3741 [15:56:34<6:11:33, 21.42s/it] +2025-05-11 09:22:28 - ERROR - stderr - +2025-05-11 09:22:28 - ERROR - stderr - +2025-05-11 09:22:28 - INFO - stdout - {'loss': 0.4891, 'grad_norm': 0.8145144581794739, 'learning_rate': 3.7951296702042194e-06, 'epoch': 2.17} +2025-05-11 09:22:28 - ERROR - stderr - 72%|███████▏ | 2700/3741 [15:56:34<6:11:33, 21.42s/it] +2025-05-11 09:22:47 - ERROR - stderr - 72%|███████▏ | 2701/3741 [15:56:53<6:01:24, 20.85s/it] +2025-05-11 09:22:47 - ERROR - stderr - +2025-05-11 09:22:47 - ERROR - stderr - +2025-05-11 09:22:47 - INFO - stdout - {'loss': 0.4821, 'grad_norm': 0.7915971875190735, 'learning_rate': 3.7883412326393352e-06, 'epoch': 2.17} +2025-05-11 09:22:47 - ERROR - stderr - 72%|███████▏ | 2701/3741 [15:56:53<6:01:24, 20.85s/it] +2025-05-11 09:23:07 - ERROR - stderr - 72%|███████▏ | 2702/3741 [15:57:13<5:56:57, 20.61s/it] +2025-05-11 09:23:07 - ERROR - stderr - +2025-05-11 09:23:07 - ERROR - stderr - +2025-05-11 09:23:07 - INFO - stdout - {'loss': 0.4941, 'grad_norm': 0.809570848941803, 'learning_rate': 3.7815574527877395e-06, 'epoch': 2.17} +2025-05-11 09:23:07 - ERROR - stderr - 72%|███████▏ | 2702/3741 [15:57:13<5:56:57, 20.61s/it] +2025-05-11 09:23:27 - ERROR - stderr - 72%|███████▏ | 2703/3741 [15:57:33<5:51:06, 20.30s/it] +2025-05-11 09:23:27 - ERROR - stderr - +2025-05-11 09:23:27 - ERROR - stderr - +2025-05-11 09:23:27 - INFO - stdout - {'loss': 0.4698, 'grad_norm': 0.8416109085083008, 'learning_rate': 3.7747783357361455e-06, 'epoch': 2.17} +2025-05-11 09:23:27 - ERROR - stderr - 72%|███████▏ | 2703/3741 [15:57:33<5:51:06, 20.30s/it] +2025-05-11 09:23:46 - ERROR - stderr - 72%|███████▏ | 2704/3741 [15:57:53<5:47:42, 20.12s/it] +2025-05-11 09:23:46 - ERROR - stderr - +2025-05-11 09:23:46 - ERROR - stderr - +2025-05-11 09:23:46 - INFO - stdout - {'loss': 0.4938, 'grad_norm': 0.8841302394866943, 'learning_rate': 3.7680038865677603e-06, 'epoch': 2.17} +2025-05-11 09:23:46 - ERROR - stderr - 72%|███████▏ | 2704/3741 [15:57:53<5:47:42, 20.12s/it] +2025-05-11 09:24:08 - ERROR - stderr - 72%|███████▏ | 2705/3741 [15:58:14<5:52:37, 20.42s/it] +2025-05-11 09:24:08 - ERROR - stderr - +2025-05-11 09:24:08 - ERROR - stderr - +2025-05-11 09:24:08 - INFO - stdout - {'loss': 0.4911, 'grad_norm': 0.8262732028961182, 'learning_rate': 3.7612341103622984e-06, 'epoch': 2.17} +2025-05-11 09:24:08 - ERROR - stderr - 72%|███████▏ | 2705/3741 [15:58:14<5:52:37, 20.42s/it] +2025-05-11 09:24:28 - ERROR - stderr - 72%|███████▏ | 2706/3741 [15:58:34<5:50:35, 20.32s/it] +2025-05-11 09:24:28 - ERROR - stderr - +2025-05-11 09:24:28 - ERROR - stderr - +2025-05-11 09:24:28 - INFO - stdout - {'loss': 0.467, 'grad_norm': 0.815339207649231, 'learning_rate': 3.7544690121959704e-06, 'epoch': 2.17} +2025-05-11 09:24:28 - ERROR - stderr - 72%|███████▏ | 2706/3741 [15:58:34<5:50:35, 20.32s/it] +2025-05-11 09:24:49 - ERROR - stderr - 72%|███████▏ | 2707/3741 [15:58:55<5:56:21, 20.68s/it] +2025-05-11 09:24:49 - ERROR - stderr - +2025-05-11 09:24:49 - ERROR - stderr - +2025-05-11 09:24:49 - INFO - stdout - {'loss': 0.4913, 'grad_norm': 0.8234750628471375, 'learning_rate': 3.7477085971414785e-06, 'epoch': 2.17} +2025-05-11 09:24:49 - ERROR - stderr - 72%|███████▏ | 2707/3741 [15:58:55<5:56:21, 20.68s/it] +2025-05-11 09:25:09 - ERROR - stderr - 72%|███████▏ | 2708/3741 [15:59:15<5:50:14, 20.34s/it] +2025-05-11 09:25:09 - ERROR - stderr - +2025-05-11 09:25:09 - ERROR - stderr - +2025-05-11 09:25:09 - INFO - stdout - {'loss': 0.505, 'grad_norm': 0.8718725442886353, 'learning_rate': 3.7409528702680078e-06, 'epoch': 2.17} +2025-05-11 09:25:09 - ERROR - stderr - 72%|███████▏ | 2708/3741 [15:59:15<5:50:14, 20.34s/it] +2025-05-11 09:25:32 - ERROR - stderr - 72%|███████▏ | 2709/3741 [15:59:38<6:03:43, 21.15s/it] +2025-05-11 09:25:32 - ERROR - stderr - +2025-05-11 09:25:32 - ERROR - stderr - +2025-05-11 09:25:32 - INFO - stdout - {'loss': 0.4958, 'grad_norm': 0.8053902983665466, 'learning_rate': 3.7342018366412336e-06, 'epoch': 2.17} +2025-05-11 09:25:32 - ERROR - stderr - 72%|███████▏ | 2709/3741 [15:59:38<6:03:43, 21.15s/it] +2025-05-11 09:25:51 - ERROR - stderr - 72%|███████▏ | 2710/3741 [15:59:58<5:56:17, 20.73s/it] +2025-05-11 09:25:51 - ERROR - stderr - +2025-05-11 09:25:51 - ERROR - stderr - +2025-05-11 09:25:51 - INFO - stdout - {'loss': 0.4882, 'grad_norm': 0.8399242758750916, 'learning_rate': 3.7274555013233118e-06, 'epoch': 2.17} +2025-05-11 09:25:51 - ERROR - stderr - 72%|███████▏ | 2710/3741 [15:59:58<5:56:17, 20.73s/it] +2025-05-11 09:26:12 - ERROR - stderr - 72%|███████▏ | 2711/3741 [16:00:18<5:53:03, 20.57s/it] +2025-05-11 09:26:12 - ERROR - stderr - +2025-05-11 09:26:12 - ERROR - stderr - +2025-05-11 09:26:12 - INFO - stdout - {'loss': 0.4969, 'grad_norm': 0.8590044975280762, 'learning_rate': 3.720713869372875e-06, 'epoch': 2.17} +2025-05-11 09:26:12 - ERROR - stderr - 72%|███████▏ | 2711/3741 [16:00:18<5:53:03, 20.57s/it] +2025-05-11 09:26:31 - ERROR - stderr - 72%|███████▏ | 2712/3741 [16:00:38<5:48:42, 20.33s/it] +2025-05-11 09:26:31 - ERROR - stderr - +2025-05-11 09:26:31 - ERROR - stderr - +2025-05-11 09:26:31 - INFO - stdout - {'loss': 0.4689, 'grad_norm': 0.8541808724403381, 'learning_rate': 3.71397694584503e-06, 'epoch': 2.17} +2025-05-11 09:26:31 - ERROR - stderr - 72%|███████▏ | 2712/3741 [16:00:38<5:48:42, 20.33s/it] +2025-05-11 09:26:51 - ERROR - stderr - 73%|███████▎ | 2713/3741 [16:00:58<5:46:37, 20.23s/it] +2025-05-11 09:26:51 - ERROR - stderr - +2025-05-11 09:26:51 - ERROR - stderr - +2025-05-11 09:26:51 - INFO - stdout - {'loss': 0.4941, 'grad_norm': 0.8489252328872681, 'learning_rate': 3.7072447357913477e-06, 'epoch': 2.18} +2025-05-11 09:26:51 - ERROR - stderr - 73%|███████▎ | 2713/3741 [16:00:58<5:46:37, 20.23s/it] +2025-05-11 09:27:14 - ERROR - stderr - 73%|███████▎ | 2714/3741 [16:01:20<5:57:29, 20.89s/it] +2025-05-11 09:27:14 - ERROR - stderr - +2025-05-11 09:27:14 - ERROR - stderr - +2025-05-11 09:27:14 - INFO - stdout - {'loss': 0.4818, 'grad_norm': 0.8415629267692566, 'learning_rate': 3.700517244259868e-06, 'epoch': 2.18} +2025-05-11 09:27:14 - ERROR - stderr - 73%|███████▎ | 2714/3741 [16:01:20<5:57:29, 20.89s/it] +2025-05-11 09:27:34 - ERROR - stderr - 73%|███████▎ | 2715/3741 [16:01:40<5:51:48, 20.57s/it] +2025-05-11 09:27:34 - ERROR - stderr - +2025-05-11 09:27:34 - ERROR - stderr - +2025-05-11 09:27:34 - INFO - stdout - {'loss': 0.5023, 'grad_norm': 0.8425331711769104, 'learning_rate': 3.693794476295096e-06, 'epoch': 2.18} +2025-05-11 09:27:34 - ERROR - stderr - 73%|███████▎ | 2715/3741 [16:01:40<5:51:48, 20.57s/it] +2025-05-11 09:27:56 - ERROR - stderr - 73%|███████▎ | 2716/3741 [16:02:02<5:58:18, 20.97s/it] +2025-05-11 09:27:56 - ERROR - stderr - +2025-05-11 09:27:56 - ERROR - stderr - +2025-05-11 09:27:56 - INFO - stdout - {'loss': 0.4943, 'grad_norm': 0.8632931113243103, 'learning_rate': 3.687076436937992e-06, 'epoch': 2.18} +2025-05-11 09:27:56 - ERROR - stderr - 73%|███████▎ | 2716/3741 [16:02:02<5:58:18, 20.97s/it] +2025-05-11 09:28:15 - ERROR - stderr - 73%|███████▎ | 2717/3741 [16:02:21<5:50:50, 20.56s/it] +2025-05-11 09:28:15 - ERROR - stderr - +2025-05-11 09:28:15 - ERROR - stderr - +2025-05-11 09:28:15 - INFO - stdout - {'loss': 0.4764, 'grad_norm': 0.8151559829711914, 'learning_rate': 3.6803631312259724e-06, 'epoch': 2.18} +2025-05-11 09:28:15 - ERROR - stderr - 73%|███████▎ | 2717/3741 [16:02:22<5:50:50, 20.56s/it] +2025-05-11 09:28:39 - ERROR - stderr - 73%|███████▎ | 2718/3741 [16:02:45<6:06:42, 21.51s/it] +2025-05-11 09:28:39 - ERROR - stderr - +2025-05-11 09:28:39 - ERROR - stderr - +2025-05-11 09:28:39 - INFO - stdout - {'loss': 0.4779, 'grad_norm': 0.8196760416030884, 'learning_rate': 3.6736545641928965e-06, 'epoch': 2.18} +2025-05-11 09:28:39 - ERROR - stderr - 73%|███████▎ | 2718/3741 [16:02:45<6:06:42, 21.51s/it] +2025-05-11 09:28:59 - ERROR - stderr - 73%|███████▎ | 2719/3741 [16:03:05<5:58:33, 21.05s/it] +2025-05-11 09:28:59 - ERROR - stderr - +2025-05-11 09:28:59 - ERROR - stderr - +2025-05-11 09:28:59 - INFO - stdout - {'loss': 0.4855, 'grad_norm': 0.7820659279823303, 'learning_rate': 3.6669507408690806e-06, 'epoch': 2.18} +2025-05-11 09:28:59 - ERROR - stderr - 73%|███████▎ | 2719/3741 [16:03:05<5:58:33, 21.05s/it] +2025-05-11 09:29:21 - ERROR - stderr - 73%|███████▎ | 2720/3741 [16:03:28<6:05:20, 21.47s/it] +2025-05-11 09:29:21 - ERROR - stderr - +2025-05-11 09:29:21 - ERROR - stderr - +2025-05-11 09:29:21 - INFO - stdout - {'loss': 0.4851, 'grad_norm': 0.8023489713668823, 'learning_rate': 3.6602516662812824e-06, 'epoch': 2.18} +2025-05-11 09:29:21 - ERROR - stderr - 73%|███████▎ | 2720/3741 [16:03:28<6:05:20, 21.47s/it] +2025-05-11 09:29:41 - ERROR - stderr - 73%|███████▎ | 2721/3741 [16:03:47<5:56:32, 20.97s/it] +2025-05-11 09:29:41 - ERROR - stderr - +2025-05-11 09:29:41 - ERROR - stderr - +2025-05-11 09:29:41 - INFO - stdout - {'loss': 0.4893, 'grad_norm': 0.9628251194953918, 'learning_rate': 3.653557345452685e-06, 'epoch': 2.18} +2025-05-11 09:29:41 - ERROR - stderr - 73%|███████▎ | 2721/3741 [16:03:47<5:56:32, 20.97s/it] +2025-05-11 09:30:04 - ERROR - stderr - 73%|███████▎ | 2722/3741 [16:04:10<6:04:31, 21.46s/it] +2025-05-11 09:30:04 - ERROR - stderr - +2025-05-11 09:30:04 - ERROR - stderr - +2025-05-11 09:30:04 - INFO - stdout - {'loss': 0.4775, 'grad_norm': 0.8556442260742188, 'learning_rate': 3.6468677834029343e-06, 'epoch': 2.18} +2025-05-11 09:30:04 - ERROR - stderr - 73%|███████▎ | 2722/3741 [16:04:10<6:04:31, 21.46s/it] +2025-05-11 09:30:24 - ERROR - stderr - 73%|███████▎ | 2723/3741 [16:04:30<5:57:05, 21.05s/it] +2025-05-11 09:30:24 - ERROR - stderr - +2025-05-11 09:30:24 - ERROR - stderr - +2025-05-11 09:30:24 - INFO - stdout - {'loss': 0.4782, 'grad_norm': 0.9213688373565674, 'learning_rate': 3.6401829851480786e-06, 'epoch': 2.18} +2025-05-11 09:30:24 - ERROR - stderr - 73%|███████▎ | 2723/3741 [16:04:30<5:57:05, 21.05s/it] +2025-05-11 09:30:46 - ERROR - stderr - 73%|███████▎ | 2724/3741 [16:04:52<6:02:39, 21.40s/it] +2025-05-11 09:30:46 - ERROR - stderr - +2025-05-11 09:30:46 - ERROR - stderr - +2025-05-11 09:30:46 - INFO - stdout - {'loss': 0.487, 'grad_norm': 0.8511019349098206, 'learning_rate': 3.6335029557006117e-06, 'epoch': 2.18} +2025-05-11 09:30:46 - ERROR - stderr - 73%|███████▎ | 2724/3741 [16:04:52<6:02:39, 21.40s/it] +2025-05-11 09:30:47 - INFO - stdout - WARNING: tokenization mismatch: 3227 vs. 3245. (ignored) +2025-05-11 09:31:06 - ERROR - stderr - 73%|███████▎ | 2725/3741 [16:05:12<5:53:37, 20.88s/it] +2025-05-11 09:31:06 - ERROR - stderr - +2025-05-11 09:31:06 - ERROR - stderr - +2025-05-11 09:31:06 - INFO - stdout - {'loss': 0.4673, 'grad_norm': 0.8275080323219299, 'learning_rate': 3.626827700069452e-06, 'epoch': 2.19} +2025-05-11 09:31:06 - ERROR - stderr - 73%|███████▎ | 2725/3741 [16:05:12<5:53:37, 20.88s/it] +2025-05-11 09:31:25 - ERROR - stderr - 73%|███████▎ | 2726/3741 [16:05:32<5:46:44, 20.50s/it] +2025-05-11 09:31:25 - ERROR - stderr - +2025-05-11 09:31:25 - ERROR - stderr - +2025-05-11 09:31:25 - INFO - stdout - {'loss': 0.5106, 'grad_norm': 0.8424668908119202, 'learning_rate': 3.6201572232599227e-06, 'epoch': 2.19} +2025-05-11 09:31:25 - ERROR - stderr - 73%|███████▎ | 2726/3741 [16:05:32<5:46:44, 20.50s/it] +2025-05-11 09:31:45 - ERROR - stderr - 73%|███████▎ | 2727/3741 [16:05:51<5:43:04, 20.30s/it] +2025-05-11 09:31:45 - ERROR - stderr - +2025-05-11 09:31:45 - ERROR - stderr - +2025-05-11 09:31:45 - INFO - stdout - {'loss': 0.5259, 'grad_norm': 0.9268709421157837, 'learning_rate': 3.6134915302737862e-06, 'epoch': 2.19} +2025-05-11 09:31:45 - ERROR - stderr - 73%|███████▎ | 2727/3741 [16:05:51<5:43:04, 20.30s/it] +2025-05-11 09:32:05 - ERROR - stderr - 73%|███████▎ | 2728/3741 [16:06:12<5:41:26, 20.22s/it] +2025-05-11 09:32:05 - ERROR - stderr - +2025-05-11 09:32:05 - ERROR - stderr - +2025-05-11 09:32:05 - INFO - stdout - {'loss': 0.5069, 'grad_norm': 0.8410111665725708, 'learning_rate': 3.606830626109198e-06, 'epoch': 2.19} +2025-05-11 09:32:05 - ERROR - stderr - 73%|███████▎ | 2728/3741 [16:06:12<5:41:26, 20.22s/it] +2025-05-11 09:32:25 - ERROR - stderr - 73%|███████▎ | 2729/3741 [16:06:31<5:37:44, 20.02s/it] +2025-05-11 09:32:25 - ERROR - stderr - +2025-05-11 09:32:25 - ERROR - stderr - +2025-05-11 09:32:25 - INFO - stdout - {'loss': 0.4787, 'grad_norm': 0.8103172779083252, 'learning_rate': 3.600174515760733e-06, 'epoch': 2.19} +2025-05-11 09:32:25 - ERROR - stderr - 73%|███████▎ | 2729/3741 [16:06:31<5:37:44, 20.02s/it] +2025-05-11 09:32:45 - ERROR - stderr - 73%|███████▎ | 2730/3741 [16:06:51<5:36:12, 19.95s/it] +2025-05-11 09:32:45 - ERROR - stderr - +2025-05-11 09:32:45 - ERROR - stderr - +2025-05-11 09:32:45 - INFO - stdout - {'loss': 0.5043, 'grad_norm': 0.8217037916183472, 'learning_rate': 3.5935232042193734e-06, 'epoch': 2.19} +2025-05-11 09:32:45 - ERROR - stderr - 73%|███████▎ | 2730/3741 [16:06:51<5:36:12, 19.95s/it] +2025-05-11 09:33:06 - ERROR - stderr - 73%|███████▎ | 2731/3741 [16:07:12<5:41:48, 20.31s/it] +2025-05-11 09:33:06 - ERROR - stderr - +2025-05-11 09:33:06 - ERROR - stderr - +2025-05-11 09:33:06 - INFO - stdout - {'loss': 0.5117, 'grad_norm': 0.8149628639221191, 'learning_rate': 3.58687669647249e-06, 'epoch': 2.19} +2025-05-11 09:33:06 - ERROR - stderr - 73%|███████▎ | 2731/3741 [16:07:12<5:41:48, 20.31s/it] +2025-05-11 09:33:25 - ERROR - stderr - 73%|███████▎ | 2732/3741 [16:07:32<5:38:15, 20.11s/it] +2025-05-11 09:33:25 - ERROR - stderr - +2025-05-11 09:33:25 - ERROR - stderr - +2025-05-11 09:33:25 - INFO - stdout - {'loss': 0.4684, 'grad_norm': 0.8104637265205383, 'learning_rate': 3.5802349975038718e-06, 'epoch': 2.19} +2025-05-11 09:33:25 - ERROR - stderr - 73%|███████▎ | 2732/3741 [16:07:32<5:38:15, 20.11s/it] +2025-05-11 09:33:48 - ERROR - stderr - 73%|███████▎ | 2733/3741 [16:07:55<5:52:22, 20.98s/it] +2025-05-11 09:33:48 - ERROR - stderr - +2025-05-11 09:33:48 - ERROR - stderr - +2025-05-11 09:33:48 - INFO - stdout - {'loss': 0.4875, 'grad_norm': 0.8468414545059204, 'learning_rate': 3.573598112293687e-06, 'epoch': 2.19} +2025-05-11 09:33:48 - ERROR - stderr - 73%|███████▎ | 2733/3741 [16:07:55<5:52:22, 20.98s/it] +2025-05-11 09:34:08 - ERROR - stderr - 73%|███████▎ | 2734/3741 [16:08:14<5:45:50, 20.61s/it] +2025-05-11 09:34:08 - ERROR - stderr - +2025-05-11 09:34:08 - ERROR - stderr - +2025-05-11 09:34:08 - INFO - stdout - {'loss': 0.5076, 'grad_norm': 0.8977518677711487, 'learning_rate': 3.5669660458184886e-06, 'epoch': 2.19} +2025-05-11 09:34:08 - ERROR - stderr - 73%|███████▎ | 2734/3741 [16:08:14<5:45:50, 20.61s/it] +2025-05-11 09:34:31 - ERROR - stderr - 73%|███████▎ | 2735/3741 [16:08:38<5:59:28, 21.44s/it] +2025-05-11 09:34:31 - ERROR - stderr - +2025-05-11 09:34:31 - ERROR - stderr - +2025-05-11 09:34:31 - INFO - stdout - {'loss': 0.4962, 'grad_norm': 0.8071417808532715, 'learning_rate': 3.560338803051241e-06, 'epoch': 2.19} +2025-05-11 09:34:31 - ERROR - stderr - 73%|███████▎ | 2735/3741 [16:08:38<5:59:28, 21.44s/it] +2025-05-11 09:34:51 - ERROR - stderr - 73%|███████▎ | 2736/3741 [16:08:57<5:49:54, 20.89s/it] +2025-05-11 09:34:51 - ERROR - stderr - +2025-05-11 09:34:51 - ERROR - stderr - +2025-05-11 09:34:51 - INFO - stdout - {'loss': 0.5087, 'grad_norm': 0.8739727139472961, 'learning_rate': 3.5537163889612656e-06, 'epoch': 2.19} +2025-05-11 09:34:51 - ERROR - stderr - 73%|███████▎ | 2736/3741 [16:08:57<5:49:54, 20.89s/it] +2025-05-11 09:35:14 - ERROR - stderr - 73%|███████▎ | 2737/3741 [16:09:20<5:58:31, 21.43s/it] +2025-05-11 09:35:14 - ERROR - stderr - +2025-05-11 09:35:14 - ERROR - stderr - +2025-05-11 09:35:14 - INFO - stdout - {'loss': 0.4997, 'grad_norm': 0.8069101572036743, 'learning_rate': 3.547098808514279e-06, 'epoch': 2.19} +2025-05-11 09:35:14 - ERROR - stderr - 73%|███████▎ | 2737/3741 [16:09:20<5:58:31, 21.43s/it] +2025-05-11 09:35:33 - ERROR - stderr - 73%|███████▎ | 2738/3741 [16:09:40<5:49:12, 20.89s/it] +2025-05-11 09:35:33 - ERROR - stderr - +2025-05-11 09:35:33 - ERROR - stderr - +2025-05-11 09:35:33 - INFO - stdout - {'loss': 0.4875, 'grad_norm': 0.8516671061515808, 'learning_rate': 3.5404860666723695e-06, 'epoch': 2.2} +2025-05-11 09:35:33 - ERROR - stderr - 73%|███████▎ | 2738/3741 [16:09:40<5:49:12, 20.89s/it] +2025-05-11 09:35:56 - ERROR - stderr - 73%|███████▎ | 2739/3741 [16:10:03<5:58:50, 21.49s/it] +2025-05-11 09:35:56 - ERROR - stderr - +2025-05-11 09:35:56 - ERROR - stderr - +2025-05-11 09:35:56 - INFO - stdout - {'loss': 0.4838, 'grad_norm': 0.8388312458992004, 'learning_rate': 3.5338781683939882e-06, 'epoch': 2.2} +2025-05-11 09:35:56 - ERROR - stderr - 73%|███████▎ | 2739/3741 [16:10:03<5:58:50, 21.49s/it] +2025-05-11 09:36:16 - ERROR - stderr - 73%|███████▎ | 2740/3741 [16:10:22<5:50:21, 21.00s/it] +2025-05-11 09:36:16 - ERROR - stderr - +2025-05-11 09:36:16 - ERROR - stderr - +2025-05-11 09:36:16 - INFO - stdout - {'loss': 0.5076, 'grad_norm': 0.8394985198974609, 'learning_rate': 3.527275118633974e-06, 'epoch': 2.2} +2025-05-11 09:36:16 - ERROR - stderr - 73%|███████▎ | 2740/3741 [16:10:22<5:50:21, 21.00s/it] +2025-05-11 09:36:38 - ERROR - stderr - 73%|███████▎ | 2741/3741 [16:10:45<5:55:58, 21.36s/it] +2025-05-11 09:36:38 - ERROR - stderr - +2025-05-11 09:36:38 - ERROR - stderr - +2025-05-11 09:36:38 - INFO - stdout - {'loss': 0.4994, 'grad_norm': 0.8449310064315796, 'learning_rate': 3.52067692234351e-06, 'epoch': 2.2} +2025-05-11 09:36:38 - ERROR - stderr - 73%|███████▎ | 2741/3741 [16:10:45<5:55:58, 21.36s/it] +2025-05-11 09:36:58 - ERROR - stderr - 73%|███████▎ | 2742/3741 [16:11:05<5:48:13, 20.91s/it] +2025-05-11 09:36:58 - ERROR - stderr - +2025-05-11 09:36:58 - ERROR - stderr - +2025-05-11 09:36:58 - INFO - stdout - {'loss': 0.4839, 'grad_norm': 0.806086003780365, 'learning_rate': 3.514083584470149e-06, 'epoch': 2.2} +2025-05-11 09:36:58 - ERROR - stderr - 73%|███████▎ | 2742/3741 [16:11:05<5:48:13, 20.91s/it] +2025-05-11 09:37:19 - ERROR - stderr - 73%|███████▎ | 2743/3741 [16:11:25<5:46:51, 20.85s/it] +2025-05-11 09:37:19 - ERROR - stderr - +2025-05-11 09:37:19 - ERROR - stderr - +2025-05-11 09:37:19 - INFO - stdout - {'loss': 0.4765, 'grad_norm': 0.8380177617073059, 'learning_rate': 3.507495109957808e-06, 'epoch': 2.2} +2025-05-11 09:37:19 - ERROR - stderr - 73%|███████▎ | 2743/3741 [16:11:25<5:46:51, 20.85s/it] +2025-05-11 09:37:39 - ERROR - stderr - 73%|███████▎ | 2744/3741 [16:11:45<5:41:32, 20.55s/it] +2025-05-11 09:37:39 - ERROR - stderr - +2025-05-11 09:37:39 - ERROR - stderr - +2025-05-11 09:37:39 - INFO - stdout - {'loss': 0.4788, 'grad_norm': 0.8069396615028381, 'learning_rate': 3.5009115037467355e-06, 'epoch': 2.2} +2025-05-11 09:37:39 - ERROR - stderr - 73%|███████▎ | 2744/3741 [16:11:45<5:41:32, 20.55s/it] +2025-05-11 09:37:59 - ERROR - stderr - 73%|███████▎ | 2745/3741 [16:12:05<5:37:40, 20.34s/it] +2025-05-11 09:37:59 - ERROR - stderr - +2025-05-11 09:37:59 - ERROR - stderr - +2025-05-11 09:37:59 - INFO - stdout - {'loss': 0.505, 'grad_norm': 0.8960397243499756, 'learning_rate': 3.4943327707735586e-06, 'epoch': 2.2} +2025-05-11 09:37:59 - ERROR - stderr - 73%|███████▎ | 2745/3741 [16:12:05<5:37:40, 20.34s/it] +2025-05-11 09:38:19 - ERROR - stderr - 73%|███████▎ | 2746/3741 [16:12:26<5:39:26, 20.47s/it] +2025-05-11 09:38:19 - ERROR - stderr - +2025-05-11 09:38:19 - ERROR - stderr - +2025-05-11 09:38:19 - INFO - stdout - {'loss': 0.4837, 'grad_norm': 0.8247140645980835, 'learning_rate': 3.4877589159712266e-06, 'epoch': 2.2} +2025-05-11 09:38:19 - ERROR - stderr - 73%|███████▎ | 2746/3741 [16:12:26<5:39:26, 20.47s/it] +2025-05-11 09:38:39 - ERROR - stderr - 73%|███████▎ | 2747/3741 [16:12:45<5:35:18, 20.24s/it] +2025-05-11 09:38:39 - ERROR - stderr - +2025-05-11 09:38:39 - ERROR - stderr - +2025-05-11 09:38:39 - INFO - stdout - {'loss': 0.4965, 'grad_norm': 0.8580472469329834, 'learning_rate': 3.481189944269041e-06, 'epoch': 2.2} +2025-05-11 09:38:39 - ERROR - stderr - 73%|███████▎ | 2747/3741 [16:12:45<5:35:18, 20.24s/it] +2025-05-11 09:39:00 - ERROR - stderr - 73%|███████▎ | 2748/3741 [16:13:07<5:39:58, 20.54s/it] +2025-05-11 09:39:00 - ERROR - stderr - +2025-05-11 09:39:00 - ERROR - stderr - +2025-05-11 09:39:00 - INFO - stdout - {'loss': 0.4888, 'grad_norm': 0.8549031615257263, 'learning_rate': 3.4746258605926443e-06, 'epoch': 2.2} +2025-05-11 09:39:00 - ERROR - stderr - 73%|███████▎ | 2748/3741 [16:13:07<5:39:58, 20.54s/it] +2025-05-11 09:39:20 - ERROR - stderr - 73%|███████▎ | 2749/3741 [16:13:27<5:36:50, 20.37s/it] +2025-05-11 09:39:20 - ERROR - stderr - +2025-05-11 09:39:20 - ERROR - stderr - +2025-05-11 09:39:20 - INFO - stdout - {'loss': 0.4763, 'grad_norm': 0.8439877033233643, 'learning_rate': 3.468066669864004e-06, 'epoch': 2.2} +2025-05-11 09:39:20 - ERROR - stderr - 73%|███████▎ | 2749/3741 [16:13:27<5:36:50, 20.37s/it] +2025-05-11 09:39:43 - ERROR - stderr - 74%|███████▎ | 2750/3741 [16:13:49<5:46:30, 20.98s/it] +2025-05-11 09:39:43 - ERROR - stderr - +2025-05-11 09:39:43 - ERROR - stderr - +2025-05-11 09:39:43 - INFO - stdout - {'loss': 0.5199, 'grad_norm': 0.843323290348053, 'learning_rate': 3.461512377001427e-06, 'epoch': 2.21} +2025-05-11 09:39:43 - ERROR - stderr - 74%|███████��� | 2750/3741 [16:13:49<5:46:30, 20.98s/it] +2025-05-11 09:40:03 - ERROR - stderr - 74%|███████▎ | 2751/3741 [16:14:09<5:42:29, 20.76s/it] +2025-05-11 09:40:03 - ERROR - stderr - +2025-05-11 09:40:03 - ERROR - stderr - +2025-05-11 09:40:03 - INFO - stdout - {'loss': 0.489, 'grad_norm': 0.8714971542358398, 'learning_rate': 3.4549629869195467e-06, 'epoch': 2.21} +2025-05-11 09:40:03 - ERROR - stderr - 74%|███████▎ | 2751/3741 [16:14:09<5:42:29, 20.76s/it] +2025-05-11 09:40:26 - ERROR - stderr - 74%|███████▎ | 2752/3741 [16:14:32<5:52:51, 21.41s/it] +2025-05-11 09:40:26 - ERROR - stderr - +2025-05-11 09:40:26 - ERROR - stderr - +2025-05-11 09:40:26 - INFO - stdout - {'loss': 0.4635, 'grad_norm': 0.8079264163970947, 'learning_rate': 3.448418504529318e-06, 'epoch': 2.21} +2025-05-11 09:40:26 - ERROR - stderr - 74%|███████▎ | 2752/3741 [16:14:32<5:52:51, 21.41s/it] +2025-05-11 09:40:46 - ERROR - stderr - 74%|███████▎ | 2753/3741 [16:14:52<5:44:58, 20.95s/it] +2025-05-11 09:40:46 - ERROR - stderr - +2025-05-11 09:40:46 - ERROR - stderr - +2025-05-11 09:40:46 - INFO - stdout - {'loss': 0.4858, 'grad_norm': 0.8408187031745911, 'learning_rate': 3.44187893473802e-06, 'epoch': 2.21} +2025-05-11 09:40:46 - ERROR - stderr - 74%|███████▎ | 2753/3741 [16:14:52<5:44:58, 20.95s/it] +2025-05-11 09:41:09 - ERROR - stderr - 74%|███████▎ | 2754/3741 [16:15:16<5:57:10, 21.71s/it] +2025-05-11 09:41:09 - ERROR - stderr - +2025-05-11 09:41:09 - ERROR - stderr - +2025-05-11 09:41:09 - INFO - stdout - {'loss': 0.4884, 'grad_norm': 0.8397002220153809, 'learning_rate': 3.435344282449239e-06, 'epoch': 2.21} +2025-05-11 09:41:09 - ERROR - stderr - 74%|███████▎ | 2754/3741 [16:15:16<5:57:10, 21.71s/it] +2025-05-11 09:41:29 - ERROR - stderr - 74%|███████▎ | 2755/3741 [16:15:36<5:48:14, 21.19s/it] +2025-05-11 09:41:29 - ERROR - stderr - +2025-05-11 09:41:29 - ERROR - stderr - +2025-05-11 09:41:29 - INFO - stdout - {'loss': 0.4985, 'grad_norm': 0.9133360981941223, 'learning_rate': 3.4288145525628813e-06, 'epoch': 2.21} +2025-05-11 09:41:29 - ERROR - stderr - 74%|███████▎ | 2755/3741 [16:15:36<5:48:14, 21.19s/it] +2025-05-11 09:41:52 - ERROR - stderr - 74%|███████▎ | 2756/3741 [16:15:59<5:58:04, 21.81s/it] +2025-05-11 09:41:52 - ERROR - stderr - +2025-05-11 09:41:52 - ERROR - stderr - +2025-05-11 09:41:52 - INFO - stdout - {'loss': 0.5094, 'grad_norm': 0.886596143245697, 'learning_rate': 3.422289749975163e-06, 'epoch': 2.21} +2025-05-11 09:41:52 - ERROR - stderr - 74%|███████▎ | 2756/3741 [16:15:59<5:58:04, 21.81s/it] +2025-05-11 09:42:12 - ERROR - stderr - 74%|███████▎ | 2757/3741 [16:16:19<5:48:06, 21.23s/it] +2025-05-11 09:42:12 - ERROR - stderr - +2025-05-11 09:42:12 - ERROR - stderr - +2025-05-11 09:42:12 - INFO - stdout - {'loss': 0.5027, 'grad_norm': 0.8573530912399292, 'learning_rate': 3.415769879578601e-06, 'epoch': 2.21} +2025-05-11 09:42:12 - ERROR - stderr - 74%|███████▎ | 2757/3741 [16:16:19<5:48:06, 21.23s/it] +2025-05-11 09:42:36 - ERROR - stderr - 74%|███████▎ | 2758/3741 [16:16:42<5:59:37, 21.95s/it] +2025-05-11 09:42:36 - ERROR - stderr - +2025-05-11 09:42:36 - ERROR - stderr - +2025-05-11 09:42:36 - INFO - stdout - {'loss': 0.4878, 'grad_norm': 0.8812514543533325, 'learning_rate': 3.4092549462620215e-06, 'epoch': 2.21} +2025-05-11 09:42:36 - ERROR - stderr - 74%|███████▎ | 2758/3741 [16:16:42<5:59:37, 21.95s/it] +2025-05-11 09:42:56 - ERROR - stderr - 74%|███████▍ | 2759/3741 [16:17:02<5:49:09, 21.33s/it] +2025-05-11 09:42:56 - ERROR - stderr - +2025-05-11 09:42:56 - ERROR - stderr - +2025-05-11 09:42:56 - INFO - stdout - {'loss': 0.4836, 'grad_norm': 0.8391367197036743, 'learning_rate': 3.4027449549105353e-06, 'epoch': 2.21} +2025-05-11 09:42:56 - ERROR - stderr - 74%|███████▍ | 2759/3741 [16:17:02<5:49:09, 21.33s/it] +2025-05-11 09:43:18 - ERROR - stderr - 74%|███████▍ | 2760/3741 [16:17:25<5:54:11, 21.66s/it] +2025-05-11 09:43:18 - ERROR - stderr - +2025-05-11 09:43:18 - ERROR - stderr - +2025-05-11 09:43:18 - INFO - stdout - {'loss': 0.4747, 'grad_norm': 0.7842381596565247, 'learning_rate': 3.3962399104055597e-06, 'epoch': 2.21} +2025-05-11 09:43:18 - ERROR - stderr - 74%|███████▍ | 2760/3741 [16:17:25<5:54:11, 21.66s/it] +2025-05-11 09:43:38 - ERROR - stderr - 74%|███████▍ | 2761/3741 [16:17:44<5:43:00, 21.00s/it] +2025-05-11 09:43:38 - ERROR - stderr - +2025-05-11 09:43:38 - ERROR - stderr - +2025-05-11 09:43:38 - INFO - stdout - {'loss': 0.4775, 'grad_norm': 0.8407445549964905, 'learning_rate': 3.3897398176247984e-06, 'epoch': 2.21} +2025-05-11 09:43:38 - ERROR - stderr - 74%|███████▍ | 2761/3741 [16:17:44<5:43:00, 21.00s/it] +2025-05-11 09:43:59 - ERROR - stderr - 74%|███████▍ | 2762/3741 [16:18:06<5:46:13, 21.22s/it] +2025-05-11 09:44:00 - ERROR - stderr - +2025-05-11 09:44:00 - ERROR - stderr - +2025-05-11 09:44:00 - INFO - stdout - {'loss': 0.5205, 'grad_norm': 0.8051870465278625, 'learning_rate': 3.383244681442246e-06, 'epoch': 2.21} +2025-05-11 09:44:00 - ERROR - stderr - 74%|███████▍ | 2762/3741 [16:18:06<5:46:13, 21.22s/it] +2025-05-11 09:44:19 - ERROR - stderr - 74%|███████▍ | 2763/3741 [16:18:26<5:38:55, 20.79s/it] +2025-05-11 09:44:19 - ERROR - stderr - +2025-05-11 09:44:19 - ERROR - stderr - +2025-05-11 09:44:19 - INFO - stdout - {'loss': 0.4794, 'grad_norm': 0.788661539554596, 'learning_rate': 3.376754506728167e-06, 'epoch': 2.22} +2025-05-11 09:44:19 - ERROR - stderr - 74%|███████▍ | 2763/3741 [16:18:26<5:38:55, 20.79s/it] +2025-05-11 09:44:41 - ERROR - stderr - 74%|███████▍ | 2764/3741 [16:18:47<5:43:15, 21.08s/it] +2025-05-11 09:44:41 - ERROR - stderr - +2025-05-11 09:44:41 - ERROR - stderr - +2025-05-11 09:44:41 - INFO - stdout - {'loss': 0.4687, 'grad_norm': 0.8352063298225403, 'learning_rate': 3.370269298349128e-06, 'epoch': 2.22} +2025-05-11 09:44:41 - ERROR - stderr - 74%|███████▍ | 2764/3741 [16:18:47<5:43:15, 21.08s/it] +2025-05-11 09:45:01 - ERROR - stderr - 74%|███████▍ | 2765/3741 [16:19:07<5:35:40, 20.64s/it] +2025-05-11 09:45:01 - ERROR - stderr - +2025-05-11 09:45:01 - ERROR - stderr - +2025-05-11 09:45:01 - INFO - stdout - {'loss': 0.4769, 'grad_norm': 0.8610337376594543, 'learning_rate': 3.363789061167949e-06, 'epoch': 2.22} +2025-05-11 09:45:01 - ERROR - stderr - 74%|███████▍ | 2765/3741 [16:19:07<5:35:40, 20.64s/it] +2025-05-11 09:45:23 - ERROR - stderr - 74%|███████▍ | 2766/3741 [16:19:30<5:46:08, 21.30s/it] +2025-05-11 09:45:24 - ERROR - stderr - +2025-05-11 09:45:24 - ERROR - stderr - +2025-05-11 09:45:24 - INFO - stdout - {'loss': 0.469, 'grad_norm': 0.8776218891143799, 'learning_rate': 3.3573138000437367e-06, 'epoch': 2.22} +2025-05-11 09:45:24 - ERROR - stderr - 74%|███████▍ | 2766/3741 [16:19:30<5:46:08, 21.30s/it] +2025-05-11 09:45:43 - ERROR - stderr - 74%|███████▍ | 2767/3741 [16:19:49<5:37:40, 20.80s/it] +2025-05-11 09:45:43 - ERROR - stderr - +2025-05-11 09:45:43 - ERROR - stderr - +2025-05-11 09:45:43 - INFO - stdout - {'loss': 0.4961, 'grad_norm': 0.8633357882499695, 'learning_rate': 3.3508435198318645e-06, 'epoch': 2.22} +2025-05-11 09:45:43 - ERROR - stderr - 74%|███████▍ | 2767/3741 [16:19:49<5:37:40, 20.80s/it] +2025-05-11 09:46:06 - ERROR - stderr - 74%|███████▍ | 2768/3741 [16:20:12<5:47:10, 21.41s/it] +2025-05-11 09:46:06 - ERROR - stderr - +2025-05-11 09:46:06 - ERROR - stderr - +2025-05-11 09:46:06 - INFO - stdout - {'loss': 0.4973, 'grad_norm': 0.8187628984451294, 'learning_rate': 3.34437822538396e-06, 'epoch': 2.22} +2025-05-11 09:46:06 - ERROR - stderr - 74%|███████▍ | 2768/3741 [16:20:12<5:47:10, 21.41s/it] +2025-05-11 09:46:26 - ERROR - stderr - 74%|███████▍ | 2769/3741 [16:20:32<5:38:07, 20.87s/it] +2025-05-11 09:46:26 - ERROR - stderr - +2025-05-11 09:46:26 - ERROR - stderr - +2025-05-11 09:46:26 - INFO - stdout - {'loss': 0.5006, 'grad_norm': 0.805854082107544, 'learning_rate': 3.337917921547934e-06, 'epoch': 2.22} +2025-05-11 09:46:26 - ERROR - stderr - 74%|███████▍ | 2769/3741 [16:20:32<5:38:07, 20.87s/it] +2025-05-11 09:46:48 - ERROR - stderr - 74%|███████▍ | 2770/3741 [16:20:54<5:44:21, 21.28s/it] +2025-05-11 09:46:48 - ERROR - stderr - +2025-05-11 09:46:48 - ERROR - stderr - +2025-05-11 09:46:48 - INFO - stdout - {'loss': 0.4847, 'grad_norm': 0.8261088132858276, 'learning_rate': 3.3314626131679328e-06, 'epoch': 2.22} +2025-05-11 09:46:48 - ERROR - stderr - 74%|███████▍ | 2770/3741 [16:20:54<5:44:21, 21.28s/it] +2025-05-11 09:47:07 - ERROR - stderr - 74%|███████▍ | 2771/3741 [16:21:14<5:35:44, 20.77s/it] +2025-05-11 09:47:07 - ERROR - stderr - +2025-05-11 09:47:07 - ERROR - stderr - +2025-05-11 09:47:07 - INFO - stdout - {'loss': 0.4876, 'grad_norm': 0.8578211069107056, 'learning_rate': 3.3250123050843696e-06, 'epoch': 2.22} +2025-05-11 09:47:07 - ERROR - stderr - 74%|███████▍ | 2771/3741 [16:21:14<5:35:44, 20.77s/it] +2025-05-11 09:47:27 - ERROR - stderr - 74%|███████▍ | 2772/3741 [16:21:33<5:30:36, 20.47s/it] +2025-05-11 09:47:27 - ERROR - stderr - +2025-05-11 09:47:27 - ERROR - stderr - +2025-05-11 09:47:27 - INFO - stdout - {'loss': 0.4817, 'grad_norm': 0.8757601976394653, 'learning_rate': 3.318567002133909e-06, 'epoch': 2.22} +2025-05-11 09:47:27 - ERROR - stderr - 74%|███████▍ | 2772/3741 [16:21:33<5:30:36, 20.47s/it] +2025-05-11 09:47:47 - ERROR - stderr - 74%|███████▍ | 2773/3741 [16:21:53<5:25:56, 20.20s/it] +2025-05-11 09:47:47 - ERROR - stderr - +2025-05-11 09:47:47 - ERROR - stderr - +2025-05-11 09:47:47 - INFO - stdout - {'loss': 0.4905, 'grad_norm': 0.8412430286407471, 'learning_rate': 3.312126709149447e-06, 'epoch': 2.22} +2025-05-11 09:47:47 - ERROR - stderr - 74%|███████▍ | 2773/3741 [16:21:53<5:25:56, 20.20s/it] +2025-05-11 09:48:08 - ERROR - stderr - 74%|███████▍ | 2774/3741 [16:22:14<5:30:28, 20.51s/it] +2025-05-11 09:48:08 - ERROR - stderr - +2025-05-11 09:48:08 - ERROR - stderr - +2025-05-11 09:48:08 - INFO - stdout - {'loss': 0.5099, 'grad_norm': 0.833476722240448, 'learning_rate': 3.3056914309601483e-06, 'epoch': 2.22} +2025-05-11 09:48:08 - ERROR - stderr - 74%|███████▍ | 2774/3741 [16:22:14<5:30:28, 20.51s/it] +2025-05-11 09:48:29 - ERROR - stderr - 74%|███████▍ | 2775/3741 [16:22:35<5:31:50, 20.61s/it] +2025-05-11 09:48:29 - ERROR - stderr - +2025-05-11 09:48:29 - ERROR - stderr - +2025-05-11 09:48:29 - INFO - stdout - {'loss': 0.4878, 'grad_norm': 0.8237016797065735, 'learning_rate': 3.299261172391399e-06, 'epoch': 2.23} +2025-05-11 09:48:29 - ERROR - stderr - 74%|███████▍ | 2775/3741 [16:22:35<5:31:50, 20.61s/it] +2025-05-11 09:48:50 - ERROR - stderr - 74%|███████▍ | 2776/3741 [16:22:56<5:33:54, 20.76s/it] +2025-05-11 09:48:50 - ERROR - stderr - +2025-05-11 09:48:50 - ERROR - stderr - +2025-05-11 09:48:50 - INFO - stdout - {'loss': 0.5018, 'grad_norm': 0.8863388299942017, 'learning_rate': 3.2928359382648166e-06, 'epoch': 2.23} +2025-05-11 09:48:50 - ERROR - stderr - 74%|███████▍ | 2776/3741 [16:22:56<5:33:54, 20.76s/it] +2025-05-11 09:49:10 - ERROR - stderr - 74%|███████▍ | 2777/3741 [16:23:16<5:29:52, 20.53s/it] +2025-05-11 09:49:10 - ERROR - stderr - +2025-05-11 09:49:10 - ERROR - stderr - +2025-05-11 09:49:10 - INFO - stdout - {'loss': 0.5004, 'grad_norm': 0.9207762479782104, 'learning_rate': 3.286415733398276e-06, 'epoch': 2.23} +2025-05-11 09:49:10 - ERROR - stderr - 74%|███████▍ | 2777/3741 [16:23:16<5:29:52, 20.53s/it] +2025-05-11 09:49:30 - ERROR - stderr - 74%|███████▍ | 2778/3741 [16:23:36<5:27:03, 20.38s/it] +2025-05-11 09:49:30 - ERROR - stderr - +2025-05-11 09:49:30 - ERROR - stderr - +2025-05-11 09:49:30 - INFO - stdout - {'loss': 0.4769, 'grad_norm': 0.8882100582122803, 'learning_rate': 3.280000562605854e-06, 'epoch': 2.23} +2025-05-11 09:49:30 - ERROR - stderr - 74%|███████▍ | 2778/3741 [16:23:36<5:27:03, 20.38s/it] +2025-05-11 09:49:51 - ERROR - stderr - 74%|███████▍ | 2779/3741 [16:23:57<5:27:58, 20.46s/it] +2025-05-11 09:49:51 - ERROR - stderr - +2025-05-11 09:49:51 - ERROR - stderr - +2025-05-11 09:49:51 - INFO - stdout - {'loss': 0.4896, 'grad_norm': 0.8794893622398376, 'learning_rate': 3.2735904306978684e-06, 'epoch': 2.23} +2025-05-11 09:49:51 - ERROR - stderr - 74%|███████▍ | 2779/3741 [16:23:57<5:27:58, 20.46s/it] +2025-05-11 09:50:10 - ERROR - stderr - 74%|███████▍ | 2780/3741 [16:24:17<5:23:46, 20.21s/it] +2025-05-11 09:50:10 - ERROR - stderr - +2025-05-11 09:50:10 - ERROR - stderr - +2025-05-11 09:50:10 - INFO - stdout - {'loss': 0.466, 'grad_norm': 0.8332772850990295, 'learning_rate': 3.2671853424808574e-06, 'epoch': 2.23} +2025-05-11 09:50:10 - ERROR - stderr - 74%|███████▍ | 2780/3741 [16:24:17<5:23:46, 20.21s/it] +2025-05-11 09:50:32 - ERROR - stderr - 74%|███████▍ | 2781/3741 [16:24:38<5:29:32, 20.60s/it] +2025-05-11 09:50:32 - ERROR - stderr - +2025-05-11 09:50:32 - ERROR - stderr - +2025-05-11 09:50:32 - INFO - stdout - {'loss': 0.4785, 'grad_norm': 0.827366292476654, 'learning_rate': 3.2607853027575643e-06, 'epoch': 2.23} +2025-05-11 09:50:32 - ERROR - stderr - 74%|███████▍ | 2781/3741 [16:24:38<5:29:32, 20.60s/it] +2025-05-11 09:50:52 - ERROR - stderr - 74%|███████▍ | 2782/3741 [16:24:58<5:25:29, 20.36s/it] +2025-05-11 09:50:52 - ERROR - stderr - +2025-05-11 09:50:52 - ERROR - stderr - +2025-05-11 09:50:52 - INFO - stdout - {'loss': 0.5128, 'grad_norm': 0.8560011386871338, 'learning_rate': 3.2543903163269697e-06, 'epoch': 2.23} +2025-05-11 09:50:52 - ERROR - stderr - 74%|███████▍ | 2782/3741 [16:24:58<5:25:29, 20.36s/it] +2025-05-11 09:51:13 - ERROR - stderr - 74%|███████▍ | 2783/3741 [16:25:19<5:29:19, 20.63s/it] +2025-05-11 09:51:13 - ERROR - stderr - +2025-05-11 09:51:13 - ERROR - stderr - +2025-05-11 09:51:13 - INFO - stdout - {'loss': 0.4706, 'grad_norm': 0.8379847407341003, 'learning_rate': 3.2480003879842424e-06, 'epoch': 2.23} +2025-05-11 09:51:13 - ERROR - stderr - 74%|███████▍ | 2783/3741 [16:25:19<5:29:19, 20.63s/it] +2025-05-11 09:51:32 - ERROR - stderr - 74%|███████▍ | 2784/3741 [16:25:39<5:24:22, 20.34s/it] +2025-05-11 09:51:32 - ERROR - stderr - +2025-05-11 09:51:32 - ERROR - stderr - +2025-05-11 09:51:32 - INFO - stdout - {'loss': 0.4936, 'grad_norm': 0.8355280756950378, 'learning_rate': 3.2416155225207726e-06, 'epoch': 2.23} +2025-05-11 09:51:32 - ERROR - stderr - 74%|███████▍ | 2784/3741 [16:25:39<5:24:22, 20.34s/it] +2025-05-11 09:51:55 - ERROR - stderr - 74%|███████▍ | 2785/3741 [16:26:01<5:33:27, 20.93s/it] +2025-05-11 09:51:55 - ERROR - stderr - +2025-05-11 09:51:55 - ERROR - stderr - +2025-05-11 09:51:55 - INFO - stdout - {'loss': 0.4908, 'grad_norm': 0.8986692428588867, 'learning_rate': 3.2352357247241517e-06, 'epoch': 2.23} +2025-05-11 09:51:55 - ERROR - stderr - 74%|███████▍ | 2785/3741 [16:26:01<5:33:27, 20.93s/it] +2025-05-11 09:52:15 - ERROR - stderr - 74%|███████▍ | 2786/3741 [16:26:21<5:27:42, 20.59s/it] +2025-05-11 09:52:15 - ERROR - stderr - +2025-05-11 09:52:15 - ERROR - stderr - +2025-05-11 09:52:15 - INFO - stdout - {'loss': 0.4813, 'grad_norm': 0.8830366730690002, 'learning_rate': 3.2288609993781606e-06, 'epoch': 2.23} +2025-05-11 09:52:15 - ERROR - stderr - 74%|███████▍ | 2786/3741 [16:26:21<5:27:42, 20.59s/it] +2025-05-11 09:52:36 - ERROR - stderr - 74%|███████▍ | 2787/3741 [16:26:43<5:32:50, 20.93s/it] +2025-05-11 09:52:36 - ERROR - stderr - +2025-05-11 09:52:36 - ERROR - stderr - +2025-05-11 09:52:36 - INFO - stdout - {'loss': 0.4854, 'grad_norm': 0.8496928811073303, 'learning_rate': 3.2224913512627976e-06, 'epoch': 2.23} +2025-05-11 09:52:36 - ERROR - stderr - 74%|███████▍ | 2787/3741 [16:26:43<5:32:50, 20.93s/it] +2025-05-11 09:52:56 - ERROR - stderr - 75%|███████▍ | 2788/3741 [16:27:03<5:27:41, 20.63s/it] +2025-05-11 09:52:56 - ERROR - stderr - +2025-05-11 09:52:56 - ERROR - stderr - +2025-05-11 09:52:56 - INFO - stdout - {'loss': 0.5021, 'grad_norm': 0.8446789979934692, 'learning_rate': 3.2161267851542333e-06, 'epoch': 2.24} +2025-05-11 09:52:56 - ERROR - stderr - 75%|███████▍ | 2788/3741 [16:27:03<5:27:41, 20.63s/it] +2025-05-11 09:53:19 - ERROR - stderr - 75%|███████▍ | 2789/3741 [16:27:25<5:38:21, 21.33s/it] +2025-05-11 09:53:19 - ERROR - stderr - +2025-05-11 09:53:19 - ERROR - stderr - +2025-05-11 09:53:19 - INFO - stdout - {'loss': 0.4641, 'grad_norm': 0.8232909440994263, 'learning_rate': 3.2097673058248378e-06, 'epoch': 2.24} +2025-05-11 09:53:19 - ERROR - stderr - 75%|███████▍ | 2789/3741 [16:27:25<5:38:21, 21.33s/it] +2025-05-11 09:53:39 - ERROR - stderr - 75%|███████▍ | 2790/3741 [16:27:45<5:30:57, 20.88s/it] +2025-05-11 09:53:39 - ERROR - stderr - +2025-05-11 09:53:39 - ERROR - stderr - +2025-05-11 09:53:39 - INFO - stdout - {'loss': 0.4796, 'grad_norm': 0.8378138542175293, 'learning_rate': 3.2034129180431705e-06, 'epoch': 2.24} +2025-05-11 09:53:39 - ERROR - stderr - 75%|███████▍ | 2790/3741 [16:27:45<5:30:57, 20.88s/it] +2025-05-11 09:54:01 - ERROR - stderr - 75%|███████▍ | 2791/3741 [16:28:08<5:38:04, 21.35s/it] +2025-05-11 09:54:01 - ERROR - stderr - +2025-05-11 09:54:01 - ERROR - stderr - +2025-05-11 09:54:01 - INFO - stdout - {'loss': 0.4931, 'grad_norm': 0.8232404589653015, 'learning_rate': 3.1970636265739595e-06, 'epoch': 2.24} +2025-05-11 09:54:01 - ERROR - stderr - 75%|███████▍ | 2791/3741 [16:28:08<5:38:04, 21.35s/it] +2025-05-11 09:54:21 - ERROR - stderr - 75%|███████▍ | 2792/3741 [16:28:28<5:31:10, 20.94s/it] +2025-05-11 09:54:21 - ERROR - stderr - +2025-05-11 09:54:21 - ERROR - stderr - +2025-05-11 09:54:21 - INFO - stdout - {'loss': 0.4851, 'grad_norm': 0.8911257982254028, 'learning_rate': 3.1907194361781234e-06, 'epoch': 2.24} +2025-05-11 09:54:21 - ERROR - stderr - 75%|███████▍ | 2792/3741 [16:28:28<5:31:10, 20.94s/it] +2025-05-11 09:54:44 - ERROR - stderr - 75%|███████▍ | 2793/3741 [16:28:51<5:39:48, 21.51s/it] +2025-05-11 09:54:44 - ERROR - stderr - +2025-05-11 09:54:44 - ERROR - stderr - +2025-05-11 09:54:44 - INFO - stdout - {'loss': 0.4906, 'grad_norm': 0.8609054684638977, 'learning_rate': 3.1843803516127537e-06, 'epoch': 2.24} +2025-05-11 09:54:44 - ERROR - stderr - 75%|███████▍ | 2793/3741 [16:28:51<5:39:48, 21.51s/it] +2025-05-11 09:55:04 - ERROR - stderr - 75%|███████▍ | 2794/3741 [16:29:11<5:32:10, 21.05s/it] +2025-05-11 09:55:04 - ERROR - stderr - +2025-05-11 09:55:04 - ERROR - stderr - +2025-05-11 09:55:04 - INFO - stdout - {'loss': 0.4871, 'grad_norm': 0.869034469127655, 'learning_rate': 3.178046377631109e-06, 'epoch': 2.24} +2025-05-11 09:55:04 - ERROR - stderr - 75%|███████▍ | 2794/3741 [16:29:11<5:32:10, 21.05s/it] +2025-05-11 09:55:27 - ERROR - stderr - 75%|███████▍ | 2795/3741 [16:29:34<5:42:07, 21.70s/it] +2025-05-11 09:55:27 - ERROR - stderr - +2025-05-11 09:55:27 - ERROR - stderr - +2025-05-11 09:55:27 - INFO - stdout - {'loss': 0.4746, 'grad_norm': 0.8282964825630188, 'learning_rate': 3.1717175189826246e-06, 'epoch': 2.24} +2025-05-11 09:55:27 - ERROR - stderr - 75%|███████▍ | 2795/3741 [16:29:34<5:42:07, 21.70s/it] +2025-05-11 09:55:47 - ERROR - stderr - 75%|███████▍ | 2796/3741 [16:29:53<5:31:38, 21.06s/it] +2025-05-11 09:55:47 - ERROR - stderr - +2025-05-11 09:55:47 - ERROR - stderr - +2025-05-11 09:55:47 - INFO - stdout - {'loss': 0.4599, 'grad_norm': 0.7694876194000244, 'learning_rate': 3.1653937804128863e-06, 'epoch': 2.24} +2025-05-11 09:55:47 - ERROR - stderr - 75%|███████▍ | 2796/3741 [16:29:53<5:31:38, 21.06s/it] +2025-05-11 09:56:09 - ERROR - stderr - 75%|███████▍ | 2797/3741 [16:30:15<5:36:30, 21.39s/it] +2025-05-11 09:56:09 - ERROR - stderr - +2025-05-11 09:56:09 - ERROR - stderr - +2025-05-11 09:56:09 - INFO - stdout - {'loss': 0.465, 'grad_norm': 0.8961299657821655, 'learning_rate': 3.159075166663653e-06, 'epoch': 2.24} +2025-05-11 09:56:09 - ERROR - stderr - 75%|███████▍ | 2797/3741 [16:30:15<5:36:30, 21.39s/it] +2025-05-11 09:56:29 - ERROR - stderr - 75%|███████▍ | 2798/3741 [16:30:35<5:27:21, 20.83s/it] +2025-05-11 09:56:29 - ERROR - stderr - +2025-05-11 09:56:29 - ERROR - stderr - +2025-05-11 09:56:29 - INFO - stdout - {'loss': 0.4863, 'grad_norm': 0.8370431661605835, 'learning_rate': 3.1527616824728356e-06, 'epoch': 2.24} +2025-05-11 09:56:29 - ERROR - stderr - 75%|███████▍ | 2798/3741 [16:30:35<5:27:21, 20.83s/it] +2025-05-11 09:56:51 - ERROR - stderr - 75%|███████▍ | 2799/3741 [16:30:58<5:34:59, 21.34s/it] +2025-05-11 09:56:51 - ERROR - stderr - +2025-05-11 09:56:51 - ERROR - stderr - +2025-05-11 09:56:51 - INFO - stdout - {'loss': 0.4786, 'grad_norm': 0.8903129696846008, 'learning_rate': 3.1464533325744997e-06, 'epoch': 2.24} +2025-05-11 09:56:51 - ERROR - stderr - 75%|███████▍ | 2799/3741 [16:30:58<5:34:59, 21.34s/it] +2025-05-11 09:57:11 - ERROR - stderr - 75%|███████▍ | 2800/3741 [16:31:17<5:26:56, 20.85s/it] +2025-05-11 09:57:11 - ERROR - stderr - +2025-05-11 09:57:11 - ERROR - stderr - +2025-05-11 09:57:11 - INFO - stdout - {'loss': 0.5046, 'grad_norm': 0.8716090321540833, 'learning_rate': 3.140150121698864e-06, 'epoch': 2.25} +2025-05-11 09:57:11 - ERROR - stderr - 75%|███████▍ | 2800/3741 [16:31:17<5:26:56, 20.85s/it] +2025-05-11 09:57:33 - ERROR - stderr - 75%|███████▍ | 2801/3741 [16:31:39<5:33:15, 21.27s/it] +2025-05-11 09:57:33 - ERROR - stderr - +2025-05-11 09:57:33 - ERROR - stderr - +2025-05-11 09:57:33 - INFO - stdout - {'loss': 0.4695, 'grad_norm': 0.8388259410858154, 'learning_rate': 3.1338520545722852e-06, 'epoch': 2.25} +2025-05-11 09:57:33 - ERROR - stderr - 75%|███████▍ | 2801/3741 [16:31:39<5:33:15, 21.27s/it] +2025-05-11 09:57:53 - ERROR - stderr - 75%|███████▍ | 2802/3741 [16:31:59<5:25:34, 20.80s/it] +2025-05-11 09:57:53 - ERROR - stderr - +2025-05-11 09:57:53 - ERROR - stderr - +2025-05-11 09:57:53 - INFO - stdout - {'loss': 0.497, 'grad_norm': 0.8695220351219177, 'learning_rate': 3.1275591359172698e-06, 'epoch': 2.25} +2025-05-11 09:57:53 - ERROR - stderr - 75%|███████▍ | 2802/3741 [16:31:59<5:25:34, 20.80s/it] +2025-05-11 09:58:15 - ERROR - stderr - 75%|███████▍ | 2803/3741 [16:32:21<5:32:07, 21.24s/it] +2025-05-11 09:58:15 - ERROR - stderr - +2025-05-11 09:58:15 - ERROR - stderr - +2025-05-11 09:58:15 - INFO - stdout - {'loss': 0.5044, 'grad_norm': 0.8859379291534424, 'learning_rate': 3.1212713704524644e-06, 'epoch': 2.25} +2025-05-11 09:58:15 - ERROR - stderr - 75%|███████▍ | 2803/3741 [16:32:21<5:32:07, 21.24s/it] +2025-05-11 09:58:35 - ERROR - stderr - 75%|███████▍ | 2804/3741 [16:32:41<5:23:12, 20.70s/it] +2025-05-11 09:58:35 - ERROR - stderr - +2025-05-11 09:58:35 - ERROR - stderr - +2025-05-11 09:58:35 - INFO - stdout - {'loss': 0.4905, 'grad_norm': 0.8411895632743835, 'learning_rate': 3.114988762892649e-06, 'epoch': 2.25} +2025-05-11 09:58:35 - ERROR - stderr - 75%|███████▍ | 2804/3741 [16:32:41<5:23:12, 20.70s/it] +2025-05-11 09:58:57 - ERROR - stderr - 75%|███████▍ | 2805/3741 [16:33:03<5:30:25, 21.18s/it] +2025-05-11 09:58:57 - ERROR - stderr - +2025-05-11 09:58:57 - ERROR - stderr - +2025-05-11 09:58:57 - INFO - stdout - {'loss': 0.5039, 'grad_norm': 0.874116837978363, 'learning_rate': 3.1087113179487394e-06, 'epoch': 2.25} +2025-05-11 09:58:57 - ERROR - stderr - 75%|███████▍ | 2805/3741 [16:33:03<5:30:25, 21.18s/it] +2025-05-11 09:59:16 - ERROR - stderr - 75%|███████▌ | 2806/3741 [16:33:23<5:22:24, 20.69s/it] +2025-05-11 09:59:16 - ERROR - stderr - +2025-05-11 09:59:16 - ERROR - stderr - +2025-05-11 09:59:16 - INFO - stdout - {'loss': 0.4955, 'grad_norm': 0.8561678528785706, 'learning_rate': 3.102439040327773e-06, 'epoch': 2.25} +2025-05-11 09:59:16 - ERROR - stderr - 75%|███████▌ | 2806/3741 [16:33:23<5:22:24, 20.69s/it] +2025-05-11 09:59:39 - ERROR - stderr - 75%|███████▌ | 2807/3741 [16:33:45<5:30:34, 21.24s/it] +2025-05-11 09:59:39 - ERROR - stderr - +2025-05-11 09:59:39 - ERROR - stderr - +2025-05-11 09:59:39 - INFO - stdout - {'loss': 0.4832, 'grad_norm': 0.8340683579444885, 'learning_rate': 3.096171934732918e-06, 'epoch': 2.25} +2025-05-11 09:59:39 - ERROR - stderr - 75%|███████▌ | 2807/3741 [16:33:45<5:30:34, 21.24s/it] +2025-05-11 09:59:58 - ERROR - stderr - 75%|███████▌ | 2808/3741 [16:34:05<5:22:15, 20.72s/it] +2025-05-11 09:59:58 - ERROR - stderr - +2025-05-11 09:59:58 - ERROR - stderr - +2025-05-11 09:59:58 - INFO - stdout - {'loss': 0.4948, 'grad_norm': 0.8148999810218811, 'learning_rate': 3.0899100058634646e-06, 'epoch': 2.25} +2025-05-11 09:59:58 - ERROR - stderr - 75%|███████▌ | 2808/3741 [16:34:05<5:22:15, 20.72s/it] +2025-05-11 10:00:21 - ERROR - stderr - 75%|███████▌ | 2809/3741 [16:34:28<5:31:41, 21.35s/it] +2025-05-11 10:00:21 - ERROR - stderr - +2025-05-11 10:00:21 - ERROR - stderr - +2025-05-11 10:00:21 - INFO - stdout - {'loss': 0.4748, 'grad_norm': 0.827923595905304, 'learning_rate': 3.0836532584148237e-06, 'epoch': 2.25} +2025-05-11 10:00:21 - ERROR - stderr - 75%|███████▌ | 2809/3741 [16:34:28<5:31:41, 21.35s/it] +2025-05-11 10:00:41 - ERROR - stderr - 75%|███████▌ | 2810/3741 [16:34:47<5:23:11, 20.83s/it] +2025-05-11 10:00:41 - ERROR - stderr - +2025-05-11 10:00:41 - ERROR - stderr - +2025-05-11 10:00:41 - INFO - stdout - {'loss': 0.5062, 'grad_norm': 0.852433443069458, 'learning_rate': 3.0774016970785116e-06, 'epoch': 2.25} +2025-05-11 10:00:41 - ERROR - stderr - 75%|███████▌ | 2810/3741 [16:34:47<5:23:11, 20.83s/it] +2025-05-11 10:01:03 - ERROR - stderr - 75%|███████▌ | 2811/3741 [16:35:10<5:30:11, 21.30s/it] +2025-05-11 10:01:03 - ERROR - stderr - +2025-05-11 10:01:03 - ERROR - stderr - +2025-05-11 10:01:03 - INFO - stdout - {'loss': 0.4702, 'grad_norm': 0.8013387322425842, 'learning_rate': 3.0711553265421645e-06, 'epoch': 2.25} +2025-05-11 10:01:03 - ERROR - stderr - 75%|███████▌ | 2811/3741 [16:35:10<5:30:11, 21.30s/it] +2025-05-11 10:01:23 - ERROR - stderr - 75%|███████▌ | 2812/3741 [16:35:29<5:21:39, 20.77s/it] +2025-05-11 10:01:23 - ERROR - stderr - +2025-05-11 10:01:23 - ERROR - stderr - +2025-05-11 10:01:23 - INFO - stdout - {'loss': 0.4855, 'grad_norm': 0.8474105596542358, 'learning_rate': 3.0649141514895243e-06, 'epoch': 2.26} +2025-05-11 10:01:23 - ERROR - stderr - 75%|███████▌ | 2812/3741 [16:35:29<5:21:39, 20.77s/it] +2025-05-11 10:01:45 - ERROR - stderr - 75%|███████▌ | 2813/3741 [16:35:51<5:28:30, 21.24s/it] +2025-05-11 10:01:45 - ERROR - stderr - +2025-05-11 10:01:45 - ERROR - stderr - +2025-05-11 10:01:45 - INFO - stdout - {'loss': 0.503, 'grad_norm': 0.844879150390625, 'learning_rate': 3.058678176600436e-06, 'epoch': 2.26} +2025-05-11 10:01:45 - ERROR - stderr - 75%|███████▌ | 2813/3741 [16:35:51<5:28:30, 21.24s/it] +2025-05-11 10:02:05 - ERROR - stderr - 75%|███████▌ | 2814/3741 [16:36:11<5:21:39, 20.82s/it] +2025-05-11 10:02:05 - ERROR - stderr - +2025-05-11 10:02:05 - ERROR - stderr - +2025-05-11 10:02:05 - INFO - stdout - {'loss': 0.4956, 'grad_norm': 0.8638342022895813, 'learning_rate': 3.0524474065508492e-06, 'epoch': 2.26} +2025-05-11 10:02:05 - ERROR - stderr - 75%|███████▌ | 2814/3741 [16:36:11<5:21:39, 20.82s/it] +2025-05-11 10:02:28 - ERROR - stderr - 75%|███████▌ | 2815/3741 [16:36:35<5:32:29, 21.54s/it] +2025-05-11 10:02:28 - ERROR - stderr - +2025-05-11 10:02:28 - ERROR - stderr - +2025-05-11 10:02:28 - INFO - stdout - {'loss': 0.5089, 'grad_norm': 0.8933137655258179, 'learning_rate': 3.0462218460128e-06, 'epoch': 2.26} +2025-05-11 10:02:28 - ERROR - stderr - 75%|███████▌ | 2815/3741 [16:36:35<5:32:29, 21.54s/it] +2025-05-11 10:02:48 - ERROR - stderr - 75%|███████▌ | 2816/3741 [16:36:54<5:22:31, 20.92s/it] +2025-05-11 10:02:48 - ERROR - stderr - +2025-05-11 10:02:48 - ERROR - stderr - +2025-05-11 10:02:48 - INFO - stdout - {'loss': 0.4774, 'grad_norm': 0.8648194074630737, 'learning_rate': 3.0400014996544314e-06, 'epoch': 2.26} +2025-05-11 10:02:48 - ERROR - stderr - 75%|███████▌ | 2816/3741 [16:36:54<5:22:31, 20.92s/it] +2025-05-11 10:03:11 - ERROR - stderr - 75%|███████▌ | 2817/3741 [16:37:17<5:33:13, 21.64s/it] +2025-05-11 10:03:11 - ERROR - stderr - +2025-05-11 10:03:11 - ERROR - stderr - +2025-05-11 10:03:11 - INFO - stdout - {'loss': 0.4732, 'grad_norm': 0.8064270615577698, 'learning_rate': 3.0337863721399694e-06, 'epoch': 2.26} +2025-05-11 10:03:11 - ERROR - stderr - 75%|███████▌ | 2817/3741 [16:37:17<5:33:13, 21.64s/it] +2025-05-11 10:03:31 - ERROR - stderr - 75%|███████▌ | 2818/3741 [16:37:37<5:23:42, 21.04s/it] +2025-05-11 10:03:31 - ERROR - stderr - +2025-05-11 10:03:31 - ERROR - stderr - +2025-05-11 10:03:31 - INFO - stdout - {'loss': 0.4763, 'grad_norm': 0.8423921465873718, 'learning_rate': 3.0275764681297292e-06, 'epoch': 2.26} +2025-05-11 10:03:31 - ERROR - stderr - 75%|███████▌ | 2818/3741 [16:37:37<5:23:42, 21.04s/it] +2025-05-11 10:03:54 - ERROR - stderr - 75%|███████▌ | 2819/3741 [16:38:00<5:31:48, 21.59s/it] +2025-05-11 10:03:54 - ERROR - stderr - +2025-05-11 10:03:54 - ERROR - stderr - +2025-05-11 10:03:54 - INFO - stdout - {'loss': 0.4917, 'grad_norm': 0.8474961519241333, 'learning_rate': 3.02137179228011e-06, 'epoch': 2.26} +2025-05-11 10:03:54 - ERROR - stderr - 75%|███████▌ | 2819/3741 [16:38:00<5:31:48, 21.59s/it] +2025-05-11 10:04:13 - ERROR - stderr - 75%|███████▌ | 2820/3741 [16:38:20<5:23:19, 21.06s/it] +2025-05-11 10:04:13 - ERROR - stderr - +2025-05-11 10:04:13 - ERROR - stderr - +2025-05-11 10:04:13 - INFO - stdout - {'loss': 0.4909, 'grad_norm': 0.8607854843139648, 'learning_rate': 3.0151723492435837e-06, 'epoch': 2.26} +2025-05-11 10:04:13 - ERROR - stderr - 75%|███████▌ | 2820/3741 [16:38:20<5:23:19, 21.06s/it] +2025-05-11 10:04:37 - ERROR - stderr - 75%|███████▌ | 2821/3741 [16:38:43<5:34:28, 21.81s/it] +2025-05-11 10:04:37 - ERROR - stderr - +2025-05-11 10:04:37 - ERROR - stderr - +2025-05-11 10:04:37 - INFO - stdout - {'loss': 0.48, 'grad_norm': 0.8570009469985962, 'learning_rate': 3.008978143668707e-06, 'epoch': 2.26} +2025-05-11 10:04:37 - ERROR - stderr - 75%|███████▌ | 2821/3741 [16:38:43<5:34:28, 21.81s/it] +2025-05-11 10:04:57 - ERROR - stderr - 75%|███████▌ | 2822/3741 [16:39:03<5:26:42, 21.33s/it] +2025-05-11 10:04:57 - ERROR - stderr - +2025-05-11 10:04:57 - ERROR - stderr - +2025-05-11 10:04:57 - INFO - stdout - {'loss': 0.4902, 'grad_norm': 0.8390832543373108, 'learning_rate': 3.00278918020011e-06, 'epoch': 2.26} +2025-05-11 10:04:57 - ERROR - stderr - 75%|███████▌ | 2822/3741 [16:39:03<5:26:42, 21.33s/it] +2025-05-11 10:05:20 - ERROR - stderr - 75%|███████▌ | 2823/3741 [16:39:27<5:34:40, 21.87s/it] +2025-05-11 10:05:20 - ERROR - stderr - +2025-05-11 10:05:20 - ERROR - stderr - +2025-05-11 10:05:20 - INFO - stdout - {'loss': 0.4789, 'grad_norm': 0.8215196132659912, 'learning_rate': 2.9966054634784756e-06, 'epoch': 2.26} +2025-05-11 10:05:20 - ERROR - stderr - 75%|███████▌ | 2823/3741 [16:39:27<5:34:40, 21.87s/it] +2025-05-11 10:05:40 - ERROR - stderr - 75%|███████▌ | 2824/3741 [16:39:46<5:24:22, 21.22s/it] +2025-05-11 10:05:40 - ERROR - stderr - +2025-05-11 10:05:40 - ERROR - stderr - +2025-05-11 10:05:40 - INFO - stdout - {'loss': 0.4532, 'grad_norm': 0.836609423160553, 'learning_rate': 2.990426998140582e-06, 'epoch': 2.26} +2025-05-11 10:05:40 - ERROR - stderr - 75%|███████▌ | 2824/3741 [16:39:46<5:24:22, 21.22s/it] +2025-05-11 10:06:03 - ERROR - stderr - 76%|███████▌ | 2825/3741 [16:40:09<5:32:11, 21.76s/it] +2025-05-11 10:06:03 - ERROR - stderr - +2025-05-11 10:06:03 - ERROR - stderr - +2025-05-11 10:06:03 - INFO - stdout - {'loss': 0.4777, 'grad_norm': 0.8278366923332214, 'learning_rate': 2.9842537888192414e-06, 'epoch': 2.27} +2025-05-11 10:06:03 - ERROR - stderr - 76%|███████▌ | 2825/3741 [16:40:09<5:32:11, 21.76s/it] +2025-05-11 10:06:23 - ERROR - stderr - 76%|███████▌ | 2826/3741 [16:40:29<5:22:08, 21.12s/it] +2025-05-11 10:06:23 - ERROR - stderr - +2025-05-11 10:06:23 - ERROR - stderr - +2025-05-11 10:06:23 - INFO - stdout - {'loss': 0.476, 'grad_norm': 0.8481134176254272, 'learning_rate': 2.97808584014334e-06, 'epoch': 2.27} +2025-05-11 10:06:23 - ERROR - stderr - 76%|███████▌ | 2826/3741 [16:40:29<5:22:08, 21.12s/it] +2025-05-11 10:06:46 - ERROR - stderr - 76%|███████▌ | 2827/3741 [16:40:52<5:30:31, 21.70s/it] +2025-05-11 10:06:46 - ERROR - stderr - +2025-05-11 10:06:46 - ERROR - stderr - +2025-05-11 10:06:46 - INFO - stdout - {'loss': 0.4861, 'grad_norm': 0.8720846176147461, 'learning_rate': 2.9719231567378182e-06, 'epoch': 2.27} +2025-05-11 10:06:46 - ERROR - stderr - 76%|███████▌ | 2827/3741 [16:40:52<5:30:31, 21.70s/it] +2025-05-11 10:07:06 - ERROR - stderr - 76%|███████▌ | 2828/3741 [16:41:12<5:21:49, 21.15s/it] +2025-05-11 10:07:06 - ERROR - stderr - +2025-05-11 10:07:06 - ERROR - stderr - +2025-05-11 10:07:06 - INFO - stdout - {'loss': 0.4858, 'grad_norm': 0.8726981282234192, 'learning_rate': 2.9657657432236573e-06, 'epoch': 2.27} +2025-05-11 10:07:06 - ERROR - stderr - 76%|███████▌ | 2828/3741 [16:41:12<5:21:49, 21.15s/it] +2025-05-11 10:07:27 - ERROR - stderr - 76%|███████▌ | 2829/3741 [16:41:33<5:21:54, 21.18s/it] +2025-05-11 10:07:27 - ERROR - stderr - +2025-05-11 10:07:27 - ERROR - stderr - +2025-05-11 10:07:27 - INFO - stdout - {'loss': 0.5009, 'grad_norm': 0.9136954545974731, 'learning_rate': 2.959613604217908e-06, 'epoch': 2.27} +2025-05-11 10:07:27 - ERROR - stderr - 76%|███████▌ | 2829/3741 [16:41:33<5:21:54, 21.18s/it] +2025-05-11 10:07:47 - ERROR - stderr - 76%|███████▌ | 2830/3741 [16:41:53<5:15:43, 20.79s/it] +2025-05-11 10:07:47 - ERROR - stderr - +2025-05-11 10:07:47 - ERROR - stderr - +2025-05-11 10:07:47 - INFO - stdout - {'loss': 0.4844, 'grad_norm': 0.8403001427650452, 'learning_rate': 2.953466744333644e-06, 'epoch': 2.27} +2025-05-11 10:07:47 - ERROR - stderr - 76%|███████▌ | 2830/3741 [16:41:53<5:15:43, 20.79s/it] +2025-05-11 10:08:07 - ERROR - stderr - 76%|███████▌ | 2831/3741 [16:42:14<5:14:17, 20.72s/it] +2025-05-11 10:08:07 - ERROR - stderr - +2025-05-11 10:08:07 - ERROR - stderr - +2025-05-11 10:08:07 - INFO - stdout - {'loss': 0.4881, 'grad_norm': 0.8584656715393066, 'learning_rate': 2.947325168179994e-06, 'epoch': 2.27} +2025-05-11 10:08:07 - ERROR - stderr - 76%|███████▌ | 2831/3741 [16:42:14<5:14:17, 20.72s/it] +2025-05-11 10:08:27 - ERROR - stderr - 76%|███████▌ | 2832/3741 [16:42:33<5:10:01, 20.46s/it] +2025-05-11 10:08:27 - ERROR - stderr - +2025-05-11 10:08:27 - ERROR - stderr - +2025-05-11 10:08:27 - INFO - stdout - {'loss': 0.4938, 'grad_norm': 0.8448304533958435, 'learning_rate': 2.9411888803621237e-06, 'epoch': 2.27} +2025-05-11 10:08:27 - ERROR - stderr - 76%|███████▌ | 2832/3741 [16:42:33<5:10:01, 20.46s/it] +2025-05-11 10:08:47 - ERROR - stderr - 76%|███████▌ | 2833/3741 [16:42:54<5:08:33, 20.39s/it] +2025-05-11 10:08:47 - ERROR - stderr - +2025-05-11 10:08:47 - ERROR - stderr - +2025-05-11 10:08:47 - INFO - stdout - {'loss': 0.5019, 'grad_norm': 0.8816596269607544, 'learning_rate': 2.9350578854812194e-06, 'epoch': 2.27} +2025-05-11 10:08:47 - ERROR - stderr - 76%|███████▌ | 2833/3741 [16:42:54<5:08:33, 20.39s/it] +2025-05-11 10:09:07 - ERROR - stderr - 76%|███████▌ | 2834/3741 [16:43:14<5:06:40, 20.29s/it] +2025-05-11 10:09:07 - ERROR - stderr - +2025-05-11 10:09:07 - ERROR - stderr - +2025-05-11 10:09:07 - INFO - stdout - {'loss': 0.4926, 'grad_norm': 0.8737924098968506, 'learning_rate': 2.9289321881345257e-06, 'epoch': 2.27} +2025-05-11 10:09:07 - ERROR - stderr - 76%|███████▌ | 2834/3741 [16:43:14<5:06:40, 20.29s/it] +2025-05-11 10:09:27 - ERROR - stderr - 76%|███████▌ | 2835/3741 [16:43:34<5:04:34, 20.17s/it] +2025-05-11 10:09:27 - ERROR - stderr - +2025-05-11 10:09:27 - ERROR - stderr - +2025-05-11 10:09:27 - INFO - stdout - {'loss': 0.4696, 'grad_norm': 0.8343014121055603, 'learning_rate': 2.922811792915291e-06, 'epoch': 2.27} +2025-05-11 10:09:27 - ERROR - stderr - 76%|███████▌ | 2835/3741 [16:43:34<5:04:34, 20.17s/it] +2025-05-11 10:09:47 - ERROR - stderr - 76%|███████▌ | 2836/3741 [16:43:54<5:04:05, 20.16s/it] +2025-05-11 10:09:47 - ERROR - stderr - +2025-05-11 10:09:47 - ERROR - stderr - +2025-05-11 10:09:47 - INFO - stdout - {'loss': 0.4689, 'grad_norm': 0.8349881768226624, 'learning_rate': 2.916696704412789e-06, 'epoch': 2.27} +2025-05-11 10:09:47 - ERROR - stderr - 76%|███████▌ | 2836/3741 [16:43:54<5:04:05, 20.16s/it] +2025-05-11 10:10:07 - ERROR - stderr - 76%|███████▌ | 2837/3741 [16:44:13<5:01:20, 20.00s/it] +2025-05-11 10:10:07 - ERROR - stderr - +2025-05-11 10:10:07 - ERROR - stderr - +2025-05-11 10:10:07 - INFO - stdout - {'loss': 0.5054, 'grad_norm': 0.8880947232246399, 'learning_rate': 2.9105869272123366e-06, 'epoch': 2.28} +2025-05-11 10:10:07 - ERROR - stderr - 76%|███████▌ | 2837/3741 [16:44:13<5:01:20, 20.00s/it] +2025-05-11 10:10:28 - ERROR - stderr - 76%|███████▌ | 2838/3741 [16:44:34<5:05:36, 20.31s/it] +2025-05-11 10:10:28 - ERROR - stderr - +2025-05-11 10:10:28 - ERROR - stderr - +2025-05-11 10:10:28 - INFO - stdout - {'loss': 0.4892, 'grad_norm': 0.8601458072662354, 'learning_rate': 2.9044824658952407e-06, 'epoch': 2.28} +2025-05-11 10:10:28 - ERROR - stderr - 76%|███████▌ | 2838/3741 [16:44:34<5:05:36, 20.31s/it] +2025-05-11 10:10:48 - ERROR - stderr - 76%|███████▌ | 2839/3741 [16:44:54<5:02:18, 20.11s/it] +2025-05-11 10:10:48 - ERROR - stderr - +2025-05-11 10:10:48 - ERROR - stderr - +2025-05-11 10:10:48 - INFO - stdout - {'loss': 0.48, 'grad_norm': 0.8508468270301819, 'learning_rate': 2.898383325038838e-06, 'epoch': 2.28} +2025-05-11 10:10:48 - ERROR - stderr - 76%|███████▌ | 2839/3741 [16:44:54<5:02:18, 20.11s/it] +2025-05-11 10:11:09 - ERROR - stderr - 76%|███████▌ | 2840/3741 [16:45:16<5:08:53, 20.57s/it] +2025-05-11 10:11:09 - ERROR - stderr - +2025-05-11 10:11:09 - ERROR - stderr - +2025-05-11 10:11:09 - INFO - stdout - {'loss': 0.4873, 'grad_norm': 0.8209986090660095, 'learning_rate': 2.8922895092164773e-06, 'epoch': 2.28} +2025-05-11 10:11:09 - ERROR - stderr - 76%|███████▌ | 2840/3741 [16:45:16<5:08:53, 20.57s/it] +2025-05-11 10:11:29 - ERROR - stderr - 76%|███████▌ | 2841/3741 [16:45:36<5:05:36, 20.37s/it] +2025-05-11 10:11:29 - ERROR - stderr - +2025-05-11 10:11:29 - ERROR - stderr - +2025-05-11 10:11:29 - INFO - stdout - {'loss': 0.4926, 'grad_norm': 0.820483922958374, 'learning_rate': 2.886201022997497e-06, 'epoch': 2.28} +2025-05-11 10:11:29 - ERROR - stderr - 76%|███████▌ | 2841/3741 [16:45:36<5:05:36, 20.37s/it] +2025-05-11 10:11:51 - ERROR - stderr - 76%|███████▌ | 2842/3741 [16:45:57<5:11:59, 20.82s/it] +2025-05-11 10:11:51 - ERROR - stderr - +2025-05-11 10:11:51 - ERROR - stderr - +2025-05-11 10:11:51 - INFO - stdout - {'loss': 0.4753, 'grad_norm': 0.8504437208175659, 'learning_rate': 2.8801178709472645e-06, 'epoch': 2.28} +2025-05-11 10:11:51 - ERROR - stderr - 76%|███████▌ | 2842/3741 [16:45:57<5:11:59, 20.82s/it] +2025-05-11 10:12:11 - ERROR - stderr - 76%|███████▌ | 2843/3741 [16:46:17<5:06:55, 20.51s/it] +2025-05-11 10:12:11 - ERROR - stderr - +2025-05-11 10:12:11 - ERROR - stderr - +2025-05-11 10:12:11 - INFO - stdout - {'loss': 0.465, 'grad_norm': 0.8321656584739685, 'learning_rate': 2.8740400576271265e-06, 'epoch': 2.28} +2025-05-11 10:12:11 - ERROR - stderr - 76%|███████▌ | 2843/3741 [16:46:17<5:06:55, 20.51s/it] +2025-05-11 10:12:33 - ERROR - stderr - 76%|███████▌ | 2844/3741 [16:46:39<5:13:41, 20.98s/it] +2025-05-11 10:12:33 - ERROR - stderr - +2025-05-11 10:12:33 - ERROR - stderr - +2025-05-11 10:12:33 - INFO - stdout - {'loss': 0.4641, 'grad_norm': 0.8143665194511414, 'learning_rate': 2.8679675875944356e-06, 'epoch': 2.28} +2025-05-11 10:12:33 - ERROR - stderr - 76%|███████▌ | 2844/3741 [16:46:39<5:13:41, 20.98s/it] +2025-05-11 10:12:53 - ERROR - stderr - 76%|███████▌ | 2845/3741 [16:46:59<5:09:13, 20.71s/it] +2025-05-11 10:12:53 - ERROR - stderr - +2025-05-11 10:12:53 - ERROR - stderr - +2025-05-11 10:12:53 - INFO - stdout - {'loss': 0.4784, 'grad_norm': 0.8252444267272949, 'learning_rate': 2.8619004654025418e-06, 'epoch': 2.28} +2025-05-11 10:12:53 - ERROR - stderr - 76%|███████▌ | 2845/3741 [16:46:59<5:09:13, 20.71s/it] +2025-05-11 10:13:16 - ERROR - stderr - 76%|███████▌ | 2846/3741 [16:47:22<5:18:07, 21.33s/it] +2025-05-11 10:13:16 - ERROR - stderr - +2025-05-11 10:13:16 - ERROR - stderr - +2025-05-11 10:13:16 - INFO - stdout - {'loss': 0.4836, 'grad_norm': 0.8725248575210571, 'learning_rate': 2.85583869560077e-06, 'epoch': 2.28} +2025-05-11 10:13:16 - ERROR - stderr - 76%|███████▌ | 2846/3741 [16:47:22<5:18:07, 21.33s/it] +2025-05-11 10:13:36 - ERROR - stderr - 76%|███████▌ | 2847/3741 [16:47:42<5:11:05, 20.88s/it] +2025-05-11 10:13:36 - ERROR - stderr - +2025-05-11 10:13:36 - ERROR - stderr - +2025-05-11 10:13:36 - INFO - stdout - {'loss': 0.4934, 'grad_norm': 0.8364203572273254, 'learning_rate': 2.8497822827344522e-06, 'epoch': 2.28} +2025-05-11 10:13:36 - ERROR - stderr - 76%|███████▌ | 2847/3741 [16:47:42<5:11:05, 20.88s/it] +2025-05-11 10:13:58 - ERROR - stderr - 76%|███████▌ | 2848/3741 [16:48:04<5:17:02, 21.30s/it] +2025-05-11 10:13:58 - ERROR - stderr - +2025-05-11 10:13:58 - ERROR - stderr - +2025-05-11 10:13:58 - INFO - stdout - {'loss': 0.5294, 'grad_norm': 0.9842785000801086, 'learning_rate': 2.8437312313448863e-06, 'epoch': 2.28} +2025-05-11 10:13:58 - ERROR - stderr - 76%|███████▌ | 2848/3741 [16:48:04<5:17:02, 21.30s/it] +2025-05-11 10:14:18 - ERROR - stderr - 76%|███████▌ | 2849/3741 [16:48:24<5:09:18, 20.81s/it] +2025-05-11 10:14:18 - ERROR - stderr - +2025-05-11 10:14:18 - ERROR - stderr - +2025-05-11 10:14:18 - INFO - stdout - {'loss': 0.4845, 'grad_norm': 0.8217899799346924, 'learning_rate': 2.837685545969359e-06, 'epoch': 2.28} +2025-05-11 10:14:18 - ERROR - stderr - 76%|███████▌ | 2849/3741 [16:48:24<5:09:18, 20.81s/it] +2025-05-11 10:14:40 - ERROR - stderr - 76%|███████▌ | 2850/3741 [16:48:46<5:14:46, 21.20s/it] +2025-05-11 10:14:40 - ERROR - stderr - +2025-05-11 10:14:40 - ERROR - stderr - +2025-05-11 10:14:40 - INFO - stdout - {'loss': 0.4888, 'grad_norm': 0.847512423992157, 'learning_rate': 2.8316452311411326e-06, 'epoch': 2.29} +2025-05-11 10:14:40 - ERROR - stderr - 76%|███████▌ | 2850/3741 [16:48:46<5:14:46, 21.20s/it] +2025-05-11 10:14:59 - ERROR - stderr - 76%|███████▌ | 2851/3741 [16:49:06<5:07:23, 20.72s/it] +2025-05-11 10:14:59 - ERROR - stderr - +2025-05-11 10:14:59 - ERROR - stderr - +2025-05-11 10:14:59 - INFO - stdout - {'loss': 0.4856, 'grad_norm': 0.8483214974403381, 'learning_rate': 2.8256102913894355e-06, 'epoch': 2.29} +2025-05-11 10:14:59 - ERROR - stderr - 76%|███████▌ | 2851/3741 [16:49:06<5:07:23, 20.72s/it] +2025-05-11 10:15:22 - ERROR - stderr - 76%|███████▌ | 2852/3741 [16:49:28<5:13:58, 21.19s/it] +2025-05-11 10:15:22 - ERROR - stderr - +2025-05-11 10:15:22 - ERROR - stderr - +2025-05-11 10:15:22 - INFO - stdout - {'loss': 0.4837, 'grad_norm': 0.8673418760299683, 'learning_rate': 2.8195807312394763e-06, 'epoch': 2.29} +2025-05-11 10:15:22 - ERROR - stderr - 76%|███████▌ | 2852/3741 [16:49:28<5:13:58, 21.19s/it] +2025-05-11 10:15:41 - ERROR - stderr - 76%|███████▋ | 2853/3741 [16:49:47<5:06:25, 20.70s/it] +2025-05-11 10:15:41 - ERROR - stderr - +2025-05-11 10:15:41 - ERROR - stderr - +2025-05-11 10:15:41 - INFO - stdout - {'loss': 0.4729, 'grad_norm': 0.8354519009590149, 'learning_rate': 2.8135565552124224e-06, 'epoch': 2.29} +2025-05-11 10:15:41 - ERROR - stderr - 76%|███████▋ | 2853/3741 [16:49:48<5:06:25, 20.70s/it] +2025-05-11 10:16:04 - ERROR - stderr - 76%|███████▋ | 2854/3741 [16:50:10<5:14:17, 21.26s/it] +2025-05-11 10:16:04 - ERROR - stderr - +2025-05-11 10:16:04 - ERROR - stderr - +2025-05-11 10:16:04 - INFO - stdout - {'loss': 0.492, 'grad_norm': 0.8773465752601624, 'learning_rate': 2.8075377678254058e-06, 'epoch': 2.29} +2025-05-11 10:16:04 - ERROR - stderr - 76%|███████▋ | 2854/3741 [16:50:10<5:14:17, 21.26s/it] +2025-05-11 10:16:24 - ERROR - stderr - 76%|███████▋ | 2855/3741 [16:50:30<5:08:07, 20.87s/it] +2025-05-11 10:16:24 - ERROR - stderr - +2025-05-11 10:16:24 - ERROR - stderr - +2025-05-11 10:16:24 - INFO - stdout - {'loss': 0.4971, 'grad_norm': 0.8436979055404663, 'learning_rate': 2.801524373591522e-06, 'epoch': 2.29} +2025-05-11 10:16:24 - ERROR - stderr - 76%|███████▋ | 2855/3741 [16:50:30<5:08:07, 20.87s/it] +2025-05-11 10:16:47 - ERROR - stderr - 76%|███████▋ | 2856/3741 [16:50:53<5:19:00, 21.63s/it] +2025-05-11 10:16:47 - ERROR - stderr - +2025-05-11 10:16:47 - ERROR - stderr - +2025-05-11 10:16:47 - INFO - stdout - {'loss': 0.4994, 'grad_norm': 0.8587144613265991, 'learning_rate': 2.7955163770198136e-06, 'epoch': 2.29} +2025-05-11 10:16:47 - ERROR - stderr - 76%|███████▋ | 2856/3741 [16:50:53<5:19:00, 21.63s/it] +2025-05-11 10:17:07 - ERROR - stderr - 76%|███████▋ | 2857/3741 [16:51:13<5:10:00, 21.04s/it] +2025-05-11 10:17:07 - ERROR - stderr - +2025-05-11 10:17:07 - ERROR - stderr - +2025-05-11 10:17:07 - INFO - stdout - {'loss': 0.4767, 'grad_norm': 0.8141018152236938, 'learning_rate': 2.789513782615283e-06, 'epoch': 2.29} +2025-05-11 10:17:07 - ERROR - stderr - 76%|███████▋ | 2857/3741 [16:51:13<5:10:00, 21.04s/it] +2025-05-11 10:17:29 - ERROR - stderr - 76%|███████▋ | 2858/3741 [16:51:35<5:15:07, 21.41s/it] +2025-05-11 10:17:29 - ERROR - stderr - +2025-05-11 10:17:29 - ERROR - stderr - +2025-05-11 10:17:29 - INFO - stdout - {'loss': 0.5074, 'grad_norm': 0.8428380489349365, 'learning_rate': 2.78351659487888e-06, 'epoch': 2.29} +2025-05-11 10:17:29 - ERROR - stderr - 76%|███████▋ | 2858/3741 [16:51:35<5:15:07, 21.41s/it] +2025-05-11 10:17:49 - ERROR - stderr - 76%|███████▋ | 2859/3741 [16:51:55<5:08:57, 21.02s/it] +2025-05-11 10:17:49 - ERROR - stderr - +2025-05-11 10:17:49 - ERROR - stderr - +2025-05-11 10:17:49 - INFO - stdout - {'loss': 0.4801, 'grad_norm': 0.8327274918556213, 'learning_rate': 2.777524818307501e-06, 'epoch': 2.29} +2025-05-11 10:17:49 - ERROR - stderr - 76%|███████▋ | 2859/3741 [16:51:55<5:08:57, 21.02s/it] +2025-05-11 10:18:12 - ERROR - stderr - 76%|███████▋ | 2860/3741 [16:52:18<5:16:01, 21.52s/it] +2025-05-11 10:18:12 - ERROR - stderr - +2025-05-11 10:18:12 - ERROR - stderr - +2025-05-11 10:18:12 - INFO - stdout - {'loss': 0.4726, 'grad_norm': 0.8546361327171326, 'learning_rate': 2.7715384573939865e-06, 'epoch': 2.29} +2025-05-11 10:18:12 - ERROR - stderr - 76%|███████▋ | 2860/3741 [16:52:18<5:16:01, 21.52s/it] +2025-05-11 10:18:32 - ERROR - stderr - 76%|███████▋ | 2861/3741 [16:52:38<5:07:45, 20.98s/it] +2025-05-11 10:18:32 - ERROR - stderr - +2025-05-11 10:18:32 - ERROR - stderr - +2025-05-11 10:18:32 - INFO - stdout - {'loss': 0.4861, 'grad_norm': 0.8553916811943054, 'learning_rate': 2.7655575166271067e-06, 'epoch': 2.29} +2025-05-11 10:18:32 - ERROR - stderr - 76%|███████▋ | 2861/3741 [16:52:38<5:07:45, 20.98s/it] +2025-05-11 10:18:53 - ERROR - stderr - 77%|███████▋ | 2862/3741 [16:53:00<5:11:05, 21.23s/it] +2025-05-11 10:18:53 - ERROR - stderr - +2025-05-11 10:18:53 - ERROR - stderr - +2025-05-11 10:18:53 - INFO - stdout - {'loss': 0.4627, 'grad_norm': 0.8089895844459534, 'learning_rate': 2.7595820004915795e-06, 'epoch': 2.3} +2025-05-11 10:18:53 - ERROR - stderr - 77%|███████▋ | 2862/3741 [16:53:00<5:11:05, 21.23s/it] +2025-05-11 10:19:13 - ERROR - stderr - 77%|███████▋ | 2863/3741 [16:53:20<5:04:39, 20.82s/it] +2025-05-11 10:19:13 - ERROR - stderr - +2025-05-11 10:19:13 - ERROR - stderr - +2025-05-11 10:19:13 - INFO - stdout - {'loss': 0.4779, 'grad_norm': 0.8807501196861267, 'learning_rate': 2.7536119134680493e-06, 'epoch': 2.3} +2025-05-11 10:19:13 - ERROR - stderr - 77%|███████▋ | 2863/3741 [16:53:20<5:04:39, 20.82s/it] +2025-05-11 10:19:36 - ERROR - stderr - 77%|███████▋ | 2864/3741 [16:53:43<5:14:38, 21.53s/it] +2025-05-11 10:19:36 - ERROR - stderr - +2025-05-11 10:19:36 - ERROR - stderr - +2025-05-11 10:19:36 - INFO - stdout - {'loss': 0.5165, 'grad_norm': 0.8504637479782104, 'learning_rate': 2.747647260033095e-06, 'epoch': 2.3} +2025-05-11 10:19:36 - ERROR - stderr - 77%|███████▋ | 2864/3741 [16:53:43<5:14:38, 21.53s/it] +2025-05-11 10:19:56 - ERROR - stderr - 77%|███████▋ | 2865/3741 [16:54:02<5:05:59, 20.96s/it] +2025-05-11 10:19:56 - ERROR - stderr - +2025-05-11 10:19:56 - ERROR - stderr - +2025-05-11 10:19:56 - INFO - stdout - {'loss': 0.4904, 'grad_norm': 0.8142641186714172, 'learning_rate': 2.7416880446592087e-06, 'epoch': 2.3} +2025-05-11 10:19:56 - ERROR - stderr - 77%|███████▋ | 2865/3741 [16:54:02<5:05:59, 20.96s/it] +2025-05-11 10:20:19 - ERROR - stderr - 77%|███████▋ | 2866/3741 [16:54:25<5:12:24, 21.42s/it] +2025-05-11 10:20:19 - ERROR - stderr - +2025-05-11 10:20:19 - ERROR - stderr - +2025-05-11 10:20:19 - INFO - stdout - {'loss': 0.5014, 'grad_norm': 0.8312305808067322, 'learning_rate': 2.7357342718148184e-06, 'epoch': 2.3} +2025-05-11 10:20:19 - ERROR - stderr - 77%|███████▋ | 2866/3741 [16:54:25<5:12:24, 21.42s/it] +2025-05-11 10:20:39 - ERROR - stderr - 77%|███████▋ | 2867/3741 [16:54:45<5:06:50, 21.06s/it] +2025-05-11 10:20:39 - ERROR - stderr - +2025-05-11 10:20:39 - ERROR - stderr - +2025-05-11 10:20:39 - INFO - stdout - {'loss': 0.4728, 'grad_norm': 0.8162922859191895, 'learning_rate': 2.729785945964264e-06, 'epoch': 2.3} +2025-05-11 10:20:39 - ERROR - stderr - 77%|███████▋ | 2867/3741 [16:54:45<5:06:50, 21.06s/it] +2025-05-11 10:21:01 - ERROR - stderr - 77%|███████▋ | 2868/3741 [16:55:08<5:13:13, 21.53s/it] +2025-05-11 10:21:01 - ERROR - stderr - +2025-05-11 10:21:01 - ERROR - stderr - +2025-05-11 10:21:01 - INFO - stdout - {'loss': 0.4964, 'grad_norm': 0.8514026999473572, 'learning_rate': 2.723843071567803e-06, 'epoch': 2.3} +2025-05-11 10:21:01 - ERROR - stderr - 77%|███████▋ | 2868/3741 [16:55:08<5:13:13, 21.53s/it] +2025-05-11 10:21:21 - ERROR - stderr - 77%|███████▋ | 2869/3741 [16:55:28<5:06:39, 21.10s/it] +2025-05-11 10:21:22 - ERROR - stderr - +2025-05-11 10:21:22 - ERROR - stderr - +2025-05-11 10:21:22 - INFO - stdout - {'loss': 0.4783, 'grad_norm': 0.8272191882133484, 'learning_rate': 2.717905653081608e-06, 'epoch': 2.3} +2025-05-11 10:21:22 - ERROR - stderr - 77%|███████▋ | 2869/3741 [16:55:28<5:06:39, 21.10s/it] +2025-05-11 10:21:45 - ERROR - stderr - 77%|███████▋ | 2870/3741 [16:55:51<5:15:12, 21.71s/it] +2025-05-11 10:21:45 - ERROR - stderr - +2025-05-11 10:21:45 - ERROR - stderr - +2025-05-11 10:21:45 - INFO - stdout - {'loss': 0.4773, 'grad_norm': 0.8552436232566833, 'learning_rate': 2.7119736949577534e-06, 'epoch': 2.3} +2025-05-11 10:21:45 - ERROR - stderr - 77%|███████▋ | 2870/3741 [16:55:51<5:15:12, 21.71s/it] +2025-05-11 10:22:04 - ERROR - stderr - 77%|███████▋ | 2871/3741 [16:56:10<5:05:07, 21.04s/it] +2025-05-11 10:22:04 - ERROR - stderr - +2025-05-11 10:22:04 - ERROR - stderr - +2025-05-11 10:22:04 - INFO - stdout - {'loss': 0.4754, 'grad_norm': 0.8255532383918762, 'learning_rate': 2.706047201644224e-06, 'epoch': 2.3} +2025-05-11 10:22:04 - ERROR - stderr - 77%|███████▋ | 2871/3741 [16:56:10<5:05:07, 21.04s/it] +2025-05-11 10:22:27 - ERROR - stderr - 77%|███████▋ | 2872/3741 [16:56:33<5:12:28, 21.57s/it] +2025-05-11 10:22:27 - ERROR - stderr - +2025-05-11 10:22:27 - ERROR - stderr - +2025-05-11 10:22:27 - INFO - stdout - {'loss': 0.5073, 'grad_norm': 0.8571800589561462, 'learning_rate': 2.7001261775849086e-06, 'epoch': 2.3} +2025-05-11 10:22:27 - ERROR - stderr - 77%|███████▋ | 2872/3741 [16:56:33<5:12:28, 21.57s/it] +2025-05-11 10:22:47 - ERROR - stderr - 77%|███████▋ | 2873/3741 [16:56:53<5:03:35, 20.99s/it] +2025-05-11 10:22:47 - ERROR - stderr - +2025-05-11 10:22:47 - ERROR - stderr - +2025-05-11 10:22:47 - INFO - stdout - {'loss': 0.5018, 'grad_norm': 0.8622461557388306, 'learning_rate': 2.69421062721959e-06, 'epoch': 2.3} +2025-05-11 10:22:47 - ERROR - stderr - 77%|███████▋ | 2873/3741 [16:56:53<5:03:35, 20.99s/it] +2025-05-11 10:23:11 - ERROR - stderr - 77%|███████▋ | 2874/3741 [16:57:17<5:16:36, 21.91s/it] +2025-05-11 10:23:11 - ERROR - stderr - +2025-05-11 10:23:11 - ERROR - stderr - +2025-05-11 10:23:11 - INFO - stdout - {'loss': 0.4727, 'grad_norm': 0.8514299392700195, 'learning_rate': 2.688300554983955e-06, 'epoch': 2.3} +2025-05-11 10:23:11 - ERROR - stderr - 77%|██████��▋ | 2874/3741 [16:57:17<5:16:36, 21.91s/it] +2025-05-11 10:23:30 - ERROR - stderr - 77%|███████▋ | 2875/3741 [16:57:37<5:07:28, 21.30s/it] +2025-05-11 10:23:31 - ERROR - stderr - +2025-05-11 10:23:31 - ERROR - stderr - +2025-05-11 10:23:31 - INFO - stdout - {'loss': 0.4859, 'grad_norm': 0.8603047728538513, 'learning_rate': 2.682395965309569e-06, 'epoch': 2.31} +2025-05-11 10:23:31 - ERROR - stderr - 77%|███████▋ | 2875/3741 [16:57:37<5:07:28, 21.30s/it] +2025-05-11 10:23:53 - ERROR - stderr - 77%|███████▋ | 2876/3741 [16:57:59<5:12:08, 21.65s/it] +2025-05-11 10:23:53 - ERROR - stderr - +2025-05-11 10:23:53 - ERROR - stderr - +2025-05-11 10:23:53 - INFO - stdout - {'loss': 0.4869, 'grad_norm': 0.8229160308837891, 'learning_rate': 2.6764968626238986e-06, 'epoch': 2.31} +2025-05-11 10:23:53 - ERROR - stderr - 77%|███████▋ | 2876/3741 [16:57:59<5:12:08, 21.65s/it] +2025-05-11 10:24:12 - ERROR - stderr - 77%|███████▋ | 2877/3741 [16:58:19<5:02:37, 21.02s/it] +2025-05-11 10:24:13 - ERROR - stderr - +2025-05-11 10:24:13 - ERROR - stderr - +2025-05-11 10:24:13 - INFO - stdout - {'loss': 0.4736, 'grad_norm': 0.8354660868644714, 'learning_rate': 2.6706032513502913e-06, 'epoch': 2.31} +2025-05-11 10:24:13 - ERROR - stderr - 77%|███████▋ | 2877/3741 [16:58:19<5:02:37, 21.02s/it] +2025-05-11 10:24:32 - ERROR - stderr - 77%|███████▋ | 2878/3741 [16:58:38<4:56:21, 20.60s/it] +2025-05-11 10:24:32 - ERROR - stderr - +2025-05-11 10:24:32 - ERROR - stderr - +2025-05-11 10:24:32 - INFO - stdout - {'loss': 0.482, 'grad_norm': 0.8746702075004578, 'learning_rate': 2.664715135907977e-06, 'epoch': 2.31} +2025-05-11 10:24:32 - ERROR - stderr - 77%|███████▋ | 2878/3741 [16:58:38<4:56:21, 20.60s/it] +2025-05-11 10:24:53 - ERROR - stderr - 77%|███████▋ | 2879/3741 [16:58:59<4:55:13, 20.55s/it] +2025-05-11 10:24:53 - ERROR - stderr - +2025-05-11 10:24:53 - ERROR - stderr - +2025-05-11 10:24:53 - INFO - stdout - {'loss': 0.4917, 'grad_norm': 0.8408623933792114, 'learning_rate': 2.65883252071207e-06, 'epoch': 2.31} +2025-05-11 10:24:53 - ERROR - stderr - 77%|███████▋ | 2879/3741 [16:58:59<4:55:13, 20.55s/it] +2025-05-11 10:25:12 - ERROR - stderr - 77%|███████▋ | 2880/3741 [16:59:18<4:49:59, 20.21s/it] +2025-05-11 10:25:12 - ERROR - stderr - +2025-05-11 10:25:12 - ERROR - stderr - +2025-05-11 10:25:12 - INFO - stdout - {'loss': 0.4974, 'grad_norm': 0.8481791615486145, 'learning_rate': 2.652955410173548e-06, 'epoch': 2.31} +2025-05-11 10:25:12 - ERROR - stderr - 77%|███████▋ | 2880/3741 [16:59:18<4:49:59, 20.21s/it] +2025-05-11 10:25:32 - ERROR - stderr - 77%|███████▋ | 2881/3741 [16:59:38<4:47:21, 20.05s/it] +2025-05-11 10:25:32 - ERROR - stderr - +2025-05-11 10:25:32 - ERROR - stderr - +2025-05-11 10:25:32 - INFO - stdout - {'loss': 0.4732, 'grad_norm': 0.8575053215026855, 'learning_rate': 2.6470838086992724e-06, 'epoch': 2.31} +2025-05-11 10:25:32 - ERROR - stderr - 77%|███████▋ | 2881/3741 [16:59:38<4:47:21, 20.05s/it] +2025-05-11 10:25:51 - ERROR - stderr - 77%|███████▋ | 2882/3741 [16:59:58<4:45:22, 19.93s/it] +2025-05-11 10:25:51 - ERROR - stderr - +2025-05-11 10:25:51 - ERROR - stderr - +2025-05-11 10:25:51 - INFO - stdout - {'loss': 0.4728, 'grad_norm': 0.857205867767334, 'learning_rate': 2.641217720691972e-06, 'epoch': 2.31} +2025-05-11 10:25:51 - ERROR - stderr - 77%|███████▋ | 2882/3741 [16:59:58<4:45:22, 19.93s/it] +2025-05-11 10:26:14 - ERROR - stderr - 77%|███████▋ | 2883/3741 [17:00:20<4:55:56, 20.70s/it] +2025-05-11 10:26:14 - ERROR - stderr - +2025-05-11 10:26:14 - ERROR - stderr - +2025-05-11 10:26:14 - INFO - stdout - {'loss': 0.4909, 'grad_norm': 0.8567532300949097, 'learning_rate': 2.6353571505502317e-06, 'epoch': 2.31} +2025-05-11 10:26:14 - ERROR - stderr - 77%|███████▋ | 2883/3741 [17:00:20<4:55:56, 20.70s/it] +2025-05-11 10:26:34 - ERROR - stderr - 77%|███████▋ | 2884/3741 [17:00:40<4:52:00, 20.44s/it] +2025-05-11 10:26:34 - ERROR - stderr - +2025-05-11 10:26:34 - ERROR - stderr - +2025-05-11 10:26:34 - INFO - stdout - {'loss': 0.4863, 'grad_norm': 0.8322728276252747, 'learning_rate': 2.6295021026685176e-06, 'epoch': 2.31} +2025-05-11 10:26:34 - ERROR - stderr - 77%|███████▋ | 2884/3741 [17:00:40<4:52:00, 20.44s/it] +2025-05-11 10:26:56 - ERROR - stderr - 77%|███████▋ | 2885/3741 [17:01:03<5:01:49, 21.16s/it] +2025-05-11 10:26:56 - ERROR - stderr - +2025-05-11 10:26:56 - ERROR - stderr - +2025-05-11 10:26:56 - INFO - stdout - {'loss': 0.4861, 'grad_norm': 0.8363150954246521, 'learning_rate': 2.623652581437135e-06, 'epoch': 2.31} +2025-05-11 10:26:56 - ERROR - stderr - 77%|███████▋ | 2885/3741 [17:01:03<5:01:49, 21.16s/it] +2025-05-11 10:27:16 - ERROR - stderr - 77%|███████▋ | 2886/3741 [17:01:22<4:54:41, 20.68s/it] +2025-05-11 10:27:16 - ERROR - stderr - +2025-05-11 10:27:16 - ERROR - stderr - +2025-05-11 10:27:16 - INFO - stdout - {'loss': 0.5115, 'grad_norm': 0.8756260871887207, 'learning_rate': 2.617808591242258e-06, 'epoch': 2.31} +2025-05-11 10:27:16 - ERROR - stderr - 77%|███████▋ | 2886/3741 [17:01:22<4:54:41, 20.68s/it] +2025-05-11 10:27:38 - ERROR - stderr - 77%|███████▋ | 2887/3741 [17:01:45<5:01:29, 21.18s/it] +2025-05-11 10:27:38 - ERROR - stderr - +2025-05-11 10:27:38 - ERROR - stderr - +2025-05-11 10:27:38 - INFO - stdout - {'loss': 0.4944, 'grad_norm': 0.8478529453277588, 'learning_rate': 2.6119701364659124e-06, 'epoch': 2.32} +2025-05-11 10:27:38 - ERROR - stderr - 77%|███████▋ | 2887/3741 [17:01:45<5:01:29, 21.18s/it] +2025-05-11 10:27:58 - ERROR - stderr - 77%|███████▋ | 2888/3741 [17:02:05<4:55:15, 20.77s/it] +2025-05-11 10:27:58 - ERROR - stderr - +2025-05-11 10:27:58 - ERROR - stderr - +2025-05-11 10:27:58 - INFO - stdout - {'loss': 0.4886, 'grad_norm': 0.8674932718276978, 'learning_rate': 2.6061372214859595e-06, 'epoch': 2.32} +2025-05-11 10:27:58 - ERROR - stderr - 77%|███████▋ | 2888/3741 [17:02:05<4:55:15, 20.77s/it] +2025-05-11 10:28:21 - ERROR - stderr - 77%|███████▋ | 2889/3741 [17:02:27<5:02:17, 21.29s/it] +2025-05-11 10:28:21 - ERROR - stderr - +2025-05-11 10:28:21 - ERROR - stderr - +2025-05-11 10:28:21 - INFO - stdout - {'loss': 0.5083, 'grad_norm': 0.8758816719055176, 'learning_rate': 2.6003098506761316e-06, 'epoch': 2.32} +2025-05-11 10:28:21 - ERROR - stderr - 77%|███████▋ | 2889/3741 [17:02:27<5:02:17, 21.29s/it] +2025-05-11 10:28:40 - ERROR - stderr - 77%|███████▋ | 2890/3741 [17:02:46<4:53:50, 20.72s/it] +2025-05-11 10:28:40 - ERROR - stderr - +2025-05-11 10:28:40 - ERROR - stderr - +2025-05-11 10:28:40 - INFO - stdout - {'loss': 0.4809, 'grad_norm': 0.8525510430335999, 'learning_rate': 2.5944880284059804e-06, 'epoch': 2.32} +2025-05-11 10:28:40 - ERROR - stderr - 77%|███████▋ | 2890/3741 [17:02:46<4:53:50, 20.72s/it] +2025-05-11 10:29:02 - ERROR - stderr - 77%|███████▋ | 2891/3741 [17:03:08<4:57:17, 20.98s/it] +2025-05-11 10:29:02 - ERROR - stderr - +2025-05-11 10:29:02 - ERROR - stderr - +2025-05-11 10:29:02 - INFO - stdout - {'loss': 0.4929, 'grad_norm': 0.840334415435791, 'learning_rate': 2.588671759040909e-06, 'epoch': 2.32} +2025-05-11 10:29:02 - ERROR - stderr - 77%|███████▋ | 2891/3741 [17:03:08<4:57:17, 20.98s/it] +2025-05-11 10:29:21 - ERROR - stderr - 77%|███████▋ | 2892/3741 [17:03:28<4:50:50, 20.55s/it] +2025-05-11 10:29:21 - ERROR - stderr - +2025-05-11 10:29:21 - ERROR - stderr - +2025-05-11 10:29:21 - INFO - stdout - {'loss': 0.4818, 'grad_norm': 0.8207436203956604, 'learning_rate': 2.582861046942158e-06, 'epoch': 2.32} +2025-05-11 10:29:21 - ERROR - stderr - 77%|███████▋ | 2892/3741 [17:03:28<4:50:50, 20.55s/it] +2025-05-11 10:29:45 - ERROR - stderr - 77%|███████▋ | 2893/3741 [17:03:52<5:05:59, 21.65s/it] +2025-05-11 10:29:45 - ERROR - stderr - +2025-05-11 10:29:45 - ERROR - stderr - +2025-05-11 10:29:45 - INFO - stdout - {'loss': 0.4912, 'grad_norm': 0.8692548871040344, 'learning_rate': 2.577055896466788e-06, 'epoch': 2.32} +2025-05-11 10:29:45 - ERROR - stderr - 77%|███████▋ | 2893/3741 [17:03:52<5:05:59, 21.65s/it] +2025-05-11 10:30:05 - ERROR - stderr - 77%|███████▋ | 2894/3741 [17:04:12<4:57:50, 21.10s/it] +2025-05-11 10:30:05 - ERROR - stderr - +2025-05-11 10:30:05 - ERROR - stderr - +2025-05-11 10:30:05 - INFO - stdout - {'loss': 0.4636, 'grad_norm': 0.8304778337478638, 'learning_rate': 2.571256311967709e-06, 'epoch': 2.32} +2025-05-11 10:30:05 - ERROR - stderr - 77%|███████▋ | 2894/3741 [17:04:12<4:57:50, 21.10s/it] +2025-05-11 10:30:25 - ERROR - stderr - 77%|███████▋ | 2895/3741 [17:04:31<4:51:02, 20.64s/it] +2025-05-11 10:30:25 - ERROR - stderr - +2025-05-11 10:30:25 - ERROR - stderr - +2025-05-11 10:30:25 - INFO - stdout - {'loss': 0.4784, 'grad_norm': 0.8468001484870911, 'learning_rate': 2.565462297793644e-06, 'epoch': 2.32} +2025-05-11 10:30:25 - ERROR - stderr - 77%|███████▋ | 2895/3741 [17:04:31<4:51:02, 20.64s/it] +2025-05-11 10:30:45 - ERROR - stderr - 77%|███████▋ | 2896/3741 [17:04:51<4:47:08, 20.39s/it] +2025-05-11 10:30:45 - ERROR - stderr - +2025-05-11 10:30:45 - ERROR - stderr - +2025-05-11 10:30:45 - INFO - stdout - {'loss': 0.4685, 'grad_norm': 0.8657370209693909, 'learning_rate': 2.5596738582891335e-06, 'epoch': 2.32} +2025-05-11 10:30:45 - ERROR - stderr - 77%|███████▋ | 2896/3741 [17:04:51<4:47:08, 20.39s/it] +2025-05-11 10:31:04 - ERROR - stderr - 77%|███████▋ | 2897/3741 [17:05:11<4:44:00, 20.19s/it] +2025-05-11 10:31:04 - ERROR - stderr - +2025-05-11 10:31:04 - ERROR - stderr - +2025-05-11 10:31:04 - INFO - stdout - {'loss': 0.4642, 'grad_norm': 0.8483834862709045, 'learning_rate': 2.5538909977945593e-06, 'epoch': 2.32} +2025-05-11 10:31:04 - ERROR - stderr - 77%|███████▋ | 2897/3741 [17:05:11<4:44:00, 20.19s/it] +2025-05-11 10:31:24 - ERROR - stderr - 77%|███████▋ | 2898/3741 [17:05:30<4:40:47, 19.98s/it] +2025-05-11 10:31:24 - ERROR - stderr - +2025-05-11 10:31:24 - ERROR - stderr - +2025-05-11 10:31:24 - INFO - stdout - {'loss': 0.4746, 'grad_norm': 0.8330668210983276, 'learning_rate': 2.5481137206460994e-06, 'epoch': 2.32} +2025-05-11 10:31:24 - ERROR - stderr - 77%|███████▋ | 2898/3741 [17:05:30<4:40:47, 19.98s/it] +2025-05-11 10:31:44 - ERROR - stderr - 77%|███████▋ | 2899/3741 [17:05:50<4:39:41, 19.93s/it] +2025-05-11 10:31:44 - ERROR - stderr - +2025-05-11 10:31:44 - ERROR - stderr - +2025-05-11 10:31:44 - INFO - stdout - {'loss': 0.4604, 'grad_norm': 0.7988243103027344, 'learning_rate': 2.542342031175754e-06, 'epoch': 2.32} +2025-05-11 10:31:44 - ERROR - stderr - 77%|███████▋ | 2899/3741 [17:05:50<4:39:41, 19.93s/it] +2025-05-11 10:32:05 - ERROR - stderr - 78%|███████▊ | 2900/3741 [17:06:11<4:45:32, 20.37s/it] +2025-05-11 10:32:05 - ERROR - stderr - +2025-05-11 10:32:05 - ERROR - stderr - +2025-05-11 10:32:05 - INFO - stdout - {'loss': 0.4834, 'grad_norm': 0.8841968774795532, 'learning_rate': 2.536575933711336e-06, 'epoch': 2.33} +2025-05-11 10:32:05 - ERROR - stderr - 78%|███████▊ | 2900/3741 [17:06:11<4:45:32, 20.37s/it] +2025-05-11 10:32:25 - ERROR - stderr - 78%|███████▊ | 2901/3741 [17:06:31<4:41:44, 20.12s/it] +2025-05-11 10:32:25 - ERROR - stderr - +2025-05-11 10:32:25 - ERROR - stderr - +2025-05-11 10:32:25 - INFO - stdout - {'loss': 0.4787, 'grad_norm': 0.9015833735466003, 'learning_rate': 2.5308154325764543e-06, 'epoch': 2.33} +2025-05-11 10:32:25 - ERROR - stderr - 78%|███████▊ | 2901/3741 [17:06:31<4:41:44, 20.12s/it] +2025-05-11 10:32:46 - ERROR - stderr - 78%|███████▊ | 2902/3741 [17:06:53<4:47:49, 20.58s/it] +2025-05-11 10:32:46 - ERROR - stderr - +2025-05-11 10:32:46 - ERROR - stderr - +2025-05-11 10:32:46 - INFO - stdout - {'loss': 0.5111, 'grad_norm': 0.8655214905738831, 'learning_rate': 2.5250605320905387e-06, 'epoch': 2.33} +2025-05-11 10:32:46 - ERROR - stderr - 78%|███████▊ | 2902/3741 [17:06:53<4:47:49, 20.58s/it] +2025-05-11 10:33:06 - ERROR - stderr - 78%|███████▊ | 2903/3741 [17:07:12<4:43:02, 20.27s/it] +2025-05-11 10:33:06 - ERROR - stderr - +2025-05-11 10:33:06 - ERROR - stderr - +2025-05-11 10:33:06 - INFO - stdout - {'loss': 0.4828, 'grad_norm': 0.8751394152641296, 'learning_rate': 2.519311236568801e-06, 'epoch': 2.33} +2025-05-11 10:33:06 - ERROR - stderr - 78%|███████▊ | 2903/3741 [17:07:12<4:43:02, 20.27s/it] +2025-05-11 10:33:29 - ERROR - stderr - 78%|███████▊ | 2904/3741 [17:07:35<4:53:27, 21.04s/it] +2025-05-11 10:33:29 - ERROR - stderr - +2025-05-11 10:33:29 - ERROR - stderr - +2025-05-11 10:33:29 - INFO - stdout - {'loss': 0.4704, 'grad_norm': 0.8501570820808411, 'learning_rate': 2.5135675503222623e-06, 'epoch': 2.33} +2025-05-11 10:33:29 - ERROR - stderr - 78%|███████▊ | 2904/3741 [17:07:35<4:53:27, 21.04s/it] +2025-05-11 10:33:48 - ERROR - stderr - 78%|███████▊ | 2905/3741 [17:07:54<4:46:53, 20.59s/it] +2025-05-11 10:33:48 - ERROR - stderr - +2025-05-11 10:33:48 - ERROR - stderr - +2025-05-11 10:33:48 - INFO - stdout - {'loss': 0.4816, 'grad_norm': 0.8615387678146362, 'learning_rate': 2.5078294776577372e-06, 'epoch': 2.33} +2025-05-11 10:33:48 - ERROR - stderr - 78%|███████▊ | 2905/3741 [17:07:55<4:46:53, 20.59s/it] +2025-05-11 10:34:12 - ERROR - stderr - 78%|███████▊ | 2906/3741 [17:08:19<5:00:57, 21.63s/it] +2025-05-11 10:34:12 - ERROR - stderr - +2025-05-11 10:34:12 - ERROR - stderr - +2025-05-11 10:34:12 - INFO - stdout - {'loss': 0.4831, 'grad_norm': 0.8755018711090088, 'learning_rate': 2.5020970228778198e-06, 'epoch': 2.33} +2025-05-11 10:34:12 - ERROR - stderr - 78%|███████▊ | 2906/3741 [17:08:19<5:00:57, 21.63s/it] +2025-05-11 10:34:32 - ERROR - stderr - 78%|███████▊ | 2907/3741 [17:08:38<4:52:39, 21.05s/it] +2025-05-11 10:34:32 - ERROR - stderr - +2025-05-11 10:34:32 - ERROR - stderr - +2025-05-11 10:34:32 - INFO - stdout - {'loss': 0.4664, 'grad_norm': 0.877673327922821, 'learning_rate': 2.49637019028091e-06, 'epoch': 2.33} +2025-05-11 10:34:32 - ERROR - stderr - 78%|███████▊ | 2907/3741 [17:08:38<4:52:39, 21.05s/it] +2025-05-11 10:34:55 - ERROR - stderr - 78%|███████▊ | 2908/3741 [17:09:02<5:02:45, 21.81s/it] +2025-05-11 10:34:56 - ERROR - stderr - +2025-05-11 10:34:56 - ERROR - stderr - +2025-05-11 10:34:56 - INFO - stdout - {'loss': 0.4914, 'grad_norm': 0.8591618537902832, 'learning_rate': 2.4906489841611736e-06, 'epoch': 2.33} +2025-05-11 10:34:56 - ERROR - stderr - 78%|███████▊ | 2908/3741 [17:09:02<5:02:45, 21.81s/it] +2025-05-11 10:35:15 - ERROR - stderr - 78%|███████▊ | 2909/3741 [17:09:21<4:53:26, 21.16s/it] +2025-05-11 10:35:15 - ERROR - stderr - +2025-05-11 10:35:15 - ERROR - stderr - +2025-05-11 10:35:15 - INFO - stdout - {'loss': 0.4756, 'grad_norm': 0.820887565612793, 'learning_rate': 2.48493340880857e-06, 'epoch': 2.33} +2025-05-11 10:35:15 - ERROR - stderr - 78%|███████▊ | 2909/3741 [17:09:21<4:53:26, 21.16s/it] +2025-05-11 10:35:36 - ERROR - stderr - 78%|███████▊ | 2910/3741 [17:09:42<4:50:04, 20.94s/it] +2025-05-11 10:35:36 - ERROR - stderr - +2025-05-11 10:35:36 - ERROR - stderr - +2025-05-11 10:35:36 - INFO - stdout - {'loss': 0.4756, 'grad_norm': 0.8460831642150879, 'learning_rate': 2.4792234685088312e-06, 'epoch': 2.33} +2025-05-11 10:35:36 - ERROR - stderr - 78%|███████▊ | 2910/3741 [17:09:42<4:50:04, 20.94s/it] +2025-05-11 10:35:55 - ERROR - stderr - 78%|███████▊ | 2911/3741 [17:10:02<4:44:46, 20.59s/it] +2025-05-11 10:35:55 - ERROR - stderr - +2025-05-11 10:35:55 - ERROR - stderr - +2025-05-11 10:35:55 - INFO - stdout - {'loss': 0.4647, 'grad_norm': 0.8340601921081543, 'learning_rate': 2.473519167543467e-06, 'epoch': 2.33} +2025-05-11 10:35:55 - ERROR - stderr - 78%|███████▊ | 2911/3741 [17:10:02<4:44:46, 20.59s/it] +2025-05-11 10:36:15 - ERROR - stderr - 78%|███████▊ | 2912/3741 [17:10:22<4:41:25, 20.37s/it] +2025-05-11 10:36:15 - ERROR - stderr - +2025-05-11 10:36:15 - ERROR - stderr - +2025-05-11 10:36:15 - INFO - stdout - {'loss': 0.4587, 'grad_norm': 0.8154265284538269, 'learning_rate': 2.4678205101897523e-06, 'epoch': 2.34} +2025-05-11 10:36:15 - ERROR - stderr - 78%|███████▊ | 2912/3741 [17:10:22<4:41:25, 20.37s/it] +2025-05-11 10:36:35 - ERROR - stderr - 78%|███████▊ | 2913/3741 [17:10:41<4:39:21, 20.24s/it] +2025-05-11 10:36:35 - ERROR - stderr - +2025-05-11 10:36:35 - ERROR - stderr - +2025-05-11 10:36:35 - INFO - stdout - {'loss': 0.4981, 'grad_norm': 0.8957749605178833, 'learning_rate': 2.462127500720737e-06, 'epoch': 2.34} +2025-05-11 10:36:35 - ERROR - stderr - 78%|███████▊ | 2913/3741 [17:10:41<4:39:21, 20.24s/it] +2025-05-11 10:36:56 - ERROR - stderr - 78%|███████▊ | 2914/3741 [17:11:02<4:40:05, 20.32s/it] +2025-05-11 10:36:56 - ERROR - stderr - +2025-05-11 10:36:56 - ERROR - stderr - +2025-05-11 10:36:56 - INFO - stdout - {'loss': 0.4774, 'grad_norm': 0.8637145757675171, 'learning_rate': 2.456440143405232e-06, 'epoch': 2.34} +2025-05-11 10:36:56 - ERROR - stderr - 78%|███████▊ | 2914/3741 [17:11:02<4:40:05, 20.32s/it] +2025-05-11 10:37:16 - ERROR - stderr - 78%|███████▊ | 2915/3741 [17:11:23<4:41:07, 20.42s/it] +2025-05-11 10:37:16 - ERROR - stderr - +2025-05-11 10:37:16 - ERROR - stderr - +2025-05-11 10:37:16 - INFO - stdout - {'loss': 0.4996, 'grad_norm': 0.8834245800971985, 'learning_rate': 2.4507584425078133e-06, 'epoch': 2.34} +2025-05-11 10:37:16 - ERROR - stderr - 78%|███████▊ | 2915/3741 [17:11:23<4:41:07, 20.42s/it] +2025-05-11 10:37:36 - ERROR - stderr - 78%|███████▊ | 2916/3741 [17:11:43<4:39:24, 20.32s/it] +2025-05-11 10:37:36 - ERROR - stderr - +2025-05-11 10:37:36 - ERROR - stderr - +2025-05-11 10:37:36 - INFO - stdout - {'loss': 0.509, 'grad_norm': 0.8881711363792419, 'learning_rate': 2.4450824022888166e-06, 'epoch': 2.34} +2025-05-11 10:37:36 - ERROR - stderr - 78%|███████▊ | 2916/3741 [17:11:43<4:39:24, 20.32s/it] +2025-05-11 10:37:58 - ERROR - stderr - 78%|███████▊ | 2917/3741 [17:12:05<4:45:39, 20.80s/it] +2025-05-11 10:37:58 - ERROR - stderr - +2025-05-11 10:37:58 - ERROR - stderr - +2025-05-11 10:37:58 - INFO - stdout - {'loss': 0.4873, 'grad_norm': 0.9144017100334167, 'learning_rate': 2.4394120270043233e-06, 'epoch': 2.34} +2025-05-11 10:37:58 - ERROR - stderr - 78%|███████▊ | 2917/3741 [17:12:05<4:45:39, 20.80s/it] +2025-05-11 10:38:18 - ERROR - stderr - 78%|███████▊ | 2918/3741 [17:12:24<4:41:16, 20.51s/it] +2025-05-11 10:38:18 - ERROR - stderr - +2025-05-11 10:38:18 - ERROR - stderr - +2025-05-11 10:38:18 - INFO - stdout - {'loss': 0.464, 'grad_norm': 0.8431651592254639, 'learning_rate': 2.433747320906177e-06, 'epoch': 2.34} +2025-05-11 10:38:18 - ERROR - stderr - 78%|███████▊ | 2918/3741 [17:12:24<4:41:16, 20.51s/it] +2025-05-11 10:38:39 - ERROR - stderr - 78%|███████▊ | 2919/3741 [17:12:46<4:44:13, 20.75s/it] +2025-05-11 10:38:39 - ERROR - stderr - +2025-05-11 10:38:39 - ERROR - stderr - +2025-05-11 10:38:39 - INFO - stdout - {'loss': 0.4791, 'grad_norm': 0.8841955661773682, 'learning_rate': 2.4280882882419676e-06, 'epoch': 2.34} +2025-05-11 10:38:39 - ERROR - stderr - 78%|███████▊ | 2919/3741 [17:12:46<4:44:13, 20.75s/it] +2025-05-11 10:38:59 - ERROR - stderr - 78%|███████▊ | 2920/3741 [17:13:06<4:40:19, 20.49s/it] +2025-05-11 10:38:59 - ERROR - stderr - +2025-05-11 10:38:59 - ERROR - stderr - +2025-05-11 10:38:59 - INFO - stdout - {'loss': 0.5035, 'grad_norm': 0.9004083871841431, 'learning_rate': 2.4224349332550313e-06, 'epoch': 2.34} +2025-05-11 10:38:59 - ERROR - stderr - 78%|███████▊ | 2920/3741 [17:13:06<4:40:19, 20.49s/it] +2025-05-11 10:39:22 - ERROR - stderr - 78%|███████▊ | 2921/3741 [17:13:29<4:50:31, 21.26s/it] +2025-05-11 10:39:22 - ERROR - stderr - +2025-05-11 10:39:22 - ERROR - stderr - +2025-05-11 10:39:22 - INFO - stdout - {'loss': 0.4744, 'grad_norm': 0.8716943264007568, 'learning_rate': 2.4167872601844476e-06, 'epoch': 2.34} +2025-05-11 10:39:22 - ERROR - stderr - 78%|███████▊ | 2921/3741 [17:13:29<4:50:31, 21.26s/it] +2025-05-11 10:39:42 - ERROR - stderr - 78%|███████▊ | 2922/3741 [17:13:48<4:44:12, 20.82s/it] +2025-05-11 10:39:42 - ERROR - stderr - +2025-05-11 10:39:42 - ERROR - stderr - +2025-05-11 10:39:42 - INFO - stdout - {'loss': 0.4983, 'grad_norm': 0.8678621053695679, 'learning_rate': 2.411145273265029e-06, 'epoch': 2.34} +2025-05-11 10:39:42 - ERROR - stderr - 78%|███████▊ | 2922/3741 [17:13:49<4:44:12, 20.82s/it] +2025-05-11 10:40:05 - ERROR - stderr - 78%|███████▊ | 2923/3741 [17:14:11<4:51:39, 21.39s/it] +2025-05-11 10:40:05 - ERROR - stderr - +2025-05-11 10:40:05 - ERROR - stderr - +2025-05-11 10:40:05 - INFO - stdout - {'loss': 0.4798, 'grad_norm': 0.8493649959564209, 'learning_rate': 2.405508976727332e-06, 'epoch': 2.34} +2025-05-11 10:40:05 - ERROR - stderr - 78%|███████▊ | 2923/3741 [17:14:11<4:51:39, 21.39s/it] +2025-05-11 10:40:25 - ERROR - stderr - 78%|███████▊ | 2924/3741 [17:14:31<4:44:00, 20.86s/it] +2025-05-11 10:40:25 - ERROR - stderr - +2025-05-11 10:40:25 - ERROR - stderr - +2025-05-11 10:40:25 - INFO - stdout - {'loss': 0.5009, 'grad_norm': 0.8921751379966736, 'learning_rate': 2.3998783747976473e-06, 'epoch': 2.34} +2025-05-11 10:40:25 - ERROR - stderr - 78%|███████▊ | 2924/3741 [17:14:31<4:44:00, 20.86s/it] +2025-05-11 10:40:47 - ERROR - stderr - 78%|███████▊ | 2925/3741 [17:14:54<4:51:11, 21.41s/it] +2025-05-11 10:40:47 - ERROR - stderr - +2025-05-11 10:40:47 - ERROR - stderr - +2025-05-11 10:40:47 - INFO - stdout - {'loss': 0.473, 'grad_norm': 0.8245465755462646, 'learning_rate': 2.3942534716979827e-06, 'epoch': 2.35} +2025-05-11 10:40:47 - ERROR - stderr - 78%|███████▊ | 2925/3741 [17:14:54<4:51:11, 21.41s/it] +2025-05-11 10:41:07 - ERROR - stderr - 78%|███████▊ | 2926/3741 [17:15:13<4:44:36, 20.95s/it] +2025-05-11 10:41:07 - ERROR - stderr - +2025-05-11 10:41:07 - ERROR - stderr - +2025-05-11 10:41:07 - INFO - stdout - {'loss': 0.452, 'grad_norm': 0.7934049367904663, 'learning_rate': 2.3886342716460932e-06, 'epoch': 2.35} +2025-05-11 10:41:07 - ERROR - stderr - 78%|███████▊ | 2926/3741 [17:15:13<4:44:36, 20.95s/it] +2025-05-11 10:41:30 - ERROR - stderr - 78%|███████▊ | 2927/3741 [17:15:36<4:51:59, 21.52s/it] +2025-05-11 10:41:30 - ERROR - stderr - +2025-05-11 10:41:30 - ERROR - stderr - +2025-05-11 10:41:30 - INFO - stdout - {'loss': 0.4759, 'grad_norm': 0.8302115201950073, 'learning_rate': 2.3830207788554394e-06, 'epoch': 2.35} +2025-05-11 10:41:30 - ERROR - stderr - 78%|███████▊ | 2927/3741 [17:15:36<4:51:59, 21.52s/it] +2025-05-11 10:41:50 - ERROR - stderr - 78%|███████▊ | 2928/3741 [17:15:56<4:43:41, 20.94s/it] +2025-05-11 10:41:50 - ERROR - stderr - +2025-05-11 10:41:50 - ERROR - stderr - +2025-05-11 10:41:50 - INFO - stdout - {'loss': 0.4752, 'grad_norm': 0.8664802312850952, 'learning_rate': 2.3774129975352112e-06, 'epoch': 2.35} +2025-05-11 10:41:50 - ERROR - stderr - 78%|███████▊ | 2928/3741 [17:15:56<4:43:41, 20.94s/it] +2025-05-11 10:42:12 - ERROR - stderr - 78%|███████▊ | 2929/3741 [17:16:19<4:50:26, 21.46s/it] +2025-05-11 10:42:12 - ERROR - stderr - +2025-05-11 10:42:12 - ERROR - stderr - +2025-05-11 10:42:12 - INFO - stdout - {'loss': 0.4827, 'grad_norm': 0.8159329295158386, 'learning_rate': 2.371810931890316e-06, 'epoch': 2.35} +2025-05-11 10:42:12 - ERROR - stderr - 78%|███████▊ | 2929/3741 [17:16:19<4:50:26, 21.46s/it] +2025-05-11 10:42:32 - ERROR - stderr - 78%|███████▊ | 2930/3741 [17:16:38<4:42:31, 20.90s/it] +2025-05-11 10:42:32 - ERROR - stderr - +2025-05-11 10:42:32 - ERROR - stderr - +2025-05-11 10:42:32 - INFO - stdout - {'loss': 0.4715, 'grad_norm': 0.8080800771713257, 'learning_rate': 2.366214586121366e-06, 'epoch': 2.35} +2025-05-11 10:42:32 - ERROR - stderr - 78%|███████▊ | 2930/3741 [17:16:38<4:42:31, 20.90s/it] +2025-05-11 10:42:54 - ERROR - stderr - 78%|███████▊ | 2931/3741 [17:17:00<4:46:50, 21.25s/it] +2025-05-11 10:42:54 - ERROR - stderr - +2025-05-11 10:42:54 - ERROR - stderr - +2025-05-11 10:42:54 - INFO - stdout - {'loss': 0.494, 'grad_norm': 0.8522915840148926, 'learning_rate': 2.360623964424703e-06, 'epoch': 2.35} +2025-05-11 10:42:54 - ERROR - stderr - 78%|███████▊ | 2931/3741 [17:17:00<4:46:50, 21.25s/it] +2025-05-11 10:43:13 - ERROR - stderr - 78%|███████▊ | 2932/3741 [17:17:20<4:39:14, 20.71s/it] +2025-05-11 10:43:13 - ERROR - stderr - +2025-05-11 10:43:13 - ERROR - stderr - +2025-05-11 10:43:13 - INFO - stdout - {'loss': 0.4748, 'grad_norm': 0.8710372447967529, 'learning_rate': 2.3550390709923575e-06, 'epoch': 2.35} +2025-05-11 10:43:13 - ERROR - stderr - 78%|███████▊ | 2932/3741 [17:17:20<4:39:14, 20.71s/it] +2025-05-11 10:43:36 - ERROR - stderr - 78%|███████▊ | 2933/3741 [17:17:42<4:47:29, 21.35s/it] +2025-05-11 10:43:36 - ERROR - stderr - +2025-05-11 10:43:36 - ERROR - stderr - +2025-05-11 10:43:36 - INFO - stdout - {'loss': 0.4708, 'grad_norm': 0.8452877998352051, 'learning_rate': 2.349459910012075e-06, 'epoch': 2.35} +2025-05-11 10:43:36 - ERROR - stderr - 78%|███████▊ | 2933/3741 [17:17:42<4:47:29, 21.35s/it] +2025-05-11 10:43:56 - ERROR - stderr - 78%|███████▊ | 2934/3741 [17:18:02<4:39:38, 20.79s/it] +2025-05-11 10:43:56 - ERROR - stderr - +2025-05-11 10:43:56 - ERROR - stderr - +2025-05-11 10:43:56 - INFO - stdout - {'loss': 0.4782, 'grad_norm': 0.8797398805618286, 'learning_rate': 2.343886485667303e-06, 'epoch': 2.35} +2025-05-11 10:43:56 - ERROR - stderr - 78%|███████▊ | 2934/3741 [17:18:02<4:39:38, 20.79s/it] +2025-05-11 10:44:19 - ERROR - stderr - 78%|███████▊ | 2935/3741 [17:18:26<4:50:53, 21.65s/it] +2025-05-11 10:44:19 - ERROR - stderr - +2025-05-11 10:44:19 - ERROR - stderr - +2025-05-11 10:44:19 - INFO - stdout - {'loss': 0.478, 'grad_norm': 0.8327314853668213, 'learning_rate': 2.3383188021371773e-06, 'epoch': 2.35} +2025-05-11 10:44:19 - ERROR - stderr - 78%|███████▊ | 2935/3741 [17:18:26<4:50:53, 21.65s/it] +2025-05-11 10:44:39 - ERROR - stderr - 78%|███████▊ | 2936/3741 [17:18:46<4:43:57, 21.16s/it] +2025-05-11 10:44:39 - ERROR - stderr - +2025-05-11 10:44:39 - ERROR - stderr - +2025-05-11 10:44:39 - INFO - stdout - {'loss': 0.4457, 'grad_norm': 0.8496788740158081, 'learning_rate': 2.332756863596547e-06, 'epoch': 2.35} +2025-05-11 10:44:39 - ERROR - stderr - 78%|███████▊ | 2936/3741 [17:18:46<4:43:57, 21.16s/it] +2025-05-11 10:45:03 - ERROR - stderr - 79%|███████▊ | 2937/3741 [17:19:09<4:53:02, 21.87s/it] +2025-05-11 10:45:03 - ERROR - stderr - +2025-05-11 10:45:03 - ERROR - stderr - +2025-05-11 10:45:03 - INFO - stdout - {'loss': 0.4896, 'grad_norm': 0.842319130897522, 'learning_rate': 2.327200674215937e-06, 'epoch': 2.36} +2025-05-11 10:45:03 - ERROR - stderr - 79%|███████▊ | 2937/3741 [17:19:09<4:53:02, 21.87s/it] +2025-05-11 10:45:23 - ERROR - stderr - 79%|███████▊ | 2938/3741 [17:19:29<4:46:18, 21.39s/it] +2025-05-11 10:45:23 - ERROR - stderr - +2025-05-11 10:45:23 - ERROR - stderr - +2025-05-11 10:45:23 - INFO - stdout - {'loss': 0.5128, 'grad_norm': 0.8751649260520935, 'learning_rate': 2.3216502381615633e-06, 'epoch': 2.36} +2025-05-11 10:45:23 - ERROR - stderr - 79%|███████▊ | 2938/3741 [17:19:29<4:46:18, 21.39s/it] +2025-05-11 10:45:47 - ERROR - stderr - 79%|███████▊ | 2939/3741 [17:19:53<4:54:30, 22.03s/it] +2025-05-11 10:45:47 - ERROR - stderr - +2025-05-11 10:45:47 - ERROR - stderr - +2025-05-11 10:45:47 - INFO - stdout - {'loss': 0.4961, 'grad_norm': 0.8716158866882324, 'learning_rate': 2.316105559595342e-06, 'epoch': 2.36} +2025-05-11 10:45:47 - ERROR - stderr - 79%|███████▊ | 2939/3741 [17:19:53<4:54:30, 22.03s/it] +2025-05-11 10:46:07 - ERROR - stderr - 79%|███████▊ | 2940/3741 [17:20:13<4:47:14, 21.52s/it] +2025-05-11 10:46:07 - ERROR - stderr - +2025-05-11 10:46:07 - ERROR - stderr - +2025-05-11 10:46:07 - INFO - stdout - {'loss': 0.4853, 'grad_norm': 0.9204161763191223, 'learning_rate': 2.310566642674854e-06, 'epoch': 2.36} +2025-05-11 10:46:07 - ERROR - stderr - 79%|███████▊ | 2940/3741 [17:20:13<4:47:14, 21.52s/it] +2025-05-11 10:46:30 - ERROR - stderr - 79%|███████▊ | 2941/3741 [17:20:37<4:53:52, 22.04s/it] +2025-05-11 10:46:30 - ERROR - stderr - +2025-05-11 10:46:30 - ERROR - stderr - +2025-05-11 10:46:30 - INFO - stdout - {'loss': 0.4946, 'grad_norm': 0.9012963175773621, 'learning_rate': 2.3050334915533713e-06, 'epoch': 2.36} +2025-05-11 10:46:30 - ERROR - stderr - 79%|███████▊ | 2941/3741 [17:20:37<4:53:52, 22.04s/it] +2025-05-11 10:46:51 - ERROR - stderr - 79%|███████▊ | 2942/3741 [17:20:57<4:46:51, 21.54s/it] +2025-05-11 10:46:51 - ERROR - stderr - +2025-05-11 10:46:51 - ERROR - stderr - +2025-05-11 10:46:51 - INFO - stdout - {'loss': 0.4866, 'grad_norm': 0.8790763020515442, 'learning_rate': 2.2995061103798397e-06, 'epoch': 2.36} +2025-05-11 10:46:51 - ERROR - stderr - 79%|███████▊ | 2942/3741 [17:20:57<4:46:51, 21.54s/it] +2025-05-11 10:47:14 - ERROR - stderr - 79%|███████▊ | 2943/3741 [17:21:20<4:54:23, 22.14s/it] +2025-05-11 10:47:14 - ERROR - stderr - +2025-05-11 10:47:14 - ERROR - stderr - +2025-05-11 10:47:14 - INFO - stdout - {'loss': 0.4877, 'grad_norm': 0.8495937585830688, 'learning_rate': 2.2939845032988707e-06, 'epoch': 2.36} +2025-05-11 10:47:14 - ERROR - stderr - 79%|███████▊ | 2943/3741 [17:21:20<4:54:23, 22.14s/it] +2025-05-11 10:47:34 - ERROR - stderr - 79%|███████▊ | 2944/3741 [17:21:41<4:46:14, 21.55s/it] +2025-05-11 10:47:34 - ERROR - stderr - +2025-05-11 10:47:34 - ERROR - stderr - +2025-05-11 10:47:34 - INFO - stdout - {'loss': 0.4856, 'grad_norm': 0.8435322046279907, 'learning_rate': 2.288468674450766e-06, 'epoch': 2.36} +2025-05-11 10:47:34 - ERROR - stderr - 79%|███████▊ | 2944/3741 [17:21:41<4:46:14, 21.55s/it] +2025-05-11 10:47:57 - ERROR - stderr - 79%|███████▊ | 2945/3741 [17:22:04<4:52:14, 22.03s/it] +2025-05-11 10:47:57 - ERROR - stderr - +2025-05-11 10:47:57 - ERROR - stderr - +2025-05-11 10:47:57 - INFO - stdout - {'loss': 0.4737, 'grad_norm': 0.8671093583106995, 'learning_rate': 2.28295862797147e-06, 'epoch': 2.36} +2025-05-11 10:47:57 - ERROR - stderr - 79%|███████▊ | 2945/3741 [17:22:04<4:52:14, 22.03s/it] +2025-05-11 10:48:18 - ERROR - stderr - 79%|███████▊ | 2946/3741 [17:22:24<4:44:42, 21.49s/it] +2025-05-11 10:48:18 - ERROR - stderr - +2025-05-11 10:48:18 - ERROR - stderr - +2025-05-11 10:48:18 - INFO - stdout - {'loss': 0.5076, 'grad_norm': 0.8573446273803711, 'learning_rate': 2.27745436799261e-06, 'epoch': 2.36} +2025-05-11 10:48:18 - ERROR - stderr - 79%|███████▊ | 2946/3741 [17:22:24<4:44:42, 21.49s/it] +2025-05-11 10:48:40 - ERROR - stderr - 79%|███████▉ | 2947/3741 [17:22:47<4:48:34, 21.81s/it] +2025-05-11 10:48:40 - ERROR - stderr - +2025-05-11 10:48:40 - ERROR - stderr - +2025-05-11 10:48:40 - INFO - stdout - {'loss': 0.4653, 'grad_norm': 0.8513672947883606, 'learning_rate': 2.271955898641467e-06, 'epoch': 2.36} +2025-05-11 10:48:40 - ERROR - stderr - 79%|███████▉ | 2947/3741 [17:22:47<4:48:34, 21.81s/it] +2025-05-11 10:49:01 - ERROR - stderr - 79%|███████▉ | 2948/3741 [17:23:07<4:43:18, 21.44s/it] +2025-05-11 10:49:01 - ERROR - stderr - +2025-05-11 10:49:01 - ERROR - stderr - +2025-05-11 10:49:01 - INFO - stdout - {'loss': 0.4721, 'grad_norm': 0.8536444306373596, 'learning_rate': 2.2664632240409746e-06, 'epoch': 2.36} +2025-05-11 10:49:01 - ERROR - stderr - 79%|███████▉ | 2948/3741 [17:23:07<4:43:18, 21.44s/it] +2025-05-11 10:49:24 - ERROR - stderr - 79%|███████▉ | 2949/3741 [17:23:30<4:48:04, 21.82s/it] +2025-05-11 10:49:24 - ERROR - stderr - +2025-05-11 10:49:24 - ERROR - stderr - +2025-05-11 10:49:24 - INFO - stdout - {'loss': 0.5051, 'grad_norm': 0.8997591733932495, 'learning_rate': 2.260976348309737e-06, 'epoch': 2.36} +2025-05-11 10:49:24 - ERROR - stderr - 79%|███████▉ | 2949/3741 [17:23:30<4:48:04, 21.82s/it] +2025-05-11 10:49:44 - ERROR - stderr - 79%|███████▉ | 2950/3741 [17:23:50<4:42:04, 21.40s/it] +2025-05-11 10:49:44 - ERROR - stderr - +2025-05-11 10:49:44 - ERROR - stderr - +2025-05-11 10:49:44 - INFO - stdout - {'loss': 0.485, 'grad_norm': 0.854433536529541, 'learning_rate': 2.255495275561993e-06, 'epoch': 2.37} +2025-05-11 10:49:44 - ERROR - stderr - 79%|███████▉ | 2950/3741 [17:23:50<4:42:04, 21.40s/it] +2025-05-11 10:50:07 - ERROR - stderr - 79%|███████▉ | 2951/3741 [17:24:13<4:47:48, 21.86s/it] +2025-05-11 10:50:07 - ERROR - stderr - +2025-05-11 10:50:07 - ERROR - stderr - +2025-05-11 10:50:07 - INFO - stdout - {'loss': 0.4982, 'grad_norm': 0.9025599956512451, 'learning_rate': 2.2500200099076395e-06, 'epoch': 2.37} +2025-05-11 10:50:07 - ERROR - stderr - 79%|███████▉ | 2951/3741 [17:24:13<4:47:48, 21.86s/it] +2025-05-11 10:50:28 - ERROR - stderr - 79%|███████▉ | 2952/3741 [17:24:34<4:43:21, 21.55s/it] +2025-05-11 10:50:28 - ERROR - stderr - +2025-05-11 10:50:28 - ERROR - stderr - +2025-05-11 10:50:28 - INFO - stdout - {'loss': 0.4662, 'grad_norm': 0.7927220463752747, 'learning_rate': 2.2445505554522207e-06, 'epoch': 2.37} +2025-05-11 10:50:28 - ERROR - stderr - 79%|███████▉ | 2952/3741 [17:24:34<4:43:21, 21.55s/it] +2025-05-11 10:50:51 - ERROR - stderr - 79%|███████▉ | 2953/3741 [17:24:57<4:48:23, 21.96s/it] +2025-05-11 10:50:51 - ERROR - stderr - +2025-05-11 10:50:51 - ERROR - stderr - +2025-05-11 10:50:51 - INFO - stdout - {'loss': 0.4682, 'grad_norm': 0.8282037973403931, 'learning_rate': 2.239086916296914e-06, 'epoch': 2.37} +2025-05-11 10:50:51 - ERROR - stderr - 79%|███████▉ | 2953/3741 [17:24:57<4:48:23, 21.96s/it] +2025-05-11 10:51:11 - ERROR - stderr - 79%|███████▉ | 2954/3741 [17:25:17<4:41:07, 21.43s/it] +2025-05-11 10:51:11 - ERROR - stderr - +2025-05-11 10:51:11 - ERROR - stderr - +2025-05-11 10:51:11 - INFO - stdout - {'loss': 0.4766, 'grad_norm': 0.8752132654190063, 'learning_rate': 2.2336290965385454e-06, 'epoch': 2.37} +2025-05-11 10:51:11 - ERROR - stderr - 79%|███████▉ | 2954/3741 [17:25:17<4:41:07, 21.43s/it] +2025-05-11 10:51:33 - ERROR - stderr - 79%|███████▉ | 2955/3741 [17:25:39<4:41:48, 21.51s/it] +2025-05-11 10:51:33 - ERROR - stderr - +2025-05-11 10:51:33 - ERROR - stderr - +2025-05-11 10:51:33 - INFO - stdout - {'loss': 0.4887, 'grad_norm': 0.8500027656555176, 'learning_rate': 2.228177100269573e-06, 'epoch': 2.37} +2025-05-11 10:51:33 - ERROR - stderr - 79%|███████▉ | 2955/3741 [17:25:39<4:41:48, 21.51s/it] +2025-05-11 10:51:52 - ERROR - stderr - 79%|███████▉ | 2956/3741 [17:25:58<4:33:38, 20.92s/it] +2025-05-11 10:51:52 - ERROR - stderr - +2025-05-11 10:51:52 - ERROR - stderr - +2025-05-11 10:51:52 - INFO - stdout - {'loss': 0.495, 'grad_norm': 0.8315219879150391, 'learning_rate': 2.22273093157809e-06, 'epoch': 2.37} +2025-05-11 10:51:52 - ERROR - stderr - 79%|███████▉ | 2956/3741 [17:25:58<4:33:38, 20.92s/it] +2025-05-11 10:52:14 - ERROR - stderr - 79%|███████▉ | 2957/3741 [17:26:20<4:37:21, 21.23s/it] +2025-05-11 10:52:14 - ERROR - stderr - +2025-05-11 10:52:14 - ERROR - stderr - +2025-05-11 10:52:14 - INFO - stdout - {'loss': 0.4768, 'grad_norm': 0.8728612065315247, 'learning_rate': 2.217290594547822e-06, 'epoch': 2.37} +2025-05-11 10:52:14 - ERROR - stderr - 79%|███████▉ | 2957/3741 [17:26:20<4:37:21, 21.23s/it] +2025-05-11 10:52:33 - ERROR - stderr - 79%|███████▉ | 2958/3741 [17:26:40<4:29:52, 20.68s/it] +2025-05-11 10:52:33 - ERROR - stderr - +2025-05-11 10:52:33 - ERROR - stderr - +2025-05-11 10:52:33 - INFO - stdout - {'loss': 0.4768, 'grad_norm': 0.8619360327720642, 'learning_rate': 2.2118560932581123e-06, 'epoch': 2.37} +2025-05-11 10:52:33 - ERROR - stderr - 79%|███████▉ | 2958/3741 [17:26:40<4:29:52, 20.68s/it] +2025-05-11 10:52:56 - ERROR - stderr - 79%|███████▉ | 2959/3741 [17:27:02<4:37:36, 21.30s/it] +2025-05-11 10:52:56 - ERROR - stderr - +2025-05-11 10:52:56 - ERROR - stderr - +2025-05-11 10:52:56 - INFO - stdout - {'loss': 0.4638, 'grad_norm': 0.8790128231048584, 'learning_rate': 2.2064274317839394e-06, 'epoch': 2.37} +2025-05-11 10:52:56 - ERROR - stderr - 79%|███████▉ | 2959/3741 [17:27:02<4:37:36, 21.30s/it] +2025-05-11 10:53:16 - ERROR - stderr - 79%|███████▉ | 2960/3741 [17:27:22<4:31:41, 20.87s/it] +2025-05-11 10:53:16 - ERROR - stderr - +2025-05-11 10:53:16 - ERROR - stderr - +2025-05-11 10:53:16 - INFO - stdout - {'loss': 0.4762, 'grad_norm': 0.8765968680381775, 'learning_rate': 2.2010046141958973e-06, 'epoch': 2.37} +2025-05-11 10:53:16 - ERROR - stderr - 79%|███████▉ | 2960/3741 [17:27:22<4:31:41, 20.87s/it] +2025-05-11 10:53:39 - ERROR - stderr - 79%|███████▉ | 2961/3741 [17:27:45<4:38:59, 21.46s/it] +2025-05-11 10:53:39 - ERROR - stderr - +2025-05-11 10:53:39 - ERROR - stderr - +2025-05-11 10:53:39 - INFO - stdout - {'loss': 0.4694, 'grad_norm': 0.790704607963562, 'learning_rate': 2.1955876445602008e-06, 'epoch': 2.37} +2025-05-11 10:53:39 - ERROR - stderr - 79%|███████▉ | 2961/3741 [17:27:45<4:38:59, 21.46s/it] +2025-05-11 10:53:58 - ERROR - stderr - 79%|███████▉ | 2962/3741 [17:28:05<4:31:27, 20.91s/it] +2025-05-11 10:53:58 - ERROR - stderr - +2025-05-11 10:53:58 - ERROR - stderr - +2025-05-11 10:53:58 - INFO - stdout - {'loss': 0.4642, 'grad_norm': 0.8622111082077026, 'learning_rate': 2.190176526938679e-06, 'epoch': 2.38} +2025-05-11 10:53:58 - ERROR - stderr - 79%|███████▉ | 2962/3741 [17:28:05<4:31:27, 20.91s/it] +2025-05-11 10:54:21 - ERROR - stderr - 79%|███████▉ | 2963/3741 [17:28:27<4:37:23, 21.39s/it] +2025-05-11 10:54:21 - ERROR - stderr - +2025-05-11 10:54:21 - ERROR - stderr - +2025-05-11 10:54:21 - INFO - stdout - {'loss': 0.4809, 'grad_norm': 0.8704236149787903, 'learning_rate': 2.1847712653887687e-06, 'epoch': 2.38} +2025-05-11 10:54:21 - ERROR - stderr - 79%|███████▉ | 2963/3741 [17:28:27<4:37:23, 21.39s/it] +2025-05-11 10:54:41 - ERROR - stderr - 79%|███████▉ | 2964/3741 [17:28:47<4:29:57, 20.85s/it] +2025-05-11 10:54:41 - ERROR - stderr - +2025-05-11 10:54:41 - ERROR - stderr - +2025-05-11 10:54:41 - INFO - stdout - {'loss': 0.5033, 'grad_norm': 0.8916087746620178, 'learning_rate': 2.17937186396352e-06, 'epoch': 2.38} +2025-05-11 10:54:41 - ERROR - stderr - 79%|███████▉ | 2964/3741 [17:28:47<4:29:57, 20.85s/it] +2025-05-11 10:55:03 - ERROR - stderr - 79%|███████▉ | 2965/3741 [17:29:09<4:34:58, 21.26s/it] +2025-05-11 10:55:03 - ERROR - stderr - +2025-05-11 10:55:03 - ERROR - stderr - +2025-05-11 10:55:03 - INFO - stdout - {'loss': 0.5026, 'grad_norm': 0.8955850601196289, 'learning_rate': 2.1739783267115888e-06, 'epoch': 2.38} +2025-05-11 10:55:03 - ERROR - stderr - 79%|███████▉ | 2965/3741 [17:29:09<4:34:58, 21.26s/it] +2025-05-11 10:55:22 - ERROR - stderr - 79%|███████▉ | 2966/3741 [17:29:29<4:27:54, 20.74s/it] +2025-05-11 10:55:22 - ERROR - stderr - +2025-05-11 10:55:22 - ERROR - stderr - +2025-05-11 10:55:22 - INFO - stdout - {'loss': 0.4896, 'grad_norm': 0.860222339630127, 'learning_rate': 2.1685906576772365e-06, 'epoch': 2.38} +2025-05-11 10:55:22 - ERROR - stderr - 79%|███████▉ | 2966/3741 [17:29:29<4:27:54, 20.74s/it] +2025-05-11 10:55:46 - ERROR - stderr - 79%|███████▉ | 2967/3741 [17:29:52<4:37:30, 21.51s/it] +2025-05-11 10:55:46 - ERROR - stderr - +2025-05-11 10:55:46 - ERROR - stderr - +2025-05-11 10:55:46 - INFO - stdout - {'loss': 0.4615, 'grad_norm': 0.8153157234191895, 'learning_rate': 2.1632088609003133e-06, 'epoch': 2.38} +2025-05-11 10:55:46 - ERROR - stderr - 79%|███████▉ | 2967/3741 [17:29:52<4:37:30, 21.51s/it] +2025-05-11 10:56:05 - ERROR - stderr - 79%|███████▉ | 2968/3741 [17:30:12<4:30:31, 21.00s/it] +2025-05-11 10:56:05 - ERROR - stderr - +2025-05-11 10:56:05 - ERROR - stderr - +2025-05-11 10:56:05 - INFO - stdout - {'loss': 0.4947, 'grad_norm': 0.8858957290649414, 'learning_rate': 2.157832940416279e-06, 'epoch': 2.38} +2025-05-11 10:56:05 - ERROR - stderr - 79%|███████▉ | 2968/3741 [17:30:12<4:30:31, 21.00s/it] +2025-05-11 10:56:28 - ERROR - stderr - 79%|███████▉ | 2969/3741 [17:30:35<4:37:21, 21.56s/it] +2025-05-11 10:56:28 - ERROR - stderr - +2025-05-11 10:56:28 - ERROR - stderr - +2025-05-11 10:56:28 - INFO - stdout - {'loss': 0.4623, 'grad_norm': 0.8330368995666504, 'learning_rate': 2.1524629002561803e-06, 'epoch': 2.38} +2025-05-11 10:56:28 - ERROR - stderr - 79%|███████▉ | 2969/3741 [17:30:35<4:37:21, 21.56s/it] +2025-05-11 10:56:48 - ERROR - stderr - 79%|███████▉ | 2970/3741 [17:30:54<4:29:20, 20.96s/it] +2025-05-11 10:56:48 - ERROR - stderr - +2025-05-11 10:56:48 - ERROR - stderr - +2025-05-11 10:56:48 - INFO - stdout - {'loss': 0.4907, 'grad_norm': 0.9089009165763855, 'learning_rate': 2.1470987444466564e-06, 'epoch': 2.38} +2025-05-11 10:56:48 - ERROR - stderr - 79%|███████▉ | 2970/3741 [17:30:54<4:29:20, 20.96s/it] +2025-05-11 10:57:10 - ERROR - stderr - 79%|███████▉ | 2971/3741 [17:31:16<4:32:28, 21.23s/it] +2025-05-11 10:57:10 - ERROR - stderr - +2025-05-11 10:57:10 - ERROR - stderr - +2025-05-11 10:57:10 - INFO - stdout - {'loss': 0.4827, 'grad_norm': 0.805916965007782, 'learning_rate': 2.141740477009937e-06, 'epoch': 2.38} +2025-05-11 10:57:10 - ERROR - stderr - 79%|███████▉ | 2971/3741 [17:31:16<4:32:28, 21.23s/it] +2025-05-11 10:57:29 - ERROR - stderr - 79%|███████▉ | 2972/3741 [17:31:35<4:25:13, 20.69s/it] +2025-05-11 10:57:29 - ERROR - stderr - +2025-05-11 10:57:29 - ERROR - stderr - +2025-05-11 10:57:29 - INFO - stdout - {'loss': 0.4669, 'grad_norm': 0.85927814245224, 'learning_rate': 2.1363881019638277e-06, 'epoch': 2.38} +2025-05-11 10:57:29 - ERROR - stderr - 79%|███████▉ | 2972/3741 [17:31:35<4:25:13, 20.69s/it] +2025-05-11 10:57:52 - ERROR - stderr - 79%|███████▉ | 2973/3741 [17:31:58<4:33:00, 21.33s/it] +2025-05-11 10:57:52 - ERROR - stderr - +2025-05-11 10:57:52 - ERROR - stderr - +2025-05-11 10:57:52 - INFO - stdout - {'loss': 0.5256, 'grad_norm': 0.9052659273147583, 'learning_rate': 2.1310416233217246e-06, 'epoch': 2.38} +2025-05-11 10:57:52 - ERROR - stderr - 79%|███████▉ | 2973/3741 [17:31:58<4:33:00, 21.33s/it] +2025-05-11 10:58:12 - ERROR - stderr - 79%|███████▉ | 2974/3741 [17:32:18<4:26:19, 20.83s/it] +2025-05-11 10:58:12 - ERROR - stderr - +2025-05-11 10:58:12 - ERROR - stderr - +2025-05-11 10:58:12 - INFO - stdout - {'loss': 0.4947, 'grad_norm': 0.8463844656944275, 'learning_rate': 2.1257010450926e-06, 'epoch': 2.38} +2025-05-11 10:58:12 - ERROR - stderr - 79%|███████▉ | 2974/3741 [17:32:18<4:26:19, 20.83s/it] +2025-05-11 10:58:35 - ERROR - stderr - 80%|███████▉ | 2975/3741 [17:32:41<4:34:49, 21.53s/it] +2025-05-11 10:58:35 - ERROR - stderr - +2025-05-11 10:58:35 - ERROR - stderr - +2025-05-11 10:58:35 - INFO - stdout - {'loss': 0.4768, 'grad_norm': 0.8669213652610779, 'learning_rate': 2.1203663712809995e-06, 'epoch': 2.39} +2025-05-11 10:58:35 - ERROR - stderr - 80%|███████▉ | 2975/3741 [17:32:41<4:34:49, 21.53s/it] +2025-05-11 10:58:54 - ERROR - stderr - 80%|███████▉ | 2976/3741 [17:33:01<4:26:41, 20.92s/it] +2025-05-11 10:58:54 - ERROR - stderr - +2025-05-11 10:58:54 - ERROR - stderr - +2025-05-11 10:58:54 - INFO - stdout - {'loss': 0.5004, 'grad_norm': 0.8719237446784973, 'learning_rate': 2.115037605887048e-06, 'epoch': 2.39} +2025-05-11 10:58:54 - ERROR - stderr - 80%|███████▉ | 2976/3741 [17:33:01<4:26:41, 20.92s/it] +2025-05-11 10:59:15 - ERROR - stderr - 80%|███████▉ | 2977/3741 [17:33:22<4:26:40, 20.94s/it] +2025-05-11 10:59:15 - ERROR - stderr - +2025-05-11 10:59:15 - ERROR - stderr - +2025-05-11 10:59:15 - INFO - stdout - {'loss': 0.4877, 'grad_norm': 0.8576990365982056, 'learning_rate': 2.1097147529064286e-06, 'epoch': 2.39} +2025-05-11 10:59:15 - ERROR - stderr - 80%|███████▉ | 2977/3741 [17:33:22<4:26:40, 20.94s/it] +2025-05-11 10:59:35 - ERROR - stderr - 80%|███████▉ | 2978/3741 [17:33:41<4:21:33, 20.57s/it] +2025-05-11 10:59:35 - ERROR - stderr - +2025-05-11 10:59:35 - ERROR - stderr - +2025-05-11 10:59:35 - INFO - stdout - {'loss': 0.4715, 'grad_norm': 0.8337488770484924, 'learning_rate': 2.104397816330401e-06, 'epoch': 2.39} +2025-05-11 10:59:35 - ERROR - stderr - 80%|███████▉ | 2978/3741 [17:33:41<4:21:33, 20.57s/it] +2025-05-11 10:59:55 - ERROR - stderr - 80%|███████▉ | 2979/3741 [17:34:01<4:17:58, 20.31s/it] +2025-05-11 10:59:55 - ERROR - stderr - +2025-05-11 10:59:55 - ERROR - stderr - +2025-05-11 10:59:55 - INFO - stdout - {'loss': 0.4819, 'grad_norm': 0.8252597451210022, 'learning_rate': 2.0990868001457853e-06, 'epoch': 2.39} +2025-05-11 10:59:55 - ERROR - stderr - 80%|███████▉ | 2979/3741 [17:34:01<4:17:58, 20.31s/it] +2025-05-11 11:00:15 - ERROR - stderr - 80%|███████▉ | 2980/3741 [17:34:21<4:16:38, 20.23s/it] +2025-05-11 11:00:15 - ERROR - stderr - +2025-05-11 11:00:15 - ERROR - stderr - +2025-05-11 11:00:15 - INFO - stdout - {'loss': 0.4702, 'grad_norm': 0.8549776077270508, 'learning_rate': 2.093781708334962e-06, 'epoch': 2.39} +2025-05-11 11:00:15 - ERROR - stderr - 80%|███████▉ | 2980/3741 [17:34:21<4:16:38, 20.23s/it] +2025-05-11 11:00:34 - ERROR - stderr - 80%|███████▉ | 2981/3741 [17:34:41<4:14:11, 20.07s/it] +2025-05-11 11:00:34 - ERROR - stderr - +2025-05-11 11:00:34 - ERROR - stderr - +2025-05-11 11:00:34 - INFO - stdout - {'loss': 0.4758, 'grad_norm': 0.8355644345283508, 'learning_rate': 2.088482544875873e-06, 'epoch': 2.39} +2025-05-11 11:00:34 - ERROR - stderr - 80%|███████▉ | 2981/3741 [17:34:41<4:14:11, 20.07s/it] +2025-05-11 11:00:56 - ERROR - stderr - 80%|███████▉ | 2982/3741 [17:35:02<4:17:45, 20.38s/it] +2025-05-11 11:00:56 - ERROR - stderr - +2025-05-11 11:00:56 - ERROR - stderr - +2025-05-11 11:00:56 - INFO - stdout - {'loss': 0.4998, 'grad_norm': 0.857735276222229, 'learning_rate': 2.0831893137420046e-06, 'epoch': 2.39} +2025-05-11 11:00:56 - ERROR - stderr - 80%|███████▉ | 2982/3741 [17:35:02<4:17:45, 20.38s/it] +2025-05-11 11:01:15 - ERROR - stderr - 80%|███████▉ | 2983/3741 [17:35:22<4:15:14, 20.20s/it] +2025-05-11 11:01:15 - ERROR - stderr - +2025-05-11 11:01:15 - ERROR - stderr - +2025-05-11 11:01:15 - INFO - stdout - {'loss': 0.475, 'grad_norm': 0.9026484489440918, 'learning_rate': 2.077902018902407e-06, 'epoch': 2.39} +2025-05-11 11:01:15 - ERROR - stderr - 80%|███████▉ | 2983/3741 [17:35:22<4:15:14, 20.20s/it] +2025-05-11 11:01:37 - ERROR - stderr - 80%|███████▉ | 2984/3741 [17:35:44<4:21:43, 20.74s/it] +2025-05-11 11:01:37 - ERROR - stderr - +2025-05-11 11:01:37 - ERROR - stderr - +2025-05-11 11:01:37 - INFO - stdout - {'loss': 0.4798, 'grad_norm': 0.8243964314460754, 'learning_rate': 2.072620664321674e-06, 'epoch': 2.39} +2025-05-11 11:01:37 - ERROR - stderr - 80%|███████▉ | 2984/3741 [17:35:44<4:21:43, 20.74s/it] +2025-05-11 11:01:57 - ERROR - stderr - 80%|███████▉ | 2985/3741 [17:36:03<4:16:33, 20.36s/it] +2025-05-11 11:01:57 - ERROR - stderr - +2025-05-11 11:01:57 - ERROR - stderr - +2025-05-11 11:01:57 - INFO - stdout - {'loss': 0.4483, 'grad_norm': 0.8589446544647217, 'learning_rate': 2.067345253959938e-06, 'epoch': 2.39} +2025-05-11 11:01:57 - ERROR - stderr - 80%|███████▉ | 2985/3741 [17:36:03<4:16:33, 20.36s/it] +2025-05-11 11:02:19 - ERROR - stderr - 80%|███████▉ | 2986/3741 [17:36:25<4:23:10, 20.92s/it] +2025-05-11 11:02:19 - ERROR - stderr - +2025-05-11 11:02:19 - ERROR - stderr - +2025-05-11 11:02:19 - INFO - stdout - {'loss': 0.4368, 'grad_norm': 0.8188499212265015, 'learning_rate': 2.0620757917728927e-06, 'epoch': 2.39} +2025-05-11 11:02:19 - ERROR - stderr - 80%|███████▉ | 2986/3741 [17:36:25<4:23:10, 20.92s/it] +2025-05-11 11:02:39 - ERROR - stderr - 80%|███████▉ | 2987/3741 [17:36:46<4:20:14, 20.71s/it] +2025-05-11 11:02:39 - ERROR - stderr - +2025-05-11 11:02:39 - ERROR - stderr - +2025-05-11 11:02:39 - INFO - stdout - {'loss': 0.4714, 'grad_norm': 0.8684476613998413, 'learning_rate': 2.0568122817117507e-06, 'epoch': 2.4} +2025-05-11 11:02:39 - ERROR - stderr - 80%|███████▉ | 2987/3741 [17:36:46<4:20:14, 20.71s/it] +2025-05-11 11:03:02 - ERROR - stderr - 80%|███████▉ | 2988/3741 [17:37:08<4:27:56, 21.35s/it] +2025-05-11 11:03:02 - ERROR - stderr - +2025-05-11 11:03:02 - ERROR - stderr - +2025-05-11 11:03:02 - INFO - stdout - {'loss': 0.4778, 'grad_norm': 0.8384298086166382, 'learning_rate': 2.051554727723276e-06, 'epoch': 2.4} +2025-05-11 11:03:02 - ERROR - stderr - 80%|███████▉ | 2988/3741 [17:37:08<4:27:56, 21.35s/it] +2025-05-11 11:03:22 - ERROR - stderr - 80%|███████▉ | 2989/3741 [17:37:28<4:21:31, 20.87s/it] +2025-05-11 11:03:22 - ERROR - stderr - +2025-05-11 11:03:22 - ERROR - stderr - +2025-05-11 11:03:22 - INFO - stdout - {'loss': 0.488, 'grad_norm': 0.872604250907898, 'learning_rate': 2.046303133749764e-06, 'epoch': 2.4} +2025-05-11 11:03:22 - ERROR - stderr - 80%|███████▉ | 2989/3741 [17:37:28<4:21:31, 20.87s/it] +2025-05-11 11:03:45 - ERROR - stderr - 80%|███████▉ | 2990/3741 [17:37:51<4:30:08, 21.58s/it] +2025-05-11 11:03:45 - ERROR - stderr - +2025-05-11 11:03:45 - ERROR - stderr - +2025-05-11 11:03:45 - INFO - stdout - {'loss': 0.4955, 'grad_norm': 0.9264422655105591, 'learning_rate': 2.041057503729028e-06, 'epoch': 2.4} +2025-05-11 11:03:45 - ERROR - stderr - 80%|███████▉ | 2990/3741 [17:37:51<4:30:08, 21.58s/it] +2025-05-11 11:04:05 - ERROR - stderr - 80%|███████▉ | 2991/3741 [17:38:11<4:24:08, 21.13s/it] +2025-05-11 11:04:05 - ERROR - stderr - +2025-05-11 11:04:05 - ERROR - stderr - +2025-05-11 11:04:05 - INFO - stdout - {'loss': 0.4809, 'grad_norm': 0.8388825058937073, 'learning_rate': 2.035817841594434e-06, 'epoch': 2.4} +2025-05-11 11:04:05 - ERROR - stderr - 80%|███████▉ | 2991/3741 [17:38:11<4:24:08, 21.13s/it] +2025-05-11 11:04:28 - ERROR - stderr - 80%|███████▉ | 2992/3741 [17:38:34<4:30:44, 21.69s/it] +2025-05-11 11:04:28 - ERROR - stderr - +2025-05-11 11:04:28 - ERROR - stderr - +2025-05-11 11:04:28 - INFO - stdout - {'loss': 0.4903, 'grad_norm': 0.9164281487464905, 'learning_rate': 2.0305841512748494e-06, 'epoch': 2.4} +2025-05-11 11:04:28 - ERROR - stderr - 80%|███████▉ | 2992/3741 [17:38:34<4:30:44, 21.69s/it] +2025-05-11 11:04:48 - ERROR - stderr - 80%|████████ | 2993/3741 [17:38:54<4:23:19, 21.12s/it] +2025-05-11 11:04:48 - ERROR - stderr - +2025-05-11 11:04:48 - ERROR - stderr - +2025-05-11 11:04:48 - INFO - stdout - {'loss': 0.5062, 'grad_norm': 0.9148788452148438, 'learning_rate': 2.0253564366946764e-06, 'epoch': 2.4} +2025-05-11 11:04:48 - ERROR - stderr - 80%|████████ | 2993/3741 [17:38:54<4:23:19, 21.12s/it] +2025-05-11 11:05:11 - ERROR - stderr - 80%|████████ | 2994/3741 [17:39:17<4:29:30, 21.65s/it] +2025-05-11 11:05:11 - ERROR - stderr - +2025-05-11 11:05:11 - ERROR - stderr - +2025-05-11 11:05:11 - INFO - stdout - {'loss': 0.4907, 'grad_norm': 0.8578090071678162, 'learning_rate': 2.020134701773836e-06, 'epoch': 2.4} +2025-05-11 11:05:11 - ERROR - stderr - 80%|████████ | 2994/3741 [17:39:17<4:29:30, 21.65s/it] +2025-05-11 11:05:31 - ERROR - stderr - 80%|████████ | 2995/3741 [17:39:37<4:22:23, 21.10s/it] +2025-05-11 11:05:31 - ERROR - stderr - +2025-05-11 11:05:31 - ERROR - stderr - +2025-05-11 11:05:31 - INFO - stdout - {'loss': 0.4526, 'grad_norm': 0.8483836650848389, 'learning_rate': 2.0149189504277553e-06, 'epoch': 2.4} +2025-05-11 11:05:31 - ERROR - stderr - 80%|████████ | 2995/3741 [17:39:37<4:22:23, 21.10s/it] +2025-05-11 11:05:53 - ERROR - stderr - 80%|████████ | 2996/3741 [17:40:00<4:28:28, 21.62s/it] +2025-05-11 11:05:53 - ERROR - stderr - +2025-05-11 11:05:53 - ERROR - stderr - +2025-05-11 11:05:53 - INFO - stdout - {'loss': 0.468, 'grad_norm': 0.8625277280807495, 'learning_rate': 2.0097091865673923e-06, 'epoch': 2.4} +2025-05-11 11:05:53 - ERROR - stderr - 80%|████████ | 2996/3741 [17:40:00<4:28:28, 21.62s/it] +2025-05-11 11:06:13 - ERROR - stderr - 80%|████████ | 2997/3741 [17:40:20<4:21:57, 21.13s/it] +2025-05-11 11:06:13 - ERROR - stderr - +2025-05-11 11:06:13 - ERROR - stderr - +2025-05-11 11:06:13 - INFO - stdout - {'loss': 0.4907, 'grad_norm': 0.8608683347702026, 'learning_rate': 2.0045054140992002e-06, 'epoch': 2.4} +2025-05-11 11:06:13 - ERROR - stderr - 80%|████████ | 2997/3741 [17:40:20<4:21:57, 21.13s/it] +2025-05-11 11:06:36 - ERROR - stderr - 80%|████████ | 2998/3741 [17:40:43<4:28:26, 21.68s/it] +2025-05-11 11:06:36 - ERROR - stderr - +2025-05-11 11:06:36 - ERROR - stderr - +2025-05-11 11:06:36 - INFO - stdout - {'loss': 0.4727, 'grad_norm': 0.83229660987854, 'learning_rate': 1.9993076369251406e-06, 'epoch': 2.4} +2025-05-11 11:06:36 - ERROR - stderr - 80%|████████ | 2998/3741 [17:40:43<4:28:26, 21.68s/it] +2025-05-11 11:06:56 - ERROR - stderr - 80%|████████ | 2999/3741 [17:41:03<4:21:49, 21.17s/it] +2025-05-11 11:06:56 - ERROR - stderr - +2025-05-11 11:06:56 - ERROR - stderr - +2025-05-11 11:06:56 - INFO - stdout - {'loss': 0.472, 'grad_norm': 0.8340911865234375, 'learning_rate': 1.9941158589426924e-06, 'epoch': 2.4} +2025-05-11 11:06:56 - ERROR - stderr - 80%|████████ | 2999/3741 [17:41:03<4:21:49, 21.17s/it] +2025-05-11 11:07:20 - ERROR - stderr - 80%|████████ | 3000/3741 [17:41:26<4:29:45, 21.84s/it] +2025-05-11 11:07:20 - ERROR - stderr - +2025-05-11 11:07:20 - ERROR - stderr - +2025-05-11 11:07:20 - INFO - stdout - {'loss': 0.4952, 'grad_norm': 0.9387159943580627, 'learning_rate': 1.9889300840448224e-06, 'epoch': 2.41} +2025-05-11 11:07:20 - ERROR - stderr - 80%|████████ | 3000/3741 [17:41:26<4:29:45, 21.84s/it] +2025-05-11 11:07:40 - ERROR - stderr - 80%|████████ | 3001/3741 [17:41:46<4:23:23, 21.36s/it] +2025-05-11 11:07:40 - ERROR - stderr - +2025-05-11 11:07:40 - ERROR - stderr - +2025-05-11 11:07:40 - INFO - stdout - {'loss': 0.4659, 'grad_norm': 0.8406565189361572, 'learning_rate': 1.98375031612e-06, 'epoch': 2.41} +2025-05-11 11:07:40 - ERROR - stderr - 80%|████████ | 3001/3741 [17:41:46<4:23:23, 21.36s/it] +2025-05-11 11:08:03 - ERROR - stderr - 80%|████████ | 3002/3741 [17:42:09<4:28:28, 21.80s/it] +2025-05-11 11:08:03 - ERROR - stderr - +2025-05-11 11:08:03 - ERROR - stderr - +2025-05-11 11:08:03 - INFO - stdout - {'loss': 0.4815, 'grad_norm': 0.9237212538719177, 'learning_rate': 1.9785765590521978e-06, 'epoch': 2.41} +2025-05-11 11:08:03 - ERROR - stderr - 80%|████████ | 3002/3741 [17:42:09<4:28:28, 21.80s/it] +2025-05-11 11:08:23 - ERROR - stderr - 80%|████████ | 3003/3741 [17:42:30<4:23:08, 21.39s/it] +2025-05-11 11:08:23 - ERROR - stderr - +2025-05-11 11:08:23 - ERROR - stderr - +2025-05-11 11:08:23 - INFO - stdout - {'loss': 0.4842, 'grad_norm': 0.8607485294342041, 'learning_rate': 1.9734088167208664e-06, 'epoch': 2.41} +2025-05-11 11:08:23 - ERROR - stderr - 80%|████████ | 3003/3741 [17:42:30<4:23:08, 21.39s/it] +2025-05-11 11:08:44 - ERROR - stderr - 80%|████████ | 3004/3741 [17:42:50<4:19:13, 21.10s/it] +2025-05-11 11:08:44 - ERROR - stderr - +2025-05-11 11:08:44 - ERROR - stderr - +2025-05-11 11:08:44 - INFO - stdout - {'loss': 0.4651, 'grad_norm': 0.8381433486938477, 'learning_rate': 1.968247093000963e-06, 'epoch': 2.41} +2025-05-11 11:08:44 - ERROR - stderr - 80%|████████ | 3004/3741 [17:42:50<4:19:13, 21.10s/it] +2025-05-11 11:09:04 - ERROR - stderr - 80%|████████ | 3005/3741 [17:43:10<4:15:53, 20.86s/it] +2025-05-11 11:09:04 - ERROR - stderr - +2025-05-11 11:09:04 - ERROR - stderr - +2025-05-11 11:09:04 - INFO - stdout - {'loss': 0.484, 'grad_norm': 0.8409081697463989, 'learning_rate': 1.96309139176292e-06, 'epoch': 2.41} +2025-05-11 11:09:04 - ERROR - stderr - 80%|████████ | 3005/3741 [17:43:10<4:15:53, 20.86s/it] +2025-05-11 11:09:25 - ERROR - stderr - 80%|████████ | 3006/3741 [17:43:31<4:14:23, 20.77s/it] +2025-05-11 11:09:25 - ERROR - stderr - +2025-05-11 11:09:25 - ERROR - stderr - +2025-05-11 11:09:25 - INFO - stdout - {'loss': 0.4851, 'grad_norm': 0.8303772807121277, 'learning_rate': 1.9579417168726566e-06, 'epoch': 2.41} +2025-05-11 11:09:25 - ERROR - stderr - 80%|████████ | 3006/3741 [17:43:31<4:14:23, 20.77s/it] +2025-05-11 11:09:45 - ERROR - stderr - 80%|████████ | 3007/3741 [17:43:51<4:13:06, 20.69s/it] +2025-05-11 11:09:45 - ERROR - stderr - +2025-05-11 11:09:45 - ERROR - stderr - +2025-05-11 11:09:45 - INFO - stdout - {'loss': 0.4631, 'grad_norm': 0.8264473676681519, 'learning_rate': 1.9527980721915798e-06, 'epoch': 2.41} +2025-05-11 11:09:45 - ERROR - stderr - 80%|████████ | 3007/3741 [17:43:51<4:13:06, 20.69s/it] +2025-05-11 11:10:05 - ERROR - stderr - 80%|████████ | 3008/3741 [17:44:12<4:11:20, 20.57s/it] +2025-05-11 11:10:06 - ERROR - stderr - +2025-05-11 11:10:06 - ERROR - stderr - +2025-05-11 11:10:06 - INFO - stdout - {'loss': 0.4722, 'grad_norm': 0.8889646530151367, 'learning_rate': 1.9476604615765605e-06, 'epoch': 2.41} +2025-05-11 11:10:06 - ERROR - stderr - 80%|████████ | 3008/3741 [17:44:12<4:11:20, 20.57s/it] +2025-05-11 11:10:25 - ERROR - stderr - 80%|████████ | 3009/3741 [17:44:31<4:07:34, 20.29s/it] +2025-05-11 11:10:25 - ERROR - stderr - +2025-05-11 11:10:25 - ERROR - stderr - +2025-05-11 11:10:25 - INFO - stdout - {'loss': 0.4926, 'grad_norm': 0.8740735650062561, 'learning_rate': 1.942528888879964e-06, 'epoch': 2.41} +2025-05-11 11:10:25 - ERROR - stderr - 80%|████████ | 3009/3741 [17:44:31<4:07:34, 20.29s/it] +2025-05-11 11:10:45 - ERROR - stderr - 80%|████████ | 3010/3741 [17:44:51<4:05:24, 20.14s/it] +2025-05-11 11:10:45 - ERROR - stderr - +2025-05-11 11:10:45 - ERROR - stderr - +2025-05-11 11:10:45 - INFO - stdout - {'loss': 0.4663, 'grad_norm': 0.835271418094635, 'learning_rate': 1.937403357949611e-06, 'epoch': 2.41} +2025-05-11 11:10:45 - ERROR - stderr - 80%|████████ | 3010/3741 [17:44:51<4:05:24, 20.14s/it] +2025-05-11 11:11:07 - ERROR - stderr - 80%|████████ | 3011/3741 [17:45:14<4:13:29, 20.83s/it] +2025-05-11 11:11:07 - ERROR - stderr - +2025-05-11 11:11:07 - ERROR - stderr - +2025-05-11 11:11:07 - INFO - stdout - {'loss': 0.4773, 'grad_norm': 0.8700141310691833, 'learning_rate': 1.932283872628803e-06, 'epoch': 2.41} +2025-05-11 11:11:07 - ERROR - stderr - 80%|████████ | 3011/3741 [17:45:14<4:13:29, 20.83s/it] +2025-05-11 11:11:27 - ERROR - stderr - 81%|████████ | 3012/3741 [17:45:33<4:09:12, 20.51s/it] +2025-05-11 11:11:27 - ERROR - stderr - +2025-05-11 11:11:27 - ERROR - stderr - +2025-05-11 11:11:27 - INFO - stdout - {'loss': 0.4952, 'grad_norm': 0.8676986694335938, 'learning_rate': 1.927170436756305e-06, 'epoch': 2.42} +2025-05-11 11:11:27 - ERROR - stderr - 81%|████████ | 3012/3741 [17:45:33<4:09:12, 20.51s/it] +2025-05-11 11:11:50 - ERROR - stderr - 81%|████████ | 3013/3741 [17:45:57<4:18:46, 21.33s/it] +2025-05-11 11:11:50 - ERROR - stderr - +2025-05-11 11:11:50 - ERROR - stderr - +2025-05-11 11:11:50 - INFO - stdout - {'loss': 0.4806, 'grad_norm': 0.902349591255188, 'learning_rate': 1.922063054166341e-06, 'epoch': 2.42} +2025-05-11 11:11:50 - ERROR - stderr - 81%|████████ | 3013/3741 [17:45:57<4:18:46, 21.33s/it] +2025-05-11 11:12:10 - ERROR - stderr - 81%|████████ | 3014/3741 [17:46:17<4:13:55, 20.96s/it] +2025-05-11 11:12:10 - ERROR - stderr - +2025-05-11 11:12:10 - ERROR - stderr - +2025-05-11 11:12:10 - INFO - stdout - {'loss': 0.4784, 'grad_norm': 0.8744420409202576, 'learning_rate': 1.916961728688603e-06, 'epoch': 2.42} +2025-05-11 11:12:10 - ERROR - stderr - 81%|████████ | 3014/3741 [17:46:17<4:13:55, 20.96s/it] +2025-05-11 11:12:32 - ERROR - stderr - 81%|████████ | 3015/3741 [17:46:39<4:17:20, 21.27s/it] +2025-05-11 11:12:32 - ERROR - stderr - +2025-05-11 11:12:32 - ERROR - stderr - +2025-05-11 11:12:32 - INFO - stdout - {'loss': 0.4974, 'grad_norm': 0.8729916214942932, 'learning_rate': 1.9118664641482386e-06, 'epoch': 2.42} +2025-05-11 11:12:32 - ERROR - stderr - 81%|████████ | 3015/3741 [17:46:39<4:17:20, 21.27s/it] +2025-05-11 11:12:53 - ERROR - stderr - 81%|████████ | 3016/3741 [17:46:59<4:13:14, 20.96s/it] +2025-05-11 11:12:53 - ERROR - stderr - +2025-05-11 11:12:53 - ERROR - stderr - +2025-05-11 11:12:53 - INFO - stdout - {'loss': 0.487, 'grad_norm': 0.9082834720611572, 'learning_rate': 1.9067772643658511e-06, 'epoch': 2.42} +2025-05-11 11:12:53 - ERROR - stderr - 81%|████████ | 3016/3741 [17:46:59<4:13:14, 20.96s/it] +2025-05-11 11:13:16 - ERROR - stderr - 81%|████████ | 3017/3741 [17:47:22<4:22:20, 21.74s/it] +2025-05-11 11:13:16 - ERROR - stderr - +2025-05-11 11:13:16 - ERROR - stderr - +2025-05-11 11:13:16 - INFO - stdout - {'loss': 0.4468, 'grad_norm': 0.8420137166976929, 'learning_rate': 1.901694133157499e-06, 'epoch': 2.42} +2025-05-11 11:13:16 - ERROR - stderr - 81%|████████ | 3017/3741 [17:47:22<4:22:20, 21.74s/it] +2025-05-11 11:13:36 - ERROR - stderr - 81%|████████ | 3018/3741 [17:47:42<4:15:05, 21.17s/it] +2025-05-11 11:13:36 - ERROR - stderr - +2025-05-11 11:13:36 - ERROR - stderr - +2025-05-11 11:13:36 - INFO - stdout - {'loss': 0.4624, 'grad_norm': 0.8739819526672363, 'learning_rate': 1.896617074334679e-06, 'epoch': 2.42} +2025-05-11 11:13:36 - ERROR - stderr - 81%|████████ | 3018/3741 [17:47:42<4:15:05, 21.17s/it] +2025-05-11 11:13:59 - ERROR - stderr - 81%|████████ | 3019/3741 [17:48:06<4:22:45, 21.84s/it] +2025-05-11 11:13:59 - ERROR - stderr - +2025-05-11 11:13:59 - ERROR - stderr - +2025-05-11 11:13:59 - INFO - stdout - {'loss': 0.4767, 'grad_norm': 0.8350537419319153, 'learning_rate': 1.8915460917043494e-06, 'epoch': 2.42} +2025-05-11 11:13:59 - ERROR - stderr - 81%|████████ | 3019/3741 [17:48:06<4:22:45, 21.84s/it] +2025-05-11 11:14:19 - ERROR - stderr - 81%|████████ | 3020/3741 [17:48:26<4:15:49, 21.29s/it] +2025-05-11 11:14:19 - ERROR - stderr - +2025-05-11 11:14:19 - ERROR - stderr - +2025-05-11 11:14:19 - INFO - stdout - {'loss': 0.458, 'grad_norm': 0.8347825407981873, 'learning_rate': 1.8864811890689016e-06, 'epoch': 2.42} +2025-05-11 11:14:19 - ERROR - stderr - 81%|████████ | 3020/3741 [17:48:26<4:15:49, 21.29s/it] +2025-05-11 11:14:43 - ERROR - stderr - 81%|████████ | 3021/3741 [17:48:50<4:25:09, 22.10s/it] +2025-05-11 11:14:43 - ERROR - stderr - +2025-05-11 11:14:43 - ERROR - stderr - +2025-05-11 11:14:43 - INFO - stdout - {'loss': 0.4694, 'grad_norm': 0.8250964879989624, 'learning_rate': 1.8814223702261757e-06, 'epoch': 2.42} +2025-05-11 11:14:43 - ERROR - stderr - 81%|████████ | 3021/3741 [17:48:50<4:25:09, 22.10s/it] +2025-05-11 11:15:04 - ERROR - stderr - 81%|████████ | 3022/3741 [17:49:10<4:18:40, 21.59s/it] +2025-05-11 11:15:04 - ERROR - stderr - +2025-05-11 11:15:04 - ERROR - stderr - +2025-05-11 11:15:04 - INFO - stdout - {'loss': 0.4831, 'grad_norm': 0.8303848505020142, 'learning_rate': 1.8763696389694463e-06, 'epoch': 2.42} +2025-05-11 11:15:04 - ERROR - stderr - 81%|████████ | 3022/3741 [17:49:10<4:18:40, 21.59s/it] +2025-05-11 11:15:27 - ERROR - stderr - 81%|████████ | 3023/3741 [17:49:34<4:25:16, 22.17s/it] +2025-05-11 11:15:27 - ERROR - stderr - +2025-05-11 11:15:27 - ERROR - stderr - +2025-05-11 11:15:27 - INFO - stdout - {'loss': 0.4736, 'grad_norm': 0.8795514106750488, 'learning_rate': 1.8713229990874194e-06, 'epoch': 2.42} +2025-05-11 11:15:27 - ERROR - stderr - 81%|████████ | 3023/3741 [17:49:34<4:25:16, 22.17s/it] +2025-05-11 11:15:47 - ERROR - stderr - 81%|████████ | 3024/3741 [17:49:53<4:16:12, 21.44s/it] +2025-05-11 11:15:47 - ERROR - stderr - +2025-05-11 11:15:47 - ERROR - stderr - +2025-05-11 11:15:47 - INFO - stdout - {'loss': 0.4751, 'grad_norm': 0.8881685733795166, 'learning_rate': 1.86628245436424e-06, 'epoch': 2.43} +2025-05-11 11:15:47 - ERROR - stderr - 81%|████████ | 3024/3741 [17:49:53<4:16:12, 21.44s/it] +2025-05-11 11:16:09 - ERROR - stderr - 81%|████████ | 3025/3741 [17:50:15<4:18:12, 21.64s/it] +2025-05-11 11:16:09 - ERROR - stderr - +2025-05-11 11:16:09 - ERROR - stderr - +2025-05-11 11:16:09 - INFO - stdout - {'loss': 0.5201, 'grad_norm': 0.9160009622573853, 'learning_rate': 1.8612480085794804e-06, 'epoch': 2.43} +2025-05-11 11:16:09 - ERROR - stderr - 81%|████████ | 3025/3741 [17:50:15<4:18:12, 21.64s/it] +2025-05-11 11:16:29 - ERROR - stderr - 81%|████████ | 3026/3741 [17:50:35<4:09:59, 20.98s/it] +2025-05-11 11:16:29 - ERROR - stderr - +2025-05-11 11:16:29 - ERROR - stderr - +2025-05-11 11:16:29 - INFO - stdout - {'loss': 0.4675, 'grad_norm': 0.8275811076164246, 'learning_rate': 1.8562196655081422e-06, 'epoch': 2.43} +2025-05-11 11:16:29 - ERROR - stderr - 81%|████████ | 3026/3741 [17:50:35<4:09:59, 20.98s/it] +2025-05-11 11:16:48 - ERROR - stderr - 81%|████████ | 3027/3741 [17:50:54<4:03:48, 20.49s/it] +2025-05-11 11:16:48 - ERROR - stderr - +2025-05-11 11:16:48 - ERROR - stderr - +2025-05-11 11:16:48 - INFO - stdout - {'loss': 0.4896, 'grad_norm': 0.895634651184082, 'learning_rate': 1.8511974289206413e-06, 'epoch': 2.43} +2025-05-11 11:16:48 - ERROR - stderr - 81%|████████ | 3027/3741 [17:50:54<4:03:48, 20.49s/it] +2025-05-11 11:17:08 - ERROR - stderr - 81%|████████ | 3028/3741 [17:51:14<4:01:03, 20.29s/it] +2025-05-11 11:17:08 - ERROR - stderr - +2025-05-11 11:17:08 - ERROR - stderr - +2025-05-11 11:17:08 - INFO - stdout - {'loss': 0.4749, 'grad_norm': 0.8538333177566528, 'learning_rate': 1.8461813025828268e-06, 'epoch': 2.43} +2025-05-11 11:17:08 - ERROR - stderr - 81%|████████ | 3028/3741 [17:51:14<4:01:03, 20.29s/it] +2025-05-11 11:17:27 - ERROR - stderr - 81%|████████ | 3029/3741 [17:51:34<3:58:11, 20.07s/it] +2025-05-11 11:17:27 - ERROR - stderr - +2025-05-11 11:17:27 - ERROR - stderr - +2025-05-11 11:17:27 - INFO - stdout - {'loss': 0.4798, 'grad_norm': 0.8479071855545044, 'learning_rate': 1.8411712902559597e-06, 'epoch': 2.43} +2025-05-11 11:17:27 - ERROR - stderr - 81%|████████ | 3029/3741 [17:51:34<3:58:11, 20.07s/it] +2025-05-11 11:17:49 - ERROR - stderr - 81%|████████ | 3030/3741 [17:51:55<4:01:52, 20.41s/it] +2025-05-11 11:17:49 - ERROR - stderr - +2025-05-11 11:17:49 - ERROR - stderr - +2025-05-11 11:17:49 - INFO - stdout - {'loss': 0.4951, 'grad_norm': 0.875977635383606, 'learning_rate': 1.8361673956967175e-06, 'epoch': 2.43} +2025-05-11 11:17:49 - ERROR - stderr - 81%|████████ | 3030/3741 [17:51:55<4:01:52, 20.41s/it] +2025-05-11 11:18:08 - ERROR - stderr - 81%|████████ | 3031/3741 [17:52:15<3:59:21, 20.23s/it] +2025-05-11 11:18:08 - ERROR - stderr - +2025-05-11 11:18:08 - ERROR - stderr - +2025-05-11 11:18:08 - INFO - stdout - {'loss': 0.5221, 'grad_norm': 0.9012177586555481, 'learning_rate': 1.831169622657194e-06, 'epoch': 2.43} +2025-05-11 11:18:08 - ERROR - stderr - 81%|████████ | 3031/3741 [17:52:15<3:59:21, 20.23s/it] +2025-05-11 11:18:29 - ERROR - stderr - 81%|████████ | 3032/3741 [17:52:35<4:01:08, 20.41s/it] +2025-05-11 11:18:29 - ERROR - stderr - +2025-05-11 11:18:29 - ERROR - stderr - +2025-05-11 11:18:29 - INFO - stdout - {'loss': 0.4712, 'grad_norm': 0.8884789943695068, 'learning_rate': 1.826177974884885e-06, 'epoch': 2.43} +2025-05-11 11:18:29 - ERROR - stderr - 81%|████████ | 3032/3741 [17:52:35<4:01:08, 20.41s/it] +2025-05-11 11:18:50 - ERROR - stderr - 81%|████████ | 3033/3741 [17:52:56<4:00:41, 20.40s/it] +2025-05-11 11:18:50 - ERROR - stderr - +2025-05-11 11:18:50 - ERROR - stderr - +2025-05-11 11:18:50 - INFO - stdout - {'loss': 0.4645, 'grad_norm': 0.8643089532852173, 'learning_rate': 1.8211924561227001e-06, 'epoch': 2.43} +2025-05-11 11:18:50 - ERROR - stderr - 81%|████████ | 3033/3741 [17:52:56<4:00:41, 20.40s/it] +2025-05-11 11:19:12 - ERROR - stderr - 81%|████████ | 3034/3741 [17:53:19<4:08:39, 21.10s/it] +2025-05-11 11:19:12 - ERROR - stderr - +2025-05-11 11:19:12 - ERROR - stderr - +2025-05-11 11:19:12 - INFO - stdout - {'loss': 0.5044, 'grad_norm': 0.8726524710655212, 'learning_rate': 1.816213070108951e-06, 'epoch': 2.43} +2025-05-11 11:19:12 - ERROR - stderr - 81%|████████ | 3034/3741 [17:53:19<4:08:39, 21.10s/it] +2025-05-11 11:19:32 - ERROR - stderr - 81%|████████ | 3035/3741 [17:53:38<4:03:33, 20.70s/it] +2025-05-11 11:19:32 - ERROR - stderr - +2025-05-11 11:19:32 - ERROR - stderr - +2025-05-11 11:19:32 - INFO - stdout - {'loss': 0.4747, 'grad_norm': 0.8304829597473145, 'learning_rate': 1.8112398205773507e-06, 'epoch': 2.43} +2025-05-11 11:19:32 - ERROR - stderr - 81%|████████ | 3035/3741 [17:53:38<4:03:33, 20.70s/it] +2025-05-11 11:19:55 - ERROR - stderr - 81%|████████ | 3036/3741 [17:54:01<4:10:22, 21.31s/it] +2025-05-11 11:19:55 - ERROR - stderr - +2025-05-11 11:19:55 - ERROR - stderr - +2025-05-11 11:19:55 - INFO - stdout - {'loss': 0.4924, 'grad_norm': 0.8465933799743652, 'learning_rate': 1.8062727112570133e-06, 'epoch': 2.43} +2025-05-11 11:19:55 - ERROR - stderr - 81%|████████ | 3036/3741 [17:54:01<4:10:22, 21.31s/it] +2025-05-11 11:20:15 - ERROR - stderr - 81%|████████ | 3037/3741 [17:54:21<4:04:49, 20.87s/it] +2025-05-11 11:20:15 - ERROR - stderr - +2025-05-11 11:20:15 - ERROR - stderr - +2025-05-11 11:20:15 - INFO - stdout - {'loss': 0.4974, 'grad_norm': 0.8618703484535217, 'learning_rate': 1.8013117458724416e-06, 'epoch': 2.44} +2025-05-11 11:20:15 - ERROR - stderr - 81%|████████ | 3037/3741 [17:54:21<4:04:49, 20.87s/it] +2025-05-11 11:20:38 - ERROR - stderr - 81%|████████ | 3038/3741 [17:54:44<4:12:53, 21.58s/it] +2025-05-11 11:20:38 - ERROR - stderr - +2025-05-11 11:20:38 - ERROR - stderr - +2025-05-11 11:20:38 - INFO - stdout - {'loss': 0.4641, 'grad_norm': 0.872769832611084, 'learning_rate': 1.79635692814354e-06, 'epoch': 2.44} +2025-05-11 11:20:38 - ERROR - stderr - 81%|████████ | 3038/3741 [17:54:44<4:12:53, 21.58s/it] +2025-05-11 11:20:58 - ERROR - stderr - 81%|████████ | 3039/3741 [17:55:04<4:06:00, 21.03s/it] +2025-05-11 11:20:58 - ERROR - stderr - +2025-05-11 11:20:58 - ERROR - stderr - +2025-05-11 11:20:58 - INFO - stdout - {'loss': 0.4678, 'grad_norm': 0.8966688513755798, 'learning_rate': 1.7914082617856022e-06, 'epoch': 2.44} +2025-05-11 11:20:58 - ERROR - stderr - 81%|████████ | 3039/3741 [17:55:04<4:06:00, 21.03s/it] +2025-05-11 11:21:21 - ERROR - stderr - 81%|████████▏ | 3040/3741 [17:55:28<4:14:51, 21.81s/it] +2025-05-11 11:21:21 - ERROR - stderr - +2025-05-11 11:21:21 - ERROR - stderr - +2025-05-11 11:21:21 - INFO - stdout - {'loss': 0.4602, 'grad_norm': 0.8599264025688171, 'learning_rate': 1.7864657505092964e-06, 'epoch': 2.44} +2025-05-11 11:21:21 - ERROR - stderr - 81%|████████▏ | 3040/3741 [17:55:28<4:14:51, 21.81s/it] +2025-05-11 11:21:41 - ERROR - stderr - 81%|████████▏ | 3041/3741 [17:55:47<4:07:04, 21.18s/it] +2025-05-11 11:21:41 - ERROR - stderr - +2025-05-11 11:21:41 - ERROR - stderr - +2025-05-11 11:21:41 - INFO - stdout - {'loss': 0.4846, 'grad_norm': 0.9014189839363098, 'learning_rate': 1.7815293980206993e-06, 'epoch': 2.44} +2025-05-11 11:21:41 - ERROR - stderr - 81%|████████▏ | 3041/3741 [17:55:47<4:07:04, 21.18s/it] +2025-05-11 11:22:03 - ERROR - stderr - 81%|████████▏ | 3042/3741 [17:56:10<4:11:34, 21.59s/it] +2025-05-11 11:22:03 - ERROR - stderr - +2025-05-11 11:22:03 - ERROR - stderr - +2025-05-11 11:22:03 - INFO - stdout - {'loss': 0.4674, 'grad_norm': 0.8558383584022522, 'learning_rate': 1.776599208021247e-06, 'epoch': 2.44} +2025-05-11 11:22:03 - ERROR - stderr - 81%|████████▏ | 3042/3741 [17:56:10<4:11:34, 21.59s/it] +2025-05-11 11:22:23 - ERROR - stderr - 81%|████████▏ | 3043/3741 [17:56:30<4:04:45, 21.04s/it] +2025-05-11 11:22:23 - ERROR - stderr - +2025-05-11 11:22:23 - ERROR - stderr - +2025-05-11 11:22:23 - INFO - stdout - {'loss': 0.4783, 'grad_norm': 0.8910719156265259, 'learning_rate': 1.7716751842077663e-06, 'epoch': 2.44} +2025-05-11 11:22:23 - ERROR - stderr - 81%|████████▏ | 3043/3741 [17:56:30<4:04:45, 21.04s/it] +2025-05-11 11:22:46 - ERROR - stderr - 81%|████████▏ | 3044/3741 [17:56:53<4:11:10, 21.62s/it] +2025-05-11 11:22:46 - ERROR - stderr - +2025-05-11 11:22:46 - ERROR - stderr - +2025-05-11 11:22:46 - INFO - stdout - {'loss': 0.4687, 'grad_norm': 0.865467369556427, 'learning_rate': 1.7667573302724606e-06, 'epoch': 2.44} +2025-05-11 11:22:46 - ERROR - stderr - 81%|████████▏ | 3044/3741 [17:56:53<4:11:10, 21.62s/it] +2025-05-11 11:23:06 - ERROR - stderr - 81%|████████▏ | 3045/3741 [17:57:12<4:03:50, 21.02s/it] +2025-05-11 11:23:06 - ERROR - stderr - +2025-05-11 11:23:06 - ERROR - stderr - +2025-05-11 11:23:06 - INFO - stdout - {'loss': 0.487, 'grad_norm': 0.9217147827148438, 'learning_rate': 1.7618456499028968e-06, 'epoch': 2.44} +2025-05-11 11:23:06 - ERROR - stderr - 81%|████████▏ | 3045/3741 [17:57:12<4:03:50, 21.02s/it] +2025-05-11 11:23:30 - ERROR - stderr - 81%|████████▏ | 3046/3741 [17:57:36<4:14:01, 21.93s/it] +2025-05-11 11:23:30 - ERROR - stderr - +2025-05-11 11:23:30 - ERROR - stderr - +2025-05-11 11:23:30 - INFO - stdout - {'loss': 0.4507, 'grad_norm': 0.8558123707771301, 'learning_rate': 1.7569401467820302e-06, 'epoch': 2.44} +2025-05-11 11:23:30 - ERROR - stderr - 81%|████████▏ | 3046/3741 [17:57:36<4:14:01, 21.93s/it] +2025-05-11 11:23:49 - ERROR - stderr - 81%|████████▏ | 3047/3741 [17:57:56<4:05:32, 21.23s/it] +2025-05-11 11:23:49 - ERROR - stderr - +2025-05-11 11:23:49 - ERROR - stderr - +2025-05-11 11:23:49 - INFO - stdout - {'loss': 0.4415, 'grad_norm': 0.8037233352661133, 'learning_rate': 1.752040824588167e-06, 'epoch': 2.44} +2025-05-11 11:23:49 - ERROR - stderr - 81%|████████▏ | 3047/3741 [17:57:56<4:05:32, 21.23s/it] +2025-05-11 11:24:12 - ERROR - stderr - 81%|████████▏ | 3048/3741 [17:58:19<4:10:45, 21.71s/it] +2025-05-11 11:24:12 - ERROR - stderr - +2025-05-11 11:24:12 - ERROR - stderr - +2025-05-11 11:24:12 - INFO - stdout - {'loss': 0.5039, 'grad_norm': 0.9270918369293213, 'learning_rate': 1.7471476869949877e-06, 'epoch': 2.44} +2025-05-11 11:24:12 - ERROR - stderr - 81%|████████▏ | 3048/3741 [17:58:19<4:10:45, 21.71s/it] +2025-05-11 11:24:32 - ERROR - stderr - 82%|████████▏ | 3049/3741 [17:58:38<4:03:10, 21.08s/it] +2025-05-11 11:24:32 - ERROR - stderr - +2025-05-11 11:24:32 - ERROR - stderr - +2025-05-11 11:24:32 - INFO - stdout - {'loss': 0.4791, 'grad_norm': 0.8629381060600281, 'learning_rate': 1.7422607376715362e-06, 'epoch': 2.45} +2025-05-11 11:24:32 - ERROR - stderr - 82%|████████▏ | 3049/3741 [17:58:38<4:03:10, 21.08s/it] +2025-05-11 11:24:55 - ERROR - stderr - 82%|████████▏ | 3050/3741 [17:59:02<4:10:49, 21.78s/it] +2025-05-11 11:24:55 - ERROR - stderr - +2025-05-11 11:24:55 - ERROR - stderr - +2025-05-11 11:24:55 - INFO - stdout - {'loss': 0.4942, 'grad_norm': 0.8409698605537415, 'learning_rate': 1.7373799802822067e-06, 'epoch': 2.45} +2025-05-11 11:24:55 - ERROR - stderr - 82%|████████▏ | 3050/3741 [17:59:02<4:10:49, 21.78s/it] +2025-05-11 11:25:15 - ERROR - stderr - 82%|████████▏ | 3051/3741 [17:59:21<4:02:49, 21.11s/it] +2025-05-11 11:25:15 - ERROR - stderr - +2025-05-11 11:25:15 - ERROR - stderr - +2025-05-11 11:25:15 - INFO - stdout - {'loss': 0.4756, 'grad_norm': 0.8830838203430176, 'learning_rate': 1.7325054184867652e-06, 'epoch': 2.45} +2025-05-11 11:25:15 - ERROR - stderr - 82%|████████▏ | 3051/3741 [17:59:21<4:02:49, 21.11s/it] +2025-05-11 11:25:38 - ERROR - stderr - 82%|████████▏ | 3052/3741 [17:59:44<4:09:54, 21.76s/it] +2025-05-11 11:25:38 - ERROR - stderr - +2025-05-11 11:25:38 - ERROR - stderr - +2025-05-11 11:25:38 - INFO - stdout - {'loss': 0.478, 'grad_norm': 0.9066561460494995, 'learning_rate': 1.7276370559403188e-06, 'epoch': 2.45} +2025-05-11 11:25:38 - ERROR - stderr - 82%|████████▏ | 3052/3741 [17:59:44<4:09:54, 21.76s/it] +2025-05-11 11:25:58 - ERROR - stderr - 82%|████████▏ | 3053/3741 [18:00:04<4:01:36, 21.07s/it] +2025-05-11 11:25:58 - ERROR - stderr - +2025-05-11 11:25:58 - ERROR - stderr - +2025-05-11 11:25:58 - INFO - stdout - {'loss': 0.4911, 'grad_norm': 0.9000374674797058, 'learning_rate': 1.7227748962933343e-06, 'epoch': 2.45} +2025-05-11 11:25:58 - ERROR - stderr - 82%|████████▏ | 3053/3741 [18:00:04<4:01:36, 21.07s/it] +2025-05-11 11:26:20 - ERROR - stderr - 82%|████████▏ | 3054/3741 [18:00:26<4:05:50, 21.47s/it] +2025-05-11 11:26:20 - ERROR - stderr - +2025-05-11 11:26:20 - ERROR - stderr - +2025-05-11 11:26:20 - INFO - stdout - {'loss': 0.5002, 'grad_norm': 0.9526277184486389, 'learning_rate': 1.7179189431916254e-06, 'epoch': 2.45} +2025-05-11 11:26:20 - ERROR - stderr - 82%|████████▏ | 3054/3741 [18:00:26<4:05:50, 21.47s/it] +2025-05-11 11:26:40 - ERROR - stderr - 82%|███��████▏ | 3055/3741 [18:00:46<3:58:57, 20.90s/it] +2025-05-11 11:26:40 - ERROR - stderr - +2025-05-11 11:26:40 - ERROR - stderr - +2025-05-11 11:26:40 - INFO - stdout - {'loss': 0.4973, 'grad_norm': 0.85796719789505, 'learning_rate': 1.713069200276346e-06, 'epoch': 2.45} +2025-05-11 11:26:40 - ERROR - stderr - 82%|████████▏ | 3055/3741 [18:00:46<3:58:57, 20.90s/it] +2025-05-11 11:27:02 - ERROR - stderr - 82%|████████▏ | 3056/3741 [18:01:09<4:05:16, 21.48s/it] +2025-05-11 11:27:02 - ERROR - stderr - +2025-05-11 11:27:02 - ERROR - stderr - +2025-05-11 11:27:02 - INFO - stdout - {'loss': 0.4688, 'grad_norm': 0.8333845734596252, 'learning_rate': 1.708225671184003e-06, 'epoch': 2.45} +2025-05-11 11:27:02 - ERROR - stderr - 82%|████████▏ | 3056/3741 [18:01:09<4:05:16, 21.48s/it] +2025-05-11 11:27:22 - ERROR - stderr - 82%|████████▏ | 3057/3741 [18:01:29<3:59:33, 21.01s/it] +2025-05-11 11:27:22 - ERROR - stderr - +2025-05-11 11:27:22 - ERROR - stderr - +2025-05-11 11:27:22 - INFO - stdout - {'loss': 0.4913, 'grad_norm': 0.8949407935142517, 'learning_rate': 1.7033883595464407e-06, 'epoch': 2.45} +2025-05-11 11:27:22 - ERROR - stderr - 82%|████████▏ | 3057/3741 [18:01:29<3:59:33, 21.01s/it] +2025-05-11 11:27:45 - ERROR - stderr - 82%|████████▏ | 3058/3741 [18:01:51<4:04:25, 21.47s/it] +2025-05-11 11:27:45 - ERROR - stderr - +2025-05-11 11:27:45 - ERROR - stderr - +2025-05-11 11:27:45 - INFO - stdout - {'loss': 0.4581, 'grad_norm': 0.8218083381652832, 'learning_rate': 1.6985572689908326e-06, 'epoch': 2.45} +2025-05-11 11:27:45 - ERROR - stderr - 82%|████████▏ | 3058/3741 [18:01:51<4:04:25, 21.47s/it] +2025-05-11 11:28:04 - ERROR - stderr - 82%|████████▏ | 3059/3741 [18:02:11<3:57:12, 20.87s/it] +2025-05-11 11:28:04 - ERROR - stderr - +2025-05-11 11:28:04 - ERROR - stderr - +2025-05-11 11:28:04 - INFO - stdout - {'loss': 0.4905, 'grad_norm': 0.8660341501235962, 'learning_rate': 1.693732403139705e-06, 'epoch': 2.45} +2025-05-11 11:28:04 - ERROR - stderr - 82%|████████▏ | 3059/3741 [18:02:11<3:57:12, 20.87s/it] +2025-05-11 11:28:27 - ERROR - stderr - 82%|████████▏ | 3060/3741 [18:02:33<4:03:18, 21.44s/it] +2025-05-11 11:28:27 - ERROR - stderr - +2025-05-11 11:28:27 - ERROR - stderr - +2025-05-11 11:28:27 - INFO - stdout - {'loss': 0.4618, 'grad_norm': 0.8607407808303833, 'learning_rate': 1.688913765610899e-06, 'epoch': 2.45} +2025-05-11 11:28:27 - ERROR - stderr - 82%|████████▏ | 3060/3741 [18:02:33<4:03:18, 21.44s/it] +2025-05-11 11:28:47 - ERROR - stderr - 82%|████████▏ | 3061/3741 [18:02:53<3:57:04, 20.92s/it] +2025-05-11 11:28:47 - ERROR - stderr - +2025-05-11 11:28:47 - ERROR - stderr - +2025-05-11 11:28:47 - INFO - stdout - {'loss': 0.4626, 'grad_norm': 0.8669100403785706, 'learning_rate': 1.684101360017596e-06, 'epoch': 2.45} +2025-05-11 11:28:47 - ERROR - stderr - 82%|████████▏ | 3061/3741 [18:02:53<3:57:04, 20.92s/it] +2025-05-11 11:29:09 - ERROR - stderr - 82%|████████▏ | 3062/3741 [18:03:15<4:01:17, 21.32s/it] +2025-05-11 11:29:09 - ERROR - stderr - +2025-05-11 11:29:09 - ERROR - stderr - +2025-05-11 11:29:09 - INFO - stdout - {'loss': 0.4873, 'grad_norm': 0.8576903939247131, 'learning_rate': 1.6792951899683018e-06, 'epoch': 2.46} +2025-05-11 11:29:09 - ERROR - stderr - 82%|████████▏ | 3062/3741 [18:03:15<4:01:17, 21.32s/it] +2025-05-11 11:29:29 - ERROR - stderr - 82%|████████▏ | 3063/3741 [18:03:35<3:55:35, 20.85s/it] +2025-05-11 11:29:29 - ERROR - stderr - +2025-05-11 11:29:29 - ERROR - stderr - +2025-05-11 11:29:29 - INFO - stdout - {'loss': 0.4976, 'grad_norm': 0.8291400074958801, 'learning_rate': 1.6744952590668452e-06, 'epoch': 2.46} +2025-05-11 11:29:29 - ERROR - stderr - 82%|████████▏ | 3063/3741 [18:03:35<3:55:35, 20.85s/it] +2025-05-11 11:29:50 - ERROR - stderr - 82%|████████▏ | 3064/3741 [18:03:56<3:56:04, 20.92s/it] +2025-05-11 11:29:50 - ERROR - stderr - +2025-05-11 11:29:50 - ERROR - stderr - +2025-05-11 11:29:50 - INFO - stdout - {'loss': 0.4716, 'grad_norm': 0.8194727897644043, 'learning_rate': 1.669701570912381e-06, 'epoch': 2.46} +2025-05-11 11:29:50 - ERROR - stderr - 82%|████████▏ | 3064/3741 [18:03:56<3:56:04, 20.92s/it] +2025-05-11 11:30:09 - ERROR - stderr - 82%|████████▏ | 3065/3741 [18:04:16<3:50:30, 20.46s/it] +2025-05-11 11:30:09 - ERROR - stderr - +2025-05-11 11:30:09 - ERROR - stderr - +2025-05-11 11:30:09 - INFO - stdout - {'loss': 0.4765, 'grad_norm': 0.8513212203979492, 'learning_rate': 1.6649141290993765e-06, 'epoch': 2.46} +2025-05-11 11:30:09 - ERROR - stderr - 82%|████████▏ | 3065/3741 [18:04:16<3:50:30, 20.46s/it] +2025-05-11 11:30:31 - ERROR - stderr - 82%|████████▏ | 3066/3741 [18:04:37<3:54:57, 20.88s/it] +2025-05-11 11:30:31 - ERROR - stderr - +2025-05-11 11:30:31 - ERROR - stderr - +2025-05-11 11:30:31 - INFO - stdout - {'loss': 0.4738, 'grad_norm': 0.8940701484680176, 'learning_rate': 1.6601329372176177e-06, 'epoch': 2.46} +2025-05-11 11:30:31 - ERROR - stderr - 82%|████████▏ | 3066/3741 [18:04:38<3:54:57, 20.88s/it] +2025-05-11 11:30:51 - ERROR - stderr - 82%|████████▏ | 3067/3741 [18:04:57<3:51:02, 20.57s/it] +2025-05-11 11:30:51 - ERROR - stderr - +2025-05-11 11:30:51 - ERROR - stderr - +2025-05-11 11:30:51 - INFO - stdout - {'loss': 0.4837, 'grad_norm': 0.8830768465995789, 'learning_rate': 1.6553579988522083e-06, 'epoch': 2.46} +2025-05-11 11:30:51 - ERROR - stderr - 82%|████████▏ | 3067/3741 [18:04:57<3:51:02, 20.57s/it] +2025-05-11 11:31:15 - ERROR - stderr - 82%|████████▏ | 3068/3741 [18:05:21<4:00:53, 21.48s/it] +2025-05-11 11:31:15 - ERROR - stderr - +2025-05-11 11:31:15 - ERROR - stderr - +2025-05-11 11:31:15 - INFO - stdout - {'loss': 0.4635, 'grad_norm': 0.8472455143928528, 'learning_rate': 1.6505893175835585e-06, 'epoch': 2.46} +2025-05-11 11:31:15 - ERROR - stderr - 82%|████████▏ | 3068/3741 [18:05:21<4:00:53, 21.48s/it] +2025-05-11 11:31:34 - ERROR - stderr - 82%|████████▏ | 3069/3741 [18:05:41<3:54:18, 20.92s/it] +2025-05-11 11:31:34 - ERROR - stderr - +2025-05-11 11:31:34 - ERROR - stderr - +2025-05-11 11:31:34 - INFO - stdout - {'loss': 0.4813, 'grad_norm': 0.8620253801345825, 'learning_rate': 1.6458268969873892e-06, 'epoch': 2.46} +2025-05-11 11:31:34 - ERROR - stderr - 82%|████████▏ | 3069/3741 [18:05:41<3:54:18, 20.92s/it] +2025-05-11 11:31:58 - ERROR - stderr - 82%|████████▏ | 3070/3741 [18:06:04<4:02:47, 21.71s/it] +2025-05-11 11:31:58 - ERROR - stderr - +2025-05-11 11:31:58 - ERROR - stderr - +2025-05-11 11:31:58 - INFO - stdout - {'loss': 0.4595, 'grad_norm': 0.8335537910461426, 'learning_rate': 1.6410707406347227e-06, 'epoch': 2.46} +2025-05-11 11:31:58 - ERROR - stderr - 82%|████████▏ | 3070/3741 [18:06:04<4:02:47, 21.71s/it] +2025-05-11 11:32:17 - ERROR - stderr - 82%|████████▏ | 3071/3741 [18:06:24<3:55:19, 21.07s/it] +2025-05-11 11:32:17 - ERROR - stderr - +2025-05-11 11:32:17 - ERROR - stderr - +2025-05-11 11:32:17 - INFO - stdout - {'loss': 0.4551, 'grad_norm': 0.8217028975486755, 'learning_rate': 1.6363208520918882e-06, 'epoch': 2.46} +2025-05-11 11:32:17 - ERROR - stderr - 82%|████████▏ | 3071/3741 [18:06:24<3:55:19, 21.07s/it] +2025-05-11 11:32:40 - ERROR - stderr - 82%|████████▏ | 3072/3741 [18:06:46<3:59:55, 21.52s/it] +2025-05-11 11:32:40 - ERROR - stderr - +2025-05-11 11:32:40 - ERROR - stderr - +2025-05-11 11:32:40 - INFO - stdout - {'loss': 0.487, 'grad_norm': 0.8587481379508972, 'learning_rate': 1.6315772349205139e-06, 'epoch': 2.46} +2025-05-11 11:32:40 - ERROR - stderr - 82%|████████▏ | 3072/3741 [18:06:46<3:59:55, 21.52s/it] +2025-05-11 11:32:59 - ERROR - stderr - 82%|████████▏ | 3073/3741 [18:07:06<3:53:02, 20.93s/it] +2025-05-11 11:33:00 - ERROR - stderr - +2025-05-11 11:33:00 - ERROR - stderr - +2025-05-11 11:33:00 - INFO - stdout - {'loss': 0.477, 'grad_norm': 0.8531529903411865, 'learning_rate': 1.6268398926775286e-06, 'epoch': 2.46} +2025-05-11 11:33:00 - ERROR - stderr - 82%|████████▏ | 3073/3741 [18:07:06<3:53:02, 20.93s/it] +2025-05-11 11:33:22 - ERROR - stderr - 82%|████████▏ | 3074/3741 [18:07:28<3:56:41, 21.29s/it] +2025-05-11 11:33:22 - ERROR - stderr - +2025-05-11 11:33:22 - ERROR - stderr - +2025-05-11 11:33:22 - INFO - stdout - {'loss': 0.4625, 'grad_norm': 0.8320519328117371, 'learning_rate': 1.6221088289151477e-06, 'epoch': 2.47} +2025-05-11 11:33:22 - ERROR - stderr - 82%|████████▏ | 3074/3741 [18:07:28<3:56:41, 21.29s/it] +2025-05-11 11:33:42 - ERROR - stderr - 82%|████████▏ | 3075/3741 [18:07:48<3:51:55, 20.89s/it] +2025-05-11 11:33:42 - ERROR - stderr - +2025-05-11 11:33:42 - ERROR - stderr - +2025-05-11 11:33:42 - INFO - stdout - {'loss': 0.4866, 'grad_norm': 0.8659622669219971, 'learning_rate': 1.6173840471808856e-06, 'epoch': 2.47} +2025-05-11 11:33:42 - ERROR - stderr - 82%|████████▏ | 3075/3741 [18:07:48<3:51:55, 20.89s/it] +2025-05-11 11:34:01 - ERROR - stderr - 82%|████████▏ | 3076/3741 [18:08:08<3:47:21, 20.51s/it] +2025-05-11 11:34:01 - ERROR - stderr - +2025-05-11 11:34:01 - ERROR - stderr - +2025-05-11 11:34:01 - INFO - stdout - {'loss': 0.4842, 'grad_norm': 0.8781241178512573, 'learning_rate': 1.612665551017546e-06, 'epoch': 2.47} +2025-05-11 11:34:01 - ERROR - stderr - 82%|████████▏ | 3076/3741 [18:08:08<3:47:21, 20.51s/it] +2025-05-11 11:34:21 - ERROR - stderr - 82%|████████▏ | 3077/3741 [18:08:27<3:43:51, 20.23s/it] +2025-05-11 11:34:21 - ERROR - stderr - +2025-05-11 11:34:21 - ERROR - stderr - +2025-05-11 11:34:21 - INFO - stdout - {'loss': 0.4856, 'grad_norm': 0.8649770021438599, 'learning_rate': 1.6079533439632166e-06, 'epoch': 2.47} +2025-05-11 11:34:21 - ERROR - stderr - 82%|████████▏ | 3077/3741 [18:08:27<3:43:51, 20.23s/it] +2025-05-11 11:34:40 - ERROR - stderr - 82%|████████▏ | 3078/3741 [18:08:46<3:40:47, 19.98s/it] +2025-05-11 11:34:40 - ERROR - stderr - +2025-05-11 11:34:40 - ERROR - stderr - +2025-05-11 11:34:40 - INFO - stdout - {'loss': 0.4727, 'grad_norm': 0.8794416785240173, 'learning_rate': 1.6032474295512733e-06, 'epoch': 2.47} +2025-05-11 11:34:40 - ERROR - stderr - 82%|████████▏ | 3078/3741 [18:08:47<3:40:47, 19.98s/it] +2025-05-11 11:35:00 - ERROR - stderr - 82%|████████▏ | 3079/3741 [18:09:06<3:40:13, 19.96s/it] +2025-05-11 11:35:00 - ERROR - stderr - +2025-05-11 11:35:00 - ERROR - stderr - +2025-05-11 11:35:00 - INFO - stdout - {'loss': 0.4757, 'grad_norm': 0.8567477464675903, 'learning_rate': 1.598547811310368e-06, 'epoch': 2.47} +2025-05-11 11:35:00 - ERROR - stderr - 82%|████████▏ | 3079/3741 [18:09:06<3:40:13, 19.96s/it] +2025-05-11 11:35:20 - ERROR - stderr - 82%|████████▏ | 3080/3741 [18:09:26<3:39:15, 19.90s/it] +2025-05-11 11:35:20 - ERROR - stderr - +2025-05-11 11:35:20 - ERROR - stderr - +2025-05-11 11:35:20 - INFO - stdout - {'loss': 0.4751, 'grad_norm': 0.8346998691558838, 'learning_rate': 1.5938544927644351e-06, 'epoch': 2.47} +2025-05-11 11:35:20 - ERROR - stderr - 82%|████████▏ | 3080/3741 [18:09:26<3:39:15, 19.90s/it] +2025-05-11 11:35:42 - ERROR - stderr - 82%|████████▏ | 3081/3741 [18:09:48<3:44:56, 20.45s/it] +2025-05-11 11:35:42 - ERROR - stderr - +2025-05-11 11:35:42 - ERROR - stderr - +2025-05-11 11:35:42 - INFO - stdout - {'loss': 0.4901, 'grad_norm': 0.8880207538604736, 'learning_rate': 1.5891674774326848e-06, 'epoch': 2.47} +2025-05-11 11:35:42 - ERROR - stderr - 82%|████████▏ | 3081/3741 [18:09:48<3:44:56, 20.45s/it] +2025-05-11 11:36:01 - ERROR - stderr - 82%|████████▏ | 3082/3741 [18:10:07<3:41:12, 20.14s/it] +2025-05-11 11:36:01 - ERROR - stderr - +2025-05-11 11:36:01 - ERROR - stderr - +2025-05-11 11:36:01 - INFO - stdout - {'loss': 0.494, 'grad_norm': 0.8977006077766418, 'learning_rate': 1.5844867688296017e-06, 'epoch': 2.47} +2025-05-11 11:36:01 - ERROR - stderr - 82%|████████▏ | 3082/3741 [18:10:07<3:41:12, 20.14s/it] +2025-05-11 11:36:24 - ERROR - stderr - 82%|████████▏ | 3083/3741 [18:10:30<3:50:09, 20.99s/it] +2025-05-11 11:36:24 - ERROR - stderr - +2025-05-11 11:36:24 - ERROR - stderr - +2025-05-11 11:36:24 - INFO - stdout - {'loss': 0.4929, 'grad_norm': 0.8824754357337952, 'learning_rate': 1.5798123704649416e-06, 'epoch': 2.47} +2025-05-11 11:36:24 - ERROR - stderr - 82%|████████▏ | 3083/3741 [18:10:30<3:50:09, 20.99s/it] +2025-05-11 11:36:44 - ERROR - stderr - 82%|████████▏ | 3084/3741 [18:10:50<3:45:39, 20.61s/it] +2025-05-11 11:36:44 - ERROR - stderr - +2025-05-11 11:36:44 - ERROR - stderr - +2025-05-11 11:36:44 - INFO - stdout - {'loss': 0.4771, 'grad_norm': 0.9107778072357178, 'learning_rate': 1.5751442858437238e-06, 'epoch': 2.47} +2025-05-11 11:36:44 - ERROR - stderr - 82%|████████▏ | 3084/3741 [18:10:50<3:45:39, 20.61s/it] +2025-05-11 11:37:06 - ERROR - stderr - 82%|████████▏ | 3085/3741 [18:11:13<3:52:21, 21.25s/it] +2025-05-11 11:37:06 - ERROR - stderr - +2025-05-11 11:37:06 - ERROR - stderr - +2025-05-11 11:37:06 - INFO - stdout - {'loss': 0.4824, 'grad_norm': 0.8373488783836365, 'learning_rate': 1.5704825184662397e-06, 'epoch': 2.47} +2025-05-11 11:37:06 - ERROR - stderr - 82%|████████▏ | 3085/3741 [18:11:13<3:52:21, 21.25s/it] +2025-05-11 11:37:26 - ERROR - stderr - 82%|████████▏ | 3086/3741 [18:11:32<3:46:14, 20.72s/it] +2025-05-11 11:37:26 - ERROR - stderr - +2025-05-11 11:37:26 - ERROR - stderr - +2025-05-11 11:37:26 - INFO - stdout - {'loss': 0.4644, 'grad_norm': 0.8496052026748657, 'learning_rate': 1.5658270718280433e-06, 'epoch': 2.47} +2025-05-11 11:37:26 - ERROR - stderr - 82%|████████▏ | 3086/3741 [18:11:32<3:46:14, 20.72s/it] +2025-05-11 11:37:49 - ERROR - stderr - 83%|████████▎ | 3087/3741 [18:11:55<3:53:59, 21.47s/it] +2025-05-11 11:37:49 - ERROR - stderr - +2025-05-11 11:37:49 - ERROR - stderr - +2025-05-11 11:37:49 - INFO - stdout - {'loss': 0.5145, 'grad_norm': 0.884526252746582, 'learning_rate': 1.5611779494199398e-06, 'epoch': 2.48} +2025-05-11 11:37:49 - ERROR - stderr - 83%|████████▎ | 3087/3741 [18:11:55<3:53:59, 21.47s/it] +2025-05-11 11:38:09 - ERROR - stderr - 83%|████████▎ | 3088/3741 [18:12:15<3:48:21, 20.98s/it] +2025-05-11 11:38:09 - ERROR - stderr - +2025-05-11 11:38:09 - ERROR - stderr - +2025-05-11 11:38:09 - INFO - stdout - {'loss': 0.4807, 'grad_norm': 0.8632438778877258, 'learning_rate': 1.5565351547280084e-06, 'epoch': 2.48} +2025-05-11 11:38:09 - ERROR - stderr - 83%|████████▎ | 3088/3741 [18:12:15<3:48:21, 20.98s/it] +2025-05-11 11:38:31 - ERROR - stderr - 83%|████████▎ | 3089/3741 [18:12:38<3:52:16, 21.38s/it] +2025-05-11 11:38:31 - ERROR - stderr - +2025-05-11 11:38:31 - ERROR - stderr - +2025-05-11 11:38:31 - INFO - stdout - {'loss': 0.4579, 'grad_norm': 0.8612059354782104, 'learning_rate': 1.5518986912335686e-06, 'epoch': 2.48} +2025-05-11 11:38:31 - ERROR - stderr - 83%|████████▎ | 3089/3741 [18:12:38<3:52:16, 21.38s/it] +2025-05-11 11:38:51 - ERROR - stderr - 83%|████████▎ | 3090/3741 [18:12:57<3:46:21, 20.86s/it] +2025-05-11 11:38:51 - ERROR - stderr - +2025-05-11 11:38:51 - ERROR - stderr - +2025-05-11 11:38:51 - INFO - stdout - {'loss': 0.4687, 'grad_norm': 0.8448330163955688, 'learning_rate': 1.5472685624132012e-06, 'epoch': 2.48} +2025-05-11 11:38:51 - ERROR - stderr - 83%|████████▎ | 3090/3741 [18:12:57<3:46:21, 20.86s/it] +2025-05-11 11:39:11 - ERROR - stderr - 83%|████████▎ | 3091/3741 [18:13:18<3:44:20, 20.71s/it] +2025-05-11 11:39:11 - ERROR - stderr - +2025-05-11 11:39:11 - ERROR - stderr - +2025-05-11 11:39:11 - INFO - stdout - {'loss': 0.4987, 'grad_norm': 0.8578251600265503, 'learning_rate': 1.5426447717387349e-06, 'epoch': 2.48} +2025-05-11 11:39:11 - ERROR - stderr - 83%|████████▎ | 3091/3741 [18:13:18<3:44:20, 20.71s/it] +2025-05-11 11:39:31 - ERROR - stderr - 83%|████████▎ | 3092/3741 [18:13:37<3:40:04, 20.35s/it] +2025-05-11 11:39:31 - ERROR - stderr - +2025-05-11 11:39:31 - ERROR - stderr - +2025-05-11 11:39:31 - INFO - stdout - {'loss': 0.4817, 'grad_norm': 0.8487175703048706, 'learning_rate': 1.5380273226772403e-06, 'epoch': 2.48} +2025-05-11 11:39:31 - ERROR - stderr - 83%|████████▎ | 3092/3741 [18:13:37<3:40:04, 20.35s/it] +2025-05-11 11:39:50 - ERROR - stderr - 83%|████████▎ | 3093/3741 [18:13:57<3:37:07, 20.10s/it] +2025-05-11 11:39:50 - ERROR - stderr - +2025-05-11 11:39:50 - ERROR - stderr - +2025-05-11 11:39:50 - INFO - stdout - {'loss': 0.4834, 'grad_norm': 0.8690900802612305, 'learning_rate': 1.5334162186910474e-06, 'epoch': 2.48} +2025-05-11 11:39:50 - ERROR - stderr - 83%|████████▎ | 3093/3741 [18:13:57<3:37:07, 20.10s/it] +2025-05-11 11:40:10 - ERROR - stderr - 83%|████████▎ | 3094/3741 [18:14:16<3:35:29, 19.98s/it] +2025-05-11 11:40:10 - ERROR - stderr - +2025-05-11 11:40:10 - ERROR - stderr - +2025-05-11 11:40:10 - INFO - stdout - {'loss': 0.4603, 'grad_norm': 0.8409072160720825, 'learning_rate': 1.5288114632377105e-06, 'epoch': 2.48} +2025-05-11 11:40:10 - ERROR - stderr - 83%|████████▎ | 3094/3741 [18:14:16<3:35:29, 19.98s/it] +2025-05-11 11:40:30 - ERROR - stderr - 83%|████████▎ | 3095/3741 [18:14:36<3:34:00, 19.88s/it] +2025-05-11 11:40:30 - ERROR - stderr - +2025-05-11 11:40:30 - ERROR - stderr - +2025-05-11 11:40:30 - INFO - stdout - {'loss': 0.48, 'grad_norm': 0.8334317207336426, 'learning_rate': 1.5242130597700355e-06, 'epoch': 2.48} +2025-05-11 11:40:30 - ERROR - stderr - 83%|████████▎ | 3095/3741 [18:14:36<3:34:00, 19.88s/it] +2025-05-11 11:40:50 - ERROR - stderr - 83%|████████▎ | 3096/3741 [18:14:56<3:35:43, 20.07s/it] +2025-05-11 11:40:50 - ERROR - stderr - +2025-05-11 11:40:50 - ERROR - stderr - +2025-05-11 11:40:50 - INFO - stdout - {'loss': 0.4855, 'grad_norm': 0.8757611513137817, 'learning_rate': 1.5196210117360643e-06, 'epoch': 2.48} +2025-05-11 11:40:50 - ERROR - stderr - 83%|████████▎ | 3096/3741 [18:14:57<3:35:43, 20.07s/it] +2025-05-11 11:41:10 - ERROR - stderr - 83%|████████▎ | 3097/3741 [18:15:16<3:33:21, 19.88s/it] +2025-05-11 11:41:10 - ERROR - stderr - +2025-05-11 11:41:10 - ERROR - stderr - +2025-05-11 11:41:10 - INFO - stdout - {'loss': 0.4787, 'grad_norm': 0.8928870558738708, 'learning_rate': 1.5150353225790626e-06, 'epoch': 2.48} +2025-05-11 11:41:10 - ERROR - stderr - 83%|████████▎ | 3097/3741 [18:15:16<3:33:21, 19.88s/it] +2025-05-11 11:41:32 - ERROR - stderr - 83%|████████▎ | 3098/3741 [18:15:38<3:41:39, 20.68s/it] +2025-05-11 11:41:32 - ERROR - stderr - +2025-05-11 11:41:32 - ERROR - stderr - +2025-05-11 11:41:32 - INFO - stdout - {'loss': 0.4617, 'grad_norm': 0.8667248487472534, 'learning_rate': 1.5104559957375475e-06, 'epoch': 2.48} +2025-05-11 11:41:32 - ERROR - stderr - 83%|████████▎ | 3098/3741 [18:15:39<3:41:39, 20.68s/it] +2025-05-11 11:41:52 - ERROR - stderr - 83%|████████▎ | 3099/3741 [18:15:58<3:38:31, 20.42s/it] +2025-05-11 11:41:52 - ERROR - stderr - +2025-05-11 11:41:52 - ERROR - stderr - +2025-05-11 11:41:52 - INFO - stdout - {'loss': 0.4625, 'grad_norm': 0.8710841536521912, 'learning_rate': 1.505883034645248e-06, 'epoch': 2.49} +2025-05-11 11:41:52 - ERROR - stderr - 83%|████████▎ | 3099/3741 [18:15:58<3:38:31, 20.42s/it] +2025-05-11 11:42:15 - ERROR - stderr - 83%|████████▎ | 3100/3741 [18:16:21<3:45:43, 21.13s/it] +2025-05-11 11:42:15 - ERROR - stderr - +2025-05-11 11:42:15 - ERROR - stderr - +2025-05-11 11:42:15 - INFO - stdout - {'loss': 0.4653, 'grad_norm': 0.8655977249145508, 'learning_rate': 1.5013164427311223e-06, 'epoch': 2.49} +2025-05-11 11:42:15 - ERROR - stderr - 83%|████████▎ | 3100/3741 [18:16:21<3:45:43, 21.13s/it] +2025-05-11 11:42:35 - ERROR - stderr - 83%|████████▎ | 3101/3741 [18:16:41<3:42:15, 20.84s/it] +2025-05-11 11:42:35 - ERROR - stderr - +2025-05-11 11:42:35 - ERROR - stderr - +2025-05-11 11:42:35 - INFO - stdout - {'loss': 0.4835, 'grad_norm': 0.8573647737503052, 'learning_rate': 1.4967562234193655e-06, 'epoch': 2.49} +2025-05-11 11:42:35 - ERROR - stderr - 83%|████████▎ | 3101/3741 [18:16:41<3:42:15, 20.84s/it] +2025-05-11 11:42:58 - ERROR - stderr - 83%|████████▎ | 3102/3741 [18:17:04<3:49:21, 21.54s/it] +2025-05-11 11:42:58 - ERROR - stderr - +2025-05-11 11:42:58 - ERROR - stderr - +2025-05-11 11:42:58 - INFO - stdout - {'loss': 0.4899, 'grad_norm': 0.8854028582572937, 'learning_rate': 1.4922023801293795e-06, 'epoch': 2.49} +2025-05-11 11:42:58 - ERROR - stderr - 83%|████████▎ | 3102/3741 [18:17:04<3:49:21, 21.54s/it] +2025-05-11 11:43:18 - ERROR - stderr - 83%|████████▎ | 3103/3741 [18:17:24<3:43:25, 21.01s/it] +2025-05-11 11:43:18 - ERROR - stderr - +2025-05-11 11:43:18 - ERROR - stderr - +2025-05-11 11:43:18 - INFO - stdout - {'loss': 0.468, 'grad_norm': 0.843406617641449, 'learning_rate': 1.4876549162757915e-06, 'epoch': 2.49} +2025-05-11 11:43:18 - ERROR - stderr - 83%|████████▎ | 3103/3741 [18:17:24<3:43:25, 21.01s/it] +2025-05-11 11:43:41 - ERROR - stderr - 83%|████████▎ | 3104/3741 [18:17:47<3:49:57, 21.66s/it] +2025-05-11 11:43:41 - ERROR - stderr - +2025-05-11 11:43:41 - ERROR - stderr - +2025-05-11 11:43:41 - INFO - stdout - {'loss': 0.467, 'grad_norm': 0.8388236165046692, 'learning_rate': 1.4831138352684482e-06, 'epoch': 2.49} +2025-05-11 11:43:41 - ERROR - stderr - 83%|████████▎ | 3104/3741 [18:17:47<3:49:57, 21.66s/it] +2025-05-11 11:44:01 - ERROR - stderr - 83%|████████▎ | 3105/3741 [18:18:07<3:42:53, 21.03s/it] +2025-05-11 11:44:01 - ERROR - stderr - +2025-05-11 11:44:01 - ERROR - stderr - +2025-05-11 11:44:01 - INFO - stdout - {'loss': 0.4789, 'grad_norm': 0.864824116230011, 'learning_rate': 1.4785791405123995e-06, 'epoch': 2.49} +2025-05-11 11:44:01 - ERROR - stderr - 83%|████████▎ | 3105/3741 [18:18:07<3:42:53, 21.03s/it] +2025-05-11 11:44:24 - ERROR - stderr - 83%|████████▎ | 3106/3741 [18:18:30<3:49:58, 21.73s/it] +2025-05-11 11:44:24 - ERROR - stderr - +2025-05-11 11:44:24 - ERROR - stderr - +2025-05-11 11:44:24 - INFO - stdout - {'loss': 0.4778, 'grad_norm': 0.8695855140686035, 'learning_rate': 1.474050835407923e-06, 'epoch': 2.49} +2025-05-11 11:44:24 - ERROR - stderr - 83%|████████▎ | 3106/3741 [18:18:30<3:49:58, 21.73s/it] +2025-05-11 11:44:44 - ERROR - stderr - 83%|████████▎ | 3107/3741 [18:18:50<3:42:43, 21.08s/it] +2025-05-11 11:44:44 - ERROR - stderr - +2025-05-11 11:44:44 - ERROR - stderr - +2025-05-11 11:44:44 - INFO - stdout - {'loss': 0.4668, 'grad_norm': 0.8766945004463196, 'learning_rate': 1.4695289233504894e-06, 'epoch': 2.49} +2025-05-11 11:44:44 - ERROR - stderr - 83%|████████▎ | 3107/3741 [18:18:50<3:42:43, 21.08s/it] +2025-05-11 11:45:06 - ERROR - stderr - 83%|████████▎ | 3108/3741 [18:19:12<3:47:04, 21.52s/it] +2025-05-11 11:45:06 - ERROR - stderr - +2025-05-11 11:45:06 - ERROR - stderr - +2025-05-11 11:45:06 - INFO - stdout - {'loss': 0.4725, 'grad_norm': 0.8195257782936096, 'learning_rate': 1.4650134077307853e-06, 'epoch': 2.49} +2025-05-11 11:45:06 - ERROR - stderr - 83%|████████▎ | 3108/3741 [18:19:12<3:47:04, 21.52s/it] +2025-05-11 11:45:26 - ERROR - stderr - 83%|████████▎ | 3109/3741 [18:19:33<3:42:23, 21.11s/it] +2025-05-11 11:45:26 - ERROR - stderr - +2025-05-11 11:45:26 - ERROR - stderr - +2025-05-11 11:45:26 - INFO - stdout - {'loss': 0.462, 'grad_norm': 0.8423482775688171, 'learning_rate': 1.4605042919347e-06, 'epoch': 2.49} +2025-05-11 11:45:26 - ERROR - stderr - 83%|████████▎ | 3109/3741 [18:19:33<3:42:23, 21.11s/it] +2025-05-11 11:45:47 - ERROR - stderr - 83%|████████▎ | 3110/3741 [18:19:53<3:40:21, 20.95s/it] +2025-05-11 11:45:47 - ERROR - stderr - +2025-05-11 11:45:47 - ERROR - stderr - +2025-05-11 11:45:47 - INFO - stdout - {'loss': 0.5092, 'grad_norm': 0.8940702080726624, 'learning_rate': 1.4560015793433145e-06, 'epoch': 2.49} +2025-05-11 11:45:47 - ERROR - stderr - 83%|████████▎ | 3110/3741 [18:19:53<3:40:21, 20.95s/it] +2025-05-11 11:46:07 - ERROR - stderr - 83%|████████▎ | 3111/3741 [18:20:13<3:36:34, 20.63s/it] +2025-05-11 11:46:07 - ERROR - stderr - +2025-05-11 11:46:07 - ERROR - stderr - +2025-05-11 11:46:07 - INFO - stdout - {'loss': 0.4787, 'grad_norm': 0.8922168612480164, 'learning_rate': 1.451505273332926e-06, 'epoch': 2.49} +2025-05-11 11:46:07 - ERROR - stderr - 83%|████████▎ | 3111/3741 [18:20:13<3:36:34, 20.63s/it] +2025-05-11 11:46:26 - ERROR - stderr - 83%|████████▎ | 3112/3741 [18:20:32<3:32:37, 20.28s/it] +2025-05-11 11:46:26 - ERROR - stderr - +2025-05-11 11:46:26 - ERROR - stderr - +2025-05-11 11:46:26 - INFO - stdout - {'loss': 0.487, 'grad_norm': 0.8811621069908142, 'learning_rate': 1.4470153772750118e-06, 'epoch': 2.5} +2025-05-11 11:46:26 - ERROR - stderr - 83%|████████▎ | 3112/3741 [18:20:33<3:32:37, 20.28s/it] +2025-05-11 11:46:46 - ERROR - stderr - 83%|████████▎ | 3113/3741 [18:20:52<3:30:22, 20.10s/it] +2025-05-11 11:46:46 - ERROR - stderr - +2025-05-11 11:46:46 - ERROR - stderr - +2025-05-11 11:46:46 - INFO - stdout - {'loss': 0.465, 'grad_norm': 0.8688673973083496, 'learning_rate': 1.4425318945362488e-06, 'epoch': 2.5} +2025-05-11 11:46:46 - ERROR - stderr - 83%|████████▎ | 3113/3741 [18:20:52<3:30:22, 20.10s/it] +2025-05-11 11:47:06 - ERROR - stderr - 83%|████████▎ | 3114/3741 [18:21:12<3:28:52, 19.99s/it] +2025-05-11 11:47:06 - ERROR - stderr - +2025-05-11 11:47:06 - ERROR - stderr - +2025-05-11 11:47:06 - INFO - stdout - {'loss': 0.4719, 'grad_norm': 0.8423486351966858, 'learning_rate': 1.438054828478509e-06, 'epoch': 2.5} +2025-05-11 11:47:06 - ERROR - stderr - 83%|████████▎ | 3114/3741 [18:21:12<3:28:52, 19.99s/it] +2025-05-11 11:47:28 - ERROR - stderr - 83%|████████▎ | 3115/3741 [18:21:34<3:35:59, 20.70s/it] +2025-05-11 11:47:28 - ERROR - stderr - +2025-05-11 11:47:28 - ERROR - stderr - +2025-05-11 11:47:28 - INFO - stdout - {'loss': 0.4796, 'grad_norm': 0.8947927355766296, 'learning_rate': 1.4335841824588436e-06, 'epoch': 2.5} +2025-05-11 11:47:28 - ERROR - stderr - 83%|████████▎ | 3115/3741 [18:21:34<3:35:59, 20.70s/it] +2025-05-11 11:47:47 - ERROR - stderr - 83%|████████▎ | 3116/3741 [18:21:54<3:31:35, 20.31s/it] +2025-05-11 11:47:47 - ERROR - stderr - +2025-05-11 11:47:47 - ERROR - stderr - +2025-05-11 11:47:47 - INFO - stdout - {'loss': 0.468, 'grad_norm': 0.8390734791755676, 'learning_rate': 1.429119959829499e-06, 'epoch': 2.5} +2025-05-11 11:47:47 - ERROR - stderr - 83%|████████▎ | 3116/3741 [18:21:54<3:31:35, 20.31s/it] +2025-05-11 11:48:11 - ERROR - stderr - 83%|████████▎ | 3117/3741 [18:22:18<3:42:44, 21.42s/it] +2025-05-11 11:48:11 - ERROR - stderr - +2025-05-11 11:48:11 - ERROR - stderr - +2025-05-11 11:48:11 - INFO - stdout - {'loss': 0.4661, 'grad_norm': 0.8673422336578369, 'learning_rate': 1.4246621639378998e-06, 'epoch': 2.5} +2025-05-11 11:48:11 - ERROR - stderr - 83%|████████▎ | 3117/3741 [18:22:18<3:42:44, 21.42s/it] +2025-05-11 11:48:31 - ERROR - stderr - 83%|████████▎ | 3118/3741 [18:22:37<3:36:42, 20.87s/it] +2025-05-11 11:48:31 - ERROR - stderr - +2025-05-11 11:48:31 - ERROR - stderr - +2025-05-11 11:48:31 - INFO - stdout - {'loss': 0.4809, 'grad_norm': 0.833712637424469, 'learning_rate': 1.4202107981266532e-06, 'epoch': 2.5} +2025-05-11 11:48:31 - ERROR - stderr - 83%|████████▎ | 3118/3741 [18:22:37<3:36:42, 20.87s/it] +2025-05-11 11:48:55 - ERROR - stderr - 83%|████████▎ | 3119/3741 [18:23:01<3:46:20, 21.83s/it] +2025-05-11 11:48:55 - ERROR - stderr - +2025-05-11 11:48:55 - ERROR - stderr - +2025-05-11 11:48:55 - INFO - stdout - {'loss': 0.4687, 'grad_norm': 0.8182956576347351, 'learning_rate': 1.4157658657335494e-06, 'epoch': 2.5} +2025-05-11 11:48:55 - ERROR - stderr - 83%|████████▎ | 3119/3741 [18:23:01<3:46:20, 21.83s/it] +2025-05-11 11:49:15 - ERROR - stderr - 83%|████████▎ | 3120/3741 [18:23:21<3:39:08, 21.17s/it] +2025-05-11 11:49:15 - ERROR - stderr - +2025-05-11 11:49:15 - ERROR - stderr - +2025-05-11 11:49:15 - INFO - stdout - {'loss': 0.48, 'grad_norm': 0.8784580826759338, 'learning_rate': 1.411327370091542e-06, 'epoch': 2.5} +2025-05-11 11:49:15 - ERROR - stderr - 83%|████████▎ | 3120/3741 [18:23:21<3:39:08, 21.17s/it] +2025-05-11 11:49:38 - ERROR - stderr - 83%|████████▎ | 3121/3741 [18:23:44<3:46:06, 21.88s/it] +2025-05-11 11:49:38 - ERROR - stderr - +2025-05-11 11:49:38 - ERROR - stderr - +2025-05-11 11:49:38 - INFO - stdout - {'loss': 0.5129, 'grad_norm': 0.8753635883331299, 'learning_rate': 1.406895314528771e-06, 'epoch': 2.5} +2025-05-11 11:49:38 - ERROR - stderr - 83%|████████▎ | 3121/3741 [18:23:45<3:46:06, 21.88s/it] +2025-05-11 11:49:58 - ERROR - stderr - 83%|████████��� | 3122/3741 [18:24:04<3:38:40, 21.20s/it] +2025-05-11 11:49:58 - ERROR - stderr - +2025-05-11 11:49:58 - ERROR - stderr - +2025-05-11 11:49:58 - INFO - stdout - {'loss': 0.4794, 'grad_norm': 0.8735189437866211, 'learning_rate': 1.4024697023685429e-06, 'epoch': 2.5} +2025-05-11 11:49:58 - ERROR - stderr - 83%|████████▎ | 3122/3741 [18:24:04<3:38:40, 21.20s/it] +2025-05-11 11:49:58 - INFO - stdout - WARNING: tokenization mismatch: 3167 vs. 3185. (ignored) +2025-05-11 11:50:18 - ERROR - stderr - 83%|████████▎ | 3123/3741 [18:24:25<3:36:17, 21.00s/it] +2025-05-11 11:50:18 - ERROR - stderr - +2025-05-11 11:50:18 - ERROR - stderr - +2025-05-11 11:50:18 - INFO - stdout - {'loss': 0.4402, 'grad_norm': 0.9127287864685059, 'learning_rate': 1.3980505369293306e-06, 'epoch': 2.5} +2025-05-11 11:50:18 - ERROR - stderr - 83%|████████▎ | 3123/3741 [18:24:25<3:36:17, 21.00s/it] +2025-05-11 11:50:38 - ERROR - stderr - 84%|████████▎ | 3124/3741 [18:24:44<3:32:15, 20.64s/it] +2025-05-11 11:50:38 - ERROR - stderr - +2025-05-11 11:50:38 - ERROR - stderr - +2025-05-11 11:50:38 - INFO - stdout - {'loss': 0.4943, 'grad_norm': 0.86842280626297, 'learning_rate': 1.3936378215247771e-06, 'epoch': 2.51} +2025-05-11 11:50:38 - ERROR - stderr - 84%|████████▎ | 3124/3741 [18:24:44<3:32:15, 20.64s/it] +2025-05-11 11:50:58 - ERROR - stderr - 84%|████████▎ | 3125/3741 [18:25:04<3:28:27, 20.30s/it] +2025-05-11 11:50:58 - ERROR - stderr - +2025-05-11 11:50:58 - ERROR - stderr - +2025-05-11 11:50:58 - INFO - stdout - {'loss': 0.4747, 'grad_norm': 0.8117753863334656, 'learning_rate': 1.389231559463684e-06, 'epoch': 2.51} +2025-05-11 11:50:58 - ERROR - stderr - 84%|████████▎ | 3125/3741 [18:25:04<3:28:27, 20.30s/it] +2025-05-11 11:51:17 - ERROR - stderr - 84%|████████▎ | 3126/3741 [18:25:24<3:26:30, 20.15s/it] +2025-05-11 11:51:17 - ERROR - stderr - +2025-05-11 11:51:17 - ERROR - stderr - +2025-05-11 11:51:17 - INFO - stdout - {'loss': 0.5145, 'grad_norm': 0.8560154438018799, 'learning_rate': 1.3848317540500178e-06, 'epoch': 2.51} +2025-05-11 11:51:17 - ERROR - stderr - 84%|████████▎ | 3126/3741 [18:25:24<3:26:30, 20.15s/it] +2025-05-11 11:51:37 - ERROR - stderr - 84%|████████▎ | 3127/3741 [18:25:43<3:23:52, 19.92s/it] +2025-05-11 11:51:37 - ERROR - stderr - +2025-05-11 11:51:37 - ERROR - stderr - +2025-05-11 11:51:37 - INFO - stdout - {'loss': 0.4802, 'grad_norm': 0.885636568069458, 'learning_rate': 1.3804384085829026e-06, 'epoch': 2.51} +2025-05-11 11:51:37 - ERROR - stderr - 84%|████████▎ | 3127/3741 [18:25:43<3:23:52, 19.92s/it] +2025-05-11 11:51:59 - ERROR - stderr - 84%|████████▎ | 3128/3741 [18:26:05<3:29:32, 20.51s/it] +2025-05-11 11:51:59 - ERROR - stderr - +2025-05-11 11:51:59 - ERROR - stderr - +2025-05-11 11:51:59 - INFO - stdout - {'loss': 0.4833, 'grad_norm': 0.8547667264938354, 'learning_rate': 1.376051526356621e-06, 'epoch': 2.51} +2025-05-11 11:51:59 - ERROR - stderr - 84%|████████▎ | 3128/3741 [18:26:05<3:29:32, 20.51s/it] +2025-05-11 11:52:18 - ERROR - stderr - 84%|████████▎ | 3129/3741 [18:26:25<3:26:19, 20.23s/it] +2025-05-11 11:52:18 - ERROR - stderr - +2025-05-11 11:52:18 - ERROR - stderr - +2025-05-11 11:52:18 - INFO - stdout - {'loss': 0.4416, 'grad_norm': 0.8286969065666199, 'learning_rate': 1.3716711106606007e-06, 'epoch': 2.51} +2025-05-11 11:52:18 - ERROR - stderr - 84%|████████▎ | 3129/3741 [18:26:25<3:26:19, 20.23s/it] +2025-05-11 11:52:41 - ERROR - stderr - 84%|████████▎ | 3130/3741 [18:26:48<3:34:19, 21.05s/it] +2025-05-11 11:52:41 - ERROR - stderr - +2025-05-11 11:52:41 - ERROR - stderr - +2025-05-11 11:52:41 - INFO - stdout - {'loss': 0.4571, 'grad_norm': 0.9037246704101562, 'learning_rate': 1.367297164779431e-06, 'epoch': 2.51} +2025-05-11 11:52:41 - ERROR - stderr - 84%|████████▎ | 3130/3741 [18:26:48<3:34:19, 21.05s/it] +2025-05-11 11:53:01 - ERROR - stderr - 84%|████████▎ | 3131/3741 [18:27:07<3:29:20, 20.59s/it] +2025-05-11 11:53:01 - ERROR - stderr - +2025-05-11 11:53:01 - ERROR - stderr - +2025-05-11 11:53:01 - INFO - stdout - {'loss': 0.4926, 'grad_norm': 0.8304953575134277, 'learning_rate': 1.3629296919928447e-06, 'epoch': 2.51} +2025-05-11 11:53:01 - ERROR - stderr - 84%|████████▎ | 3131/3741 [18:27:07<3:29:20, 20.59s/it] +2025-05-11 11:53:25 - ERROR - stderr - 84%|████████▎ | 3132/3741 [18:27:31<3:40:00, 21.68s/it] +2025-05-11 11:53:25 - ERROR - stderr - +2025-05-11 11:53:25 - ERROR - stderr - +2025-05-11 11:53:25 - INFO - stdout - {'loss': 0.4773, 'grad_norm': 0.876008927822113, 'learning_rate': 1.3585686955757205e-06, 'epoch': 2.51} +2025-05-11 11:53:25 - ERROR - stderr - 84%|████████▎ | 3132/3741 [18:27:31<3:40:00, 21.68s/it] +2025-05-11 11:53:44 - ERROR - stderr - 84%|████████▎ | 3133/3741 [18:27:51<3:32:28, 20.97s/it] +2025-05-11 11:53:44 - ERROR - stderr - +2025-05-11 11:53:44 - ERROR - stderr - +2025-05-11 11:53:44 - INFO - stdout - {'loss': 0.4937, 'grad_norm': 0.9258241653442383, 'learning_rate': 1.3542141787980855e-06, 'epoch': 2.51} +2025-05-11 11:53:44 - ERROR - stderr - 84%|████████▎ | 3133/3741 [18:27:51<3:32:28, 20.97s/it] +2025-05-11 11:54:08 - ERROR - stderr - 84%|████████▍ | 3134/3741 [18:28:14<3:39:08, 21.66s/it] +2025-05-11 11:54:08 - ERROR - stderr - +2025-05-11 11:54:08 - ERROR - stderr - +2025-05-11 11:54:08 - INFO - stdout - {'loss': 0.469, 'grad_norm': 0.846127986907959, 'learning_rate': 1.3498661449251006e-06, 'epoch': 2.51} +2025-05-11 11:54:08 - ERROR - stderr - 84%|████████▍ | 3134/3741 [18:28:14<3:39:08, 21.66s/it] +2025-05-11 11:54:27 - ERROR - stderr - 84%|████████▍ | 3135/3741 [18:28:33<3:31:58, 20.99s/it] +2025-05-11 11:54:27 - ERROR - stderr - +2025-05-11 11:54:27 - ERROR - stderr - +2025-05-11 11:54:27 - INFO - stdout - {'loss': 0.4554, 'grad_norm': 0.8556084036827087, 'learning_rate': 1.3455245972170694e-06, 'epoch': 2.51} +2025-05-11 11:54:27 - ERROR - stderr - 84%|████████▍ | 3135/3741 [18:28:33<3:31:58, 20.99s/it] +2025-05-11 11:54:50 - ERROR - stderr - 84%|████████▍ | 3136/3741 [18:28:56<3:37:39, 21.59s/it] +2025-05-11 11:54:50 - ERROR - stderr - +2025-05-11 11:54:50 - ERROR - stderr - +2025-05-11 11:54:50 - INFO - stdout - {'loss': 0.4738, 'grad_norm': 0.8977357745170593, 'learning_rate': 1.341189538929436e-06, 'epoch': 2.51} +2025-05-11 11:54:50 - ERROR - stderr - 84%|████████▍ | 3136/3741 [18:28:56<3:37:39, 21.59s/it] +2025-05-11 11:55:10 - ERROR - stderr - 84%|████████▍ | 3137/3741 [18:29:16<3:31:46, 21.04s/it] +2025-05-11 11:55:10 - ERROR - stderr - +2025-05-11 11:55:10 - ERROR - stderr - +2025-05-11 11:55:10 - INFO - stdout - {'loss': 0.4724, 'grad_norm': 0.8365195989608765, 'learning_rate': 1.3368609733127714e-06, 'epoch': 2.52} +2025-05-11 11:55:10 - ERROR - stderr - 84%|████████▍ | 3137/3741 [18:29:16<3:31:46, 21.04s/it] +2025-05-11 11:55:30 - ERROR - stderr - 84%|████████▍ | 3138/3741 [18:29:36<3:28:23, 20.74s/it] +2025-05-11 11:55:30 - ERROR - stderr - +2025-05-11 11:55:30 - ERROR - stderr - +2025-05-11 11:55:30 - INFO - stdout - {'loss': 0.4714, 'grad_norm': 0.8728964328765869, 'learning_rate': 1.3325389036127855e-06, 'epoch': 2.52} +2025-05-11 11:55:30 - ERROR - stderr - 84%|████████▍ | 3138/3741 [18:29:36<3:28:23, 20.74s/it] +2025-05-11 11:55:50 - ERROR - stderr - 84%|████████▍ | 3139/3741 [18:29:56<3:25:47, 20.51s/it] +2025-05-11 11:55:50 - ERROR - stderr - +2025-05-11 11:55:50 - ERROR - stderr - +2025-05-11 11:55:50 - INFO - stdout - {'loss': 0.4576, 'grad_norm': 0.8206337094306946, 'learning_rate': 1.3282233330703087e-06, 'epoch': 2.52} +2025-05-11 11:55:50 - ERROR - stderr - 84%|████████▍ | 3139/3741 [18:29:56<3:25:47, 20.51s/it] +2025-05-11 11:56:10 - ERROR - stderr - 84%|████████▍ | 3140/3741 [18:30:16<3:23:44, 20.34s/it] +2025-05-11 11:56:10 - ERROR - stderr - +2025-05-11 11:56:10 - ERROR - stderr - +2025-05-11 11:56:10 - INFO - stdout - {'loss': 0.4825, 'grad_norm': 0.8753436207771301, 'learning_rate': 1.3239142649213044e-06, 'epoch': 2.52} +2025-05-11 11:56:10 - ERROR - stderr - 84%|████████▍ | 3140/3741 [18:30:16<3:23:44, 20.34s/it] +2025-05-11 11:56:29 - ERROR - stderr - 84%|████████▍ | 3141/3741 [18:30:36<3:21:39, 20.17s/it] +2025-05-11 11:56:29 - ERROR - stderr - +2025-05-11 11:56:29 - ERROR - stderr - +2025-05-11 11:56:29 - INFO - stdout - {'loss': 0.4762, 'grad_norm': 0.8451640009880066, 'learning_rate': 1.3196117023968613e-06, 'epoch': 2.52} +2025-05-11 11:56:29 - ERROR - stderr - 84%|████████▍ | 3141/3741 [18:30:36<3:21:39, 20.17s/it] +2025-05-11 11:56:49 - ERROR - stderr - 84%|████████▍ | 3142/3741 [18:30:55<3:19:56, 20.03s/it] +2025-05-11 11:56:49 - ERROR - stderr - +2025-05-11 11:56:49 - ERROR - stderr - +2025-05-11 11:56:49 - INFO - stdout - {'loss': 0.4776, 'grad_norm': 0.8681123852729797, 'learning_rate': 1.315315648723181e-06, 'epoch': 2.52} +2025-05-11 11:56:49 - ERROR - stderr - 84%|████████▍ | 3142/3741 [18:30:55<3:19:56, 20.03s/it] +2025-05-11 11:57:10 - ERROR - stderr - 84%|████████▍ | 3143/3741 [18:31:16<3:21:01, 20.17s/it] +2025-05-11 11:57:10 - ERROR - stderr - +2025-05-11 11:57:10 - ERROR - stderr - +2025-05-11 11:57:10 - INFO - stdout - {'loss': 0.4872, 'grad_norm': 0.9339450001716614, 'learning_rate': 1.311026107121599e-06, 'epoch': 2.52} +2025-05-11 11:57:10 - ERROR - stderr - 84%|████████▍ | 3143/3741 [18:31:16<3:21:01, 20.17s/it] +2025-05-11 11:57:30 - ERROR - stderr - 84%|████████▍ | 3144/3741 [18:31:36<3:20:07, 20.11s/it] +2025-05-11 11:57:30 - ERROR - stderr - +2025-05-11 11:57:30 - ERROR - stderr - +2025-05-11 11:57:30 - INFO - stdout - {'loss': 0.4718, 'grad_norm': 0.8395841121673584, 'learning_rate': 1.3067430808085534e-06, 'epoch': 2.52} +2025-05-11 11:57:30 - ERROR - stderr - 84%|████████▍ | 3144/3741 [18:31:36<3:20:07, 20.11s/it] +2025-05-11 11:57:51 - ERROR - stderr - 84%|████████▍ | 3145/3741 [18:31:58<3:24:27, 20.58s/it] +2025-05-11 11:57:51 - ERROR - stderr - +2025-05-11 11:57:51 - ERROR - stderr - +2025-05-11 11:57:51 - INFO - stdout - {'loss': 0.4844, 'grad_norm': 0.8910388946533203, 'learning_rate': 1.3024665729956054e-06, 'epoch': 2.52} +2025-05-11 11:57:51 - ERROR - stderr - 84%|████████▍ | 3145/3741 [18:31:58<3:24:27, 20.58s/it] +2025-05-11 11:58:11 - ERROR - stderr - 84%|████████▍ | 3146/3741 [18:32:18<3:22:28, 20.42s/it] +2025-05-11 11:58:11 - ERROR - stderr - +2025-05-11 11:58:11 - ERROR - stderr - +2025-05-11 11:58:11 - INFO - stdout - {'loss': 0.4937, 'grad_norm': 0.8879891037940979, 'learning_rate': 1.2981965868894287e-06, 'epoch': 2.52} +2025-05-11 11:58:11 - ERROR - stderr - 84%|████████▍ | 3146/3741 [18:32:18<3:22:28, 20.42s/it] +2025-05-11 11:58:33 - ERROR - stderr - 84%|████████▍ | 3147/3741 [18:32:39<3:26:16, 20.84s/it] +2025-05-11 11:58:33 - ERROR - stderr - +2025-05-11 11:58:33 - ERROR - stderr - +2025-05-11 11:58:33 - INFO - stdout - {'loss': 0.4719, 'grad_norm': 0.8522821068763733, 'learning_rate': 1.2939331256917974e-06, 'epoch': 2.52} +2025-05-11 11:58:33 - ERROR - stderr - 84%|████████▍ | 3147/3741 [18:32:39<3:26:16, 20.84s/it] +2025-05-11 11:58:53 - ERROR - stderr - 84%|████████▍ | 3148/3741 [18:32:59<3:22:52, 20.53s/it] +2025-05-11 11:58:53 - ERROR - stderr - +2025-05-11 11:58:53 - ERROR - stderr - +2025-05-11 11:58:53 - INFO - stdout - {'loss': 0.4651, 'grad_norm': 0.8639122843742371, 'learning_rate': 1.2896761925996082e-06, 'epoch': 2.52} +2025-05-11 11:58:53 - ERROR - stderr - 84%|████████▍ | 3148/3741 [18:32:59<3:22:52, 20.53s/it] +2025-05-11 11:59:16 - ERROR - stderr - 84%|████████▍ | 3149/3741 [18:33:22<3:30:26, 21.33s/it] +2025-05-11 11:59:16 - ERROR - stderr - +2025-05-11 11:59:16 - ERROR - stderr - +2025-05-11 11:59:16 - INFO - stdout - {'loss': 0.4804, 'grad_norm': 0.832323431968689, 'learning_rate': 1.2854257908048483e-06, 'epoch': 2.53} +2025-05-11 11:59:16 - ERROR - stderr - 84%|████████▍ | 3149/3741 [18:33:22<3:30:26, 21.33s/it] +2025-05-11 11:59:36 - ERROR - stderr - 84%|████████▍ | 3150/3741 [18:33:42<3:25:46, 20.89s/it] +2025-05-11 11:59:36 - ERROR - stderr - +2025-05-11 11:59:36 - ERROR - stderr - +2025-05-11 11:59:36 - INFO - stdout - {'loss': 0.4942, 'grad_norm': 0.8917650580406189, 'learning_rate': 1.2811819234946165e-06, 'epoch': 2.53} +2025-05-11 11:59:36 - ERROR - stderr - 84%|████████▍ | 3150/3741 [18:33:42<3:25:46, 20.89s/it] +2025-05-11 11:59:59 - ERROR - stderr - 84%|████████▍ | 3151/3741 [18:34:06<3:32:11, 21.58s/it] +2025-05-11 11:59:59 - ERROR - stderr - +2025-05-11 11:59:59 - ERROR - stderr - +2025-05-11 11:59:59 - INFO - stdout - {'loss': 0.4817, 'grad_norm': 0.8569352030754089, 'learning_rate': 1.2769445938511104e-06, 'epoch': 2.53} +2025-05-11 11:59:59 - ERROR - stderr - 84%|████████▍ | 3151/3741 [18:34:06<3:32:11, 21.58s/it] +2025-05-11 12:00:19 - ERROR - stderr - 84%|████████▍ | 3152/3741 [18:34:25<3:26:28, 21.03s/it] +2025-05-11 12:00:19 - ERROR - stderr - +2025-05-11 12:00:19 - ERROR - stderr - +2025-05-11 12:00:19 - INFO - stdout - {'loss': 0.4906, 'grad_norm': 0.8589757084846497, 'learning_rate': 1.2727138050516175e-06, 'epoch': 2.53} +2025-05-11 12:00:19 - ERROR - stderr - 84%|████████▍ | 3152/3741 [18:34:25<3:26:28, 21.03s/it] +2025-05-11 12:00:43 - ERROR - stderr - 84%|████████▍ | 3153/3741 [18:34:49<3:34:58, 21.94s/it] +2025-05-11 12:00:43 - ERROR - stderr - +2025-05-11 12:00:43 - ERROR - stderr - +2025-05-11 12:00:43 - INFO - stdout - {'loss': 0.5072, 'grad_norm': 0.868578314781189, 'learning_rate': 1.2684895602685377e-06, 'epoch': 2.53} +2025-05-11 12:00:43 - ERROR - stderr - 84%|████████▍ | 3153/3741 [18:34:49<3:34:58, 21.94s/it] +2025-05-11 12:01:03 - ERROR - stderr - 84%|████████▍ | 3154/3741 [18:35:09<3:28:26, 21.31s/it] +2025-05-11 12:01:03 - ERROR - stderr - +2025-05-11 12:01:03 - ERROR - stderr - +2025-05-11 12:01:03 - INFO - stdout - {'loss': 0.4789, 'grad_norm': 0.8775882124900818, 'learning_rate': 1.264271862669344e-06, 'epoch': 2.53} +2025-05-11 12:01:03 - ERROR - stderr - 84%|████████▍ | 3154/3741 [18:35:09<3:28:26, 21.31s/it] +2025-05-11 12:01:27 - ERROR - stderr - 84%|████████▍ | 3155/3741 [18:35:33<3:36:25, 22.16s/it] +2025-05-11 12:01:27 - ERROR - stderr - +2025-05-11 12:01:27 - ERROR - stderr - +2025-05-11 12:01:27 - INFO - stdout - {'loss': 0.4941, 'grad_norm': 0.9492253661155701, 'learning_rate': 1.2600607154166146e-06, 'epoch': 2.53} +2025-05-11 12:01:27 - ERROR - stderr - 84%|████████▍ | 3155/3741 [18:35:33<3:36:25, 22.16s/it] +2025-05-11 12:01:47 - ERROR - stderr - 84%|████████▍ | 3156/3741 [18:35:53<3:28:38, 21.40s/it] +2025-05-11 12:01:47 - ERROR - stderr - +2025-05-11 12:01:47 - ERROR - stderr - +2025-05-11 12:01:47 - INFO - stdout - {'loss': 0.4633, 'grad_norm': 0.8520070910453796, 'learning_rate': 1.255856121668012e-06, 'epoch': 2.53} +2025-05-11 12:01:47 - ERROR - stderr - 84%|████████▍ | 3156/3741 [18:35:53<3:28:38, 21.40s/it] +2025-05-11 12:02:11 - ERROR - stderr - 84%|████████▍ | 3157/3741 [18:36:17<3:35:42, 22.16s/it] +2025-05-11 12:02:11 - ERROR - stderr - +2025-05-11 12:02:11 - ERROR - stderr - +2025-05-11 12:02:11 - INFO - stdout - {'loss': 0.4652, 'grad_norm': 0.8257150053977966, 'learning_rate': 1.2516580845762804e-06, 'epoch': 2.53} +2025-05-11 12:02:11 - ERROR - stderr - 84%|████████▍ | 3157/3741 [18:36:17<3:35:42, 22.16s/it] +2025-05-11 12:02:30 - ERROR - stderr - 84%|████████▍ | 3158/3741 [18:36:36<3:27:14, 21.33s/it] +2025-05-11 12:02:30 - ERROR - stderr - +2025-05-11 12:02:30 - ERROR - stderr - +2025-05-11 12:02:30 - INFO - stdout - {'loss': 0.4865, 'grad_norm': 0.885618269443512, 'learning_rate': 1.2474666072892527e-06, 'epoch': 2.53} +2025-05-11 12:02:30 - ERROR - stderr - 84%|████████▍ | 3158/3741 [18:36:36<3:27:14, 21.33s/it] +2025-05-11 12:02:54 - ERROR - stderr - 84%|████████▍ | 3159/3741 [18:37:00<3:33:39, 22.03s/it] +2025-05-11 12:02:54 - ERROR - stderr - +2025-05-11 12:02:54 - ERROR - stderr - +2025-05-11 12:02:54 - INFO - stdout - {'loss': 0.4958, 'grad_norm': 0.8766368627548218, 'learning_rate': 1.2432816929498425e-06, 'epoch': 2.53} +2025-05-11 12:02:54 - ERROR - stderr - 84%|████████▍ | 3159/3741 [18:37:00<3:33:39, 22.03s/it] +2025-05-11 12:03:13 - ERROR - stderr - 84%|████████▍ | 3160/3741 [18:37:20<3:26:45, 21.35s/it] +2025-05-11 12:03:13 - ERROR - stderr - +2025-05-11 12:03:13 - ERROR - stderr - +2025-05-11 12:03:13 - INFO - stdout - {'loss': 0.4913, 'grad_norm': 0.8656640648841858, 'learning_rate': 1.2391033446960355e-06, 'epoch': 2.53} +2025-05-11 12:03:13 - ERROR - stderr - 84%|████████▍ | 3160/3741 [18:37:20<3:26:45, 21.35s/it] +2025-05-11 12:03:36 - ERROR - stderr - 84%|████████▍ | 3161/3741 [18:37:43<3:31:18, 21.86s/it] +2025-05-11 12:03:36 - ERROR - stderr - +2025-05-11 12:03:36 - ERROR - stderr - +2025-05-11 12:03:36 - INFO - stdout - {'loss': 0.4721, 'grad_norm': 0.8739482760429382, 'learning_rate': 1.2349315656609085e-06, 'epoch': 2.53} +2025-05-11 12:03:36 - ERROR - stderr - 84%|████████▍ | 3161/3741 [18:37:43<3:31:18, 21.86s/it] +2025-05-11 12:03:56 - ERROR - stderr - 85%|████████▍ | 3162/3741 [18:38:03<3:25:02, 21.25s/it] +2025-05-11 12:03:56 - ERROR - stderr - +2025-05-11 12:03:56 - ERROR - stderr - +2025-05-11 12:03:56 - INFO - stdout - {'loss': 0.4921, 'grad_norm': 0.9599235653877258, 'learning_rate': 1.230766358972596e-06, 'epoch': 2.54} +2025-05-11 12:03:56 - ERROR - stderr - 85%|████████▍ | 3162/3741 [18:38:03<3:25:02, 21.25s/it] +2025-05-11 12:04:19 - ERROR - stderr - 85%|████████▍ | 3163/3741 [18:38:25<3:28:31, 21.65s/it] +2025-05-11 12:04:19 - ERROR - stderr - +2025-05-11 12:04:19 - ERROR - stderr - +2025-05-11 12:04:19 - INFO - stdout - {'loss': 0.4668, 'grad_norm': 0.8633760213851929, 'learning_rate': 1.2266077277543155e-06, 'epoch': 2.54} +2025-05-11 12:04:19 - ERROR - stderr - 85%|████████▍ | 3163/3741 [18:38:25<3:28:31, 21.65s/it] +2025-05-11 12:04:39 - ERROR - stderr - 85%|████████▍ | 3164/3741 [18:38:45<3:23:28, 21.16s/it] +2025-05-11 12:04:39 - ERROR - stderr - +2025-05-11 12:04:39 - ERROR - stderr - +2025-05-11 12:04:39 - INFO - stdout - {'loss': 0.4562, 'grad_norm': 0.8226913809776306, 'learning_rate': 1.22245567512435e-06, 'epoch': 2.54} +2025-05-11 12:04:39 - ERROR - stderr - 85%|████████▍ | 3164/3741 [18:38:45<3:23:28, 21.16s/it] +2025-05-11 12:05:03 - ERROR - stderr - 85%|████████▍ | 3165/3741 [18:39:09<3:31:30, 22.03s/it] +2025-05-11 12:05:03 - ERROR - stderr - +2025-05-11 12:05:03 - ERROR - stderr - +2025-05-11 12:05:03 - INFO - stdout - {'loss': 0.4515, 'grad_norm': 0.8109039664268494, 'learning_rate': 1.218310204196046e-06, 'epoch': 2.54} +2025-05-11 12:05:03 - ERROR - stderr - 85%|████████▍ | 3165/3741 [18:39:09<3:31:30, 22.03s/it] +2025-05-11 12:05:23 - ERROR - stderr - 85%|████████▍ | 3166/3741 [18:39:29<3:24:25, 21.33s/it] +2025-05-11 12:05:23 - ERROR - stderr - +2025-05-11 12:05:23 - ERROR - stderr - +2025-05-11 12:05:23 - INFO - stdout - {'loss': 0.4797, 'grad_norm': 0.8582691550254822, 'learning_rate': 1.214171318077827e-06, 'epoch': 2.54} +2025-05-11 12:05:23 - ERROR - stderr - 85%|████████▍ | 3166/3741 [18:39:29<3:24:25, 21.33s/it] +2025-05-11 12:05:46 - ERROR - stderr - 85%|████████▍ | 3167/3741 [18:39:52<3:28:40, 21.81s/it] +2025-05-11 12:05:46 - ERROR - stderr - +2025-05-11 12:05:46 - ERROR - stderr - +2025-05-11 12:05:46 - INFO - stdout - {'loss': 0.4859, 'grad_norm': 0.8951324224472046, 'learning_rate': 1.2100390198731627e-06, 'epoch': 2.54} +2025-05-11 12:05:46 - ERROR - stderr - 85%|████████▍ | 3167/3741 [18:39:52<3:28:40, 21.81s/it] +2025-05-11 12:06:05 - ERROR - stderr - 85%|████████▍ | 3168/3741 [18:40:12<3:22:26, 21.20s/it] +2025-05-11 12:06:05 - ERROR - stderr - +2025-05-11 12:06:05 - ERROR - stderr - +2025-05-11 12:06:05 - INFO - stdout - {'loss': 0.4909, 'grad_norm': 0.8818310499191284, 'learning_rate': 1.2059133126805956e-06, 'epoch': 2.54} +2025-05-11 12:06:05 - ERROR - stderr - 85%|████████▍ | 3168/3741 [18:40:12<3:22:26, 21.20s/it] +2025-05-11 12:06:28 - ERROR - stderr - 85%|████████▍ | 3169/3741 [18:40:34<3:26:48, 21.69s/it] +2025-05-11 12:06:28 - ERROR - stderr - +2025-05-11 12:06:28 - ERROR - stderr - +2025-05-11 12:06:28 - INFO - stdout - {'loss': 0.4824, 'grad_norm': 0.8491972088813782, 'learning_rate': 1.201794199593721e-06, 'epoch': 2.54} +2025-05-11 12:06:28 - ERROR - stderr - 85%|████████▍ | 3169/3741 [18:40:34<3:26:48, 21.69s/it] +2025-05-11 12:06:48 - ERROR - stderr - 85%|████████▍ | 3170/3741 [18:40:54<3:21:08, 21.14s/it] +2025-05-11 12:06:48 - ERROR - stderr - +2025-05-11 12:06:48 - ERROR - stderr - +2025-05-11 12:06:48 - INFO - stdout - {'loss': 0.4734, 'grad_norm': 0.9278321266174316, 'learning_rate': 1.197681683701185e-06, 'epoch': 2.54} +2025-05-11 12:06:48 - ERROR - stderr - 85%|████████▍ | 3170/3741 [18:40:54<3:21:08, 21.14s/it] +2025-05-11 12:07:11 - ERROR - stderr - 85%|████████▍ | 3171/3741 [18:41:18<3:27:08, 21.80s/it] +2025-05-11 12:07:11 - ERROR - stderr - +2025-05-11 12:07:11 - ERROR - stderr - +2025-05-11 12:07:11 - INFO - stdout - {'loss': 0.4577, 'grad_norm': 0.8294722437858582, 'learning_rate': 1.193575768086701e-06, 'epoch': 2.54} +2025-05-11 12:07:11 - ERROR - stderr - 85%|████████▍ | 3171/3741 [18:41:18<3:27:08, 21.80s/it] +2025-05-11 12:07:31 - ERROR - stderr - 85%|████████▍ | 3172/3741 [18:41:38<3:21:31, 21.25s/it] +2025-05-11 12:07:31 - ERROR - stderr - +2025-05-11 12:07:31 - ERROR - stderr - +2025-05-11 12:07:31 - INFO - stdout - {'loss': 0.4741, 'grad_norm': 0.7998701333999634, 'learning_rate': 1.1894764558290172e-06, 'epoch': 2.54} +2025-05-11 12:07:31 - ERROR - stderr - 85%|████████▍ | 3172/3741 [18:41:38<3:21:31, 21.25s/it] +2025-05-11 12:07:54 - ERROR - stderr - 85%|████████▍ | 3173/3741 [18:42:01<3:26:29, 21.81s/it] +2025-05-11 12:07:54 - ERROR - stderr - +2025-05-11 12:07:54 - ERROR - stderr - +2025-05-11 12:07:54 - INFO - stdout - {'loss': 0.4739, 'grad_norm': 0.8978516459465027, 'learning_rate': 1.1853837500019406e-06, 'epoch': 2.54} +2025-05-11 12:07:54 - ERROR - stderr - 85%|████████▍ | 3173/3741 [18:42:01<3:26:29, 21.81s/it] +2025-05-11 12:08:14 - ERROR - stderr - 85%|████████▍ | 3174/3741 [18:42:21<3:20:20, 21.20s/it] +2025-05-11 12:08:14 - ERROR - stderr - +2025-05-11 12:08:14 - ERROR - stderr - +2025-05-11 12:08:14 - INFO - stdout - {'loss': 0.4827, 'grad_norm': 0.8539999723434448, 'learning_rate': 1.1812976536743226e-06, 'epoch': 2.55} +2025-05-11 12:08:14 - ERROR - stderr - 85%|████████▍ | 3174/3741 [18:42:21<3:20:20, 21.20s/it] +2025-05-11 12:08:37 - ERROR - stderr - 85%|████████▍ | 3175/3741 [18:42:43<3:24:30, 21.68s/it] +2025-05-11 12:08:37 - ERROR - stderr - +2025-05-11 12:08:37 - ERROR - stderr - +2025-05-11 12:08:37 - INFO - stdout - {'loss': 0.4658, 'grad_norm': 0.8564225435256958, 'learning_rate': 1.1772181699100538e-06, 'epoch': 2.55} +2025-05-11 12:08:37 - ERROR - stderr - 85%|████████▍ | 3175/3741 [18:42:43<3:24:30, 21.68s/it] +2025-05-11 12:08:57 - ERROR - stderr - 85%|████████▍ | 3176/3741 [18:43:03<3:18:23, 21.07s/it] +2025-05-11 12:08:57 - ERROR - stderr - +2025-05-11 12:08:57 - ERROR - stderr - +2025-05-11 12:08:57 - INFO - stdout - {'loss': 0.4624, 'grad_norm': 0.8567453026771545, 'learning_rate': 1.1731453017680716e-06, 'epoch': 2.55} +2025-05-11 12:08:57 - ERROR - stderr - 85%|████████▍ | 3176/3741 [18:43:03<3:18:23, 21.07s/it] +2025-05-11 12:09:18 - ERROR - stderr - 85%|████████▍ | 3177/3741 [18:43:24<3:18:21, 21.10s/it] +2025-05-11 12:09:18 - ERROR - stderr - +2025-05-11 12:09:18 - ERROR - stderr - +2025-05-11 12:09:18 - INFO - stdout - {'loss': 0.4923, 'grad_norm': 0.8754240870475769, 'learning_rate': 1.169079052302352e-06, 'epoch': 2.55} +2025-05-11 12:09:18 - ERROR - stderr - 85%|████████▍ | 3177/3741 [18:43:24<3:18:21, 21.10s/it] +2025-05-11 12:09:38 - ERROR - stderr - 85%|████████▍ | 3178/3741 [18:43:44<3:15:08, 20.80s/it] +2025-05-11 12:09:38 - ERROR - stderr - +2025-05-11 12:09:38 - ERROR - stderr - +2025-05-11 12:09:38 - INFO - stdout - {'loss': 0.5005, 'grad_norm': 0.8477223515510559, 'learning_rate': 1.1650194245619062e-06, 'epoch': 2.55} +2025-05-11 12:09:38 - ERROR - stderr - 85%|████████▍ | 3178/3741 [18:43:44<3:15:08, 20.80s/it] +2025-05-11 12:09:58 - ERROR - stderr - 85%|████████▍ | 3179/3741 [18:44:05<3:13:49, 20.69s/it] +2025-05-11 12:09:58 - ERROR - stderr - +2025-05-11 12:09:58 - ERROR - stderr - +2025-05-11 12:09:58 - INFO - stdout - {'loss': 0.4712, 'grad_norm': 0.8424686789512634, 'learning_rate': 1.1609664215907846e-06, 'epoch': 2.55} +2025-05-11 12:09:58 - ERROR - stderr - 85%|████████▍ | 3179/3741 [18:44:05<3:13:49, 20.69s/it] +2025-05-11 12:10:19 - ERROR - stderr - 85%|████████▌ | 3180/3741 [18:44:25<3:12:09, 20.55s/it] +2025-05-11 12:10:19 - ERROR - stderr - +2025-05-11 12:10:19 - ERROR - stderr - +2025-05-11 12:10:19 - INFO - stdout - {'loss': 0.502, 'grad_norm': 0.8633278012275696, 'learning_rate': 1.1569200464280616e-06, 'epoch': 2.55} +2025-05-11 12:10:19 - ERROR - stderr - 85%|████████▌ | 3180/3741 [18:44:25<3:12:09, 20.55s/it] +2025-05-11 12:10:38 - ERROR - stderr - 85%|████████▌ | 3181/3741 [18:44:45<3:10:02, 20.36s/it] +2025-05-11 12:10:39 - ERROR - stderr - +2025-05-11 12:10:39 - ERROR - stderr - +2025-05-11 12:10:39 - INFO - stdout - {'loss': 0.4818, 'grad_norm': 0.8947163224220276, 'learning_rate': 1.1528803021078505e-06, 'epoch': 2.55} +2025-05-11 12:10:39 - ERROR - stderr - 85%|████████▌ | 3181/3741 [18:44:45<3:10:02, 20.36s/it] +2025-05-11 12:10:58 - ERROR - stderr - 85%|████████▌ | 3182/3741 [18:45:05<3:08:04, 20.19s/it] +2025-05-11 12:10:58 - ERROR - stderr - +2025-05-11 12:10:58 - ERROR - stderr - +2025-05-11 12:10:58 - INFO - stdout - {'loss': 0.4868, 'grad_norm': 0.8522531986236572, 'learning_rate': 1.148847191659288e-06, 'epoch': 2.55} +2025-05-11 12:10:58 - ERROR - stderr - 85%|████████▌ | 3182/3741 [18:45:05<3:08:04, 20.19s/it] +2025-05-11 12:11:18 - ERROR - stderr - 85%|████████▌ | 3183/3741 [18:45:24<3:06:09, 20.02s/it] +2025-05-11 12:11:18 - ERROR - stderr - +2025-05-11 12:11:18 - ERROR - stderr - +2025-05-11 12:11:18 - INFO - stdout - {'loss': 0.4717, 'grad_norm': 0.8658236265182495, 'learning_rate': 1.1448207181065385e-06, 'epoch': 2.55} +2025-05-11 12:11:18 - ERROR - stderr - 85%|████████▌ | 3183/3741 [18:45:24<3:06:09, 20.02s/it] +2025-05-11 12:11:39 - ERROR - stderr - 85%|████████▌ | 3184/3741 [18:45:45<3:08:26, 20.30s/it] +2025-05-11 12:11:39 - ERROR - stderr - +2025-05-11 12:11:39 - ERROR - stderr - +2025-05-11 12:11:39 - INFO - stdout - {'loss': 0.5037, 'grad_norm': 0.919199526309967, 'learning_rate': 1.1408008844687901e-06, 'epoch': 2.55} +2025-05-11 12:11:39 - ERROR - stderr - 85%|████████▌ | 3184/3741 [18:45:45<3:08:26, 20.30s/it] +2025-05-11 12:11:59 - ERROR - stderr - 85%|████████▌ | 3185/3741 [18:46:05<3:07:09, 20.20s/it] +2025-05-11 12:11:59 - ERROR - stderr - +2025-05-11 12:11:59 - ERROR - stderr - +2025-05-11 12:11:59 - INFO - stdout - {'loss': 0.4904, 'grad_norm': 0.8818656802177429, 'learning_rate': 1.1367876937602474e-06, 'epoch': 2.55} +2025-05-11 12:11:59 - ERROR - stderr - 85%|████████▌ | 3185/3741 [18:46:05<3:07:09, 20.20s/it] +2025-05-11 12:12:20 - ERROR - stderr - 85%|████████▌ | 3186/3741 [18:46:27<3:10:53, 20.64s/it] +2025-05-11 12:12:20 - ERROR - stderr - +2025-05-11 12:12:20 - ERROR - stderr - +2025-05-11 12:12:20 - INFO - stdout - {'loss': 0.4395, 'grad_norm': 0.8673224449157715, 'learning_rate': 1.1327811489901398e-06, 'epoch': 2.55} +2025-05-11 12:12:20 - ERROR - stderr - 85%|████████▌ | 3186/3741 [18:46:27<3:10:53, 20.64s/it] +2025-05-11 12:12:40 - ERROR - stderr - 85%|████████▌ | 3187/3741 [18:46:47<3:08:05, 20.37s/it] +2025-05-11 12:12:40 - ERROR - stderr - +2025-05-11 12:12:40 - ERROR - stderr - +2025-05-11 12:12:40 - INFO - stdout - {'loss': 0.4864, 'grad_norm': 0.8441712260246277, 'learning_rate': 1.1287812531627108e-06, 'epoch': 2.56} +2025-05-11 12:12:40 - ERROR - stderr - 85%|████████▌ | 3187/3741 [18:46:47<3:08:05, 20.37s/it] +2025-05-11 12:13:03 - ERROR - stderr - 85%|████████▌ | 3188/3741 [18:47:09<3:14:49, 21.14s/it] +2025-05-11 12:13:03 - ERROR - stderr - +2025-05-11 12:13:03 - ERROR - stderr - +2025-05-11 12:13:03 - INFO - stdout - {'loss': 0.4841, 'grad_norm': 0.8633151650428772, 'learning_rate': 1.1247880092772202e-06, 'epoch': 2.56} +2025-05-11 12:13:03 - ERROR - stderr - 85%|████████▌ | 3188/3741 [18:47:09<3:14:49, 21.14s/it] +2025-05-11 12:13:23 - ERROR - stderr - 85%|████████▌ | 3189/3741 [18:47:29<3:10:40, 20.73s/it] +2025-05-11 12:13:23 - ERROR - stderr - +2025-05-11 12:13:23 - ERROR - stderr - +2025-05-11 12:13:23 - INFO - stdout - {'loss': 0.4965, 'grad_norm': 0.8488594889640808, 'learning_rate': 1.120801420327935e-06, 'epoch': 2.56} +2025-05-11 12:13:23 - ERROR - stderr - 85%|████████▌ | 3189/3741 [18:47:29<3:10:40, 20.73s/it] +2025-05-11 12:13:46 - ERROR - stderr - 85%|████████▌ | 3190/3741 [18:47:53<3:18:05, 21.57s/it] +2025-05-11 12:13:46 - ERROR - stderr - +2025-05-11 12:13:46 - ERROR - stderr - +2025-05-11 12:13:46 - INFO - stdout - {'loss': 0.4798, 'grad_norm': 0.8840992450714111, 'learning_rate': 1.1168214893041363e-06, 'epoch': 2.56} +2025-05-11 12:13:46 - ERROR - stderr - 85%|████████▌ | 3190/3741 [18:47:53<3:18:05, 21.57s/it] +2025-05-11 12:14:06 - ERROR - stderr - 85%|████████▌ | 3191/3741 [18:48:13<3:13:17, 21.09s/it] +2025-05-11 12:14:06 - ERROR - stderr - +2025-05-11 12:14:06 - ERROR - stderr - +2025-05-11 12:14:06 - INFO - stdout - {'loss': 0.4761, 'grad_norm': 0.909084677696228, 'learning_rate': 1.1128482191901124e-06, 'epoch': 2.56} +2025-05-11 12:14:06 - ERROR - stderr - 85%|████████▌ | 3191/3741 [18:48:13<3:13:17, 21.09s/it] +2025-05-11 12:14:29 - ERROR - stderr - 85%|████████▌ | 3192/3741 [18:48:36<3:17:53, 21.63s/it] +2025-05-11 12:14:29 - ERROR - stderr - +2025-05-11 12:14:29 - ERROR - stderr - +2025-05-11 12:14:29 - INFO - stdout - {'loss': 0.4709, 'grad_norm': 0.8316271901130676, 'learning_rate': 1.1088816129651569e-06, 'epoch': 2.56} +2025-05-11 12:14:29 - ERROR - stderr - 85%|████████▌ | 3192/3741 [18:48:36<3:17:53, 21.63s/it] +2025-05-11 12:14:49 - ERROR - stderr - 85%|████████▌ | 3193/3741 [18:48:55<3:12:06, 21.03s/it] +2025-05-11 12:14:49 - ERROR - stderr - +2025-05-11 12:14:49 - ERROR - stderr - +2025-05-11 12:14:49 - INFO - stdout - {'loss': 0.4643, 'grad_norm': 0.8329962491989136, 'learning_rate': 1.1049216736035673e-06, 'epoch': 2.56} +2025-05-11 12:14:49 - ERROR - stderr - 85%|████████▌ | 3193/3741 [18:48:55<3:12:06, 21.03s/it] +2025-05-11 12:15:09 - ERROR - stderr - 85%|████████▌ | 3194/3741 [18:49:15<3:09:06, 20.74s/it] +2025-05-11 12:15:09 - ERROR - stderr - +2025-05-11 12:15:09 - ERROR - stderr - +2025-05-11 12:15:09 - INFO - stdout - {'loss': 0.4716, 'grad_norm': 0.8709492683410645, 'learning_rate': 1.1009684040746394e-06, 'epoch': 2.56} +2025-05-11 12:15:09 - ERROR - stderr - 85%|████████▌ | 3194/3741 [18:49:15<3:09:06, 20.74s/it] +2025-05-11 12:15:29 - ERROR - stderr - 85%|████████▌ | 3195/3741 [18:49:35<3:06:04, 20.45s/it] +2025-05-11 12:15:29 - ERROR - stderr - +2025-05-11 12:15:29 - ERROR - stderr - +2025-05-11 12:15:29 - INFO - stdout - {'loss': 0.4831, 'grad_norm': 0.8427078723907471, 'learning_rate': 1.0970218073426674e-06, 'epoch': 2.56} +2025-05-11 12:15:29 - ERROR - stderr - 85%|████████▌ | 3195/3741 [18:49:35<3:06:04, 20.45s/it] +2025-05-11 12:15:49 - ERROR - stderr - 85%|████████▌ | 3196/3741 [18:49:55<3:04:24, 20.30s/it] +2025-05-11 12:15:49 - ERROR - stderr - +2025-05-11 12:15:49 - ERROR - stderr - +2025-05-11 12:15:49 - INFO - stdout - {'loss': 0.4631, 'grad_norm': 0.8805751204490662, 'learning_rate': 1.093081886366948e-06, 'epoch': 2.56} +2025-05-11 12:15:49 - ERROR - stderr - 85%|████████▌ | 3196/3741 [18:49:55<3:04:24, 20.30s/it] +2025-05-11 12:16:10 - ERROR - stderr - 85%|████████▌ | 3197/3741 [18:50:16<3:05:21, 20.44s/it] +2025-05-11 12:16:10 - ERROR - stderr - +2025-05-11 12:16:10 - ERROR - stderr - +2025-05-11 12:16:10 - INFO - stdout - {'loss': 0.4822, 'grad_norm': 0.8472068905830383, 'learning_rate': 1.0891486441017652e-06, 'epoch': 2.56} +2025-05-11 12:16:10 - ERROR - stderr - 85%|████████▌ | 3197/3741 [18:50:16<3:05:21, 20.44s/it] +2025-05-11 12:16:29 - ERROR - stderr - 85%|████████▌ | 3198/3741 [18:50:35<3:02:51, 20.21s/it] +2025-05-11 12:16:29 - ERROR - stderr - +2025-05-11 12:16:29 - ERROR - stderr - +2025-05-11 12:16:29 - INFO - stdout - {'loss': 0.4685, 'grad_norm': 0.8687289357185364, 'learning_rate': 1.085222083496401e-06, 'epoch': 2.56} +2025-05-11 12:16:29 - ERROR - stderr - 85%|████████▌ | 3198/3741 [18:50:35<3:02:51, 20.21s/it] +2025-05-11 12:16:50 - ERROR - stderr - 86%|████████▌ | 3199/3741 [18:50:56<3:04:01, 20.37s/it] +2025-05-11 12:16:50 - ERROR - stderr - +2025-05-11 12:16:50 - ERROR - stderr - +2025-05-11 12:16:50 - INFO - stdout - {'loss': 0.4577, 'grad_norm': 0.857964813709259, 'learning_rate': 1.0813022074951208e-06, 'epoch': 2.57} +2025-05-11 12:16:50 - ERROR - stderr - 86%|████████▌ | 3199/3741 [18:50:56<3:04:01, 20.37s/it] +2025-05-11 12:17:10 - ERROR - stderr - 86%|████████▌ | 3200/3741 [18:51:16<3:03:06, 20.31s/it] +2025-05-11 12:17:10 - ERROR - stderr - +2025-05-11 12:17:10 - ERROR - stderr - +2025-05-11 12:17:10 - INFO - stdout - {'loss': 0.4715, 'grad_norm': 0.8346413373947144, 'learning_rate': 1.0773890190371828e-06, 'epoch': 2.57} +2025-05-11 12:17:10 - ERROR - stderr - 86%|████████▌ | 3200/3741 [18:51:16<3:03:06, 20.31s/it] +2025-05-11 12:17:32 - ERROR - stderr - 86%|████████▌ | 3201/3741 [18:51:38<3:06:44, 20.75s/it] +2025-05-11 12:17:32 - ERROR - stderr - +2025-05-11 12:17:32 - ERROR - stderr - +2025-05-11 12:17:32 - INFO - stdout - {'loss': 0.5006, 'grad_norm': 0.8784050345420837, 'learning_rate': 1.07348252105683e-06, 'epoch': 2.57} +2025-05-11 12:17:32 - ERROR - stderr - 86%|████████▌ | 3201/3741 [18:51:38<3:06:44, 20.75s/it] +2025-05-11 12:17:52 - ERROR - stderr - 86%|████████▌ | 3202/3741 [18:51:58<3:03:31, 20.43s/it] +2025-05-11 12:17:52 - ERROR - stderr - +2025-05-11 12:17:52 - ERROR - stderr - +2025-05-11 12:17:52 - INFO - stdout - {'loss': 0.486, 'grad_norm': 0.8425981402397156, 'learning_rate': 1.0695827164832828e-06, 'epoch': 2.57} +2025-05-11 12:17:52 - ERROR - stderr - 86%|████████▌ | 3202/3741 [18:51:58<3:03:31, 20.43s/it] +2025-05-11 12:18:14 - ERROR - stderr - 86%|████████▌ | 3203/3741 [18:52:20<3:09:06, 21.09s/it] +2025-05-11 12:18:14 - ERROR - stderr - +2025-05-11 12:18:14 - ERROR - stderr - +2025-05-11 12:18:14 - INFO - stdout - {'loss': 0.4815, 'grad_norm': 0.886447012424469, 'learning_rate': 1.0656896082407554e-06, 'epoch': 2.57} +2025-05-11 12:18:14 - ERROR - stderr - 86%|████████▌ | 3203/3741 [18:52:21<3:09:06, 21.09s/it] +2025-05-11 12:18:34 - ERROR - stderr - 86%|████████▌ | 3204/3741 [18:52:40<3:04:48, 20.65s/it] +2025-05-11 12:18:34 - ERROR - stderr - +2025-05-11 12:18:34 - ERROR - stderr - +2025-05-11 12:18:34 - INFO - stdout - {'loss': 0.4633, 'grad_norm': 0.8257114291191101, 'learning_rate': 1.0618031992484267e-06, 'epoch': 2.57} +2025-05-11 12:18:34 - ERROR - stderr - 86%|████████▌ | 3204/3741 [18:52:40<3:04:48, 20.65s/it] +2025-05-11 12:18:57 - ERROR - stderr - 86%|████████▌ | 3205/3741 [18:53:03<3:10:52, 21.37s/it] +2025-05-11 12:18:57 - ERROR - stderr - +2025-05-11 12:18:57 - ERROR - stderr - +2025-05-11 12:18:57 - INFO - stdout - {'loss': 0.4793, 'grad_norm': 0.8750101923942566, 'learning_rate': 1.0579234924204608e-06, 'epoch': 2.57} +2025-05-11 12:18:57 - ERROR - stderr - 86%|████████▌ | 3205/3741 [18:53:03<3:10:52, 21.37s/it] +2025-05-11 12:19:16 - ERROR - stderr - 86%|████████▌ | 3206/3741 [18:53:23<3:05:54, 20.85s/it] +2025-05-11 12:19:16 - ERROR - stderr - +2025-05-11 12:19:16 - ERROR - stderr - +2025-05-11 12:19:16 - INFO - stdout - {'loss': 0.4668, 'grad_norm': 0.9001625180244446, 'learning_rate': 1.0540504906659955e-06, 'epoch': 2.57} +2025-05-11 12:19:16 - ERROR - stderr - 86%|████████▌ | 3206/3741 [18:53:23<3:05:54, 20.85s/it] +2025-05-11 12:19:39 - ERROR - stderr - 86%|████████▌ | 3207/3741 [18:53:45<3:09:35, 21.30s/it] +2025-05-11 12:19:39 - ERROR - stderr - +2025-05-11 12:19:39 - ERROR - stderr - +2025-05-11 12:19:39 - INFO - stdout - {'loss': 0.4718, 'grad_norm': 0.8314618468284607, 'learning_rate': 1.0501841968891324e-06, 'epoch': 2.57} +2025-05-11 12:19:39 - ERROR - stderr - 86%|████████▌ | 3207/3741 [18:53:45<3:09:35, 21.30s/it] +2025-05-11 12:19:59 - ERROR - stderr - 86%|████████▌ | 3208/3741 [18:54:05<3:05:02, 20.83s/it] +2025-05-11 12:19:59 - ERROR - stderr - +2025-05-11 12:19:59 - ERROR - stderr - +2025-05-11 12:19:59 - INFO - stdout - {'loss': 0.4886, 'grad_norm': 0.8423780202865601, 'learning_rate': 1.0463246139889604e-06, 'epoch': 2.57} +2025-05-11 12:19:59 - ERROR - stderr - 86%|████████▌ | 3208/3741 [18:54:05<3:05:02, 20.83s/it] +2025-05-11 12:20:22 - ERROR - stderr - 86%|████████▌ | 3209/3741 [18:54:28<3:10:44, 21.51s/it] +2025-05-11 12:20:22 - ERROR - stderr - +2025-05-11 12:20:22 - ERROR - stderr - +2025-05-11 12:20:22 - INFO - stdout - {'loss': 0.4784, 'grad_norm': 0.8424227237701416, 'learning_rate': 1.04247174485952e-06, 'epoch': 2.57} +2025-05-11 12:20:22 - ERROR - stderr - 86%|████████▌ | 3209/3741 [18:54:28<3:10:44, 21.51s/it] +2025-05-11 12:20:42 - ERROR - stderr - 86%|████████▌ | 3210/3741 [18:54:48<3:06:39, 21.09s/it] +2025-05-11 12:20:42 - ERROR - stderr - +2025-05-11 12:20:42 - ERROR - stderr - +2025-05-11 12:20:42 - INFO - stdout - {'loss': 0.4853, 'grad_norm': 0.8639745116233826, 'learning_rate': 1.0386255923898236e-06, 'epoch': 2.57} +2025-05-11 12:20:42 - ERROR - stderr - 86%|████████▌ | 3210/3741 [18:54:48<3:06:39, 21.09s/it] +2025-05-11 12:21:05 - ERROR - stderr - 86%|████████▌ | 3211/3741 [18:55:11<3:11:35, 21.69s/it] +2025-05-11 12:21:05 - ERROR - stderr - +2025-05-11 12:21:05 - ERROR - stderr - +2025-05-11 12:21:05 - INFO - stdout - {'loss': 0.4797, 'grad_norm': 0.906941294670105, 'learning_rate': 1.0347861594638519e-06, 'epoch': 2.57} +2025-05-11 12:21:05 - ERROR - stderr - 86%|████████▌ | 3211/3741 [18:55:11<3:11:35, 21.69s/it] +2025-05-11 12:21:25 - ERROR - stderr - 86%|████████▌ | 3212/3741 [18:55:31<3:07:06, 21.22s/it] +2025-05-11 12:21:25 - ERROR - stderr - +2025-05-11 12:21:25 - ERROR - stderr - +2025-05-11 12:21:25 - INFO - stdout - {'loss': 0.4974, 'grad_norm': 0.8813538551330566, 'learning_rate': 1.0309534489605344e-06, 'epoch': 2.58} +2025-05-11 12:21:25 - ERROR - stderr - 86%|████████▌ | 3212/3741 [18:55:31<3:07:06, 21.22s/it] +2025-05-11 12:21:48 - ERROR - stderr - 86%|████████▌ | 3213/3741 [18:55:54<3:10:37, 21.66s/it] +2025-05-11 12:21:48 - ERROR - stderr - +2025-05-11 12:21:48 - ERROR - stderr - +2025-05-11 12:21:48 - INFO - stdout - {'loss': 0.476, 'grad_norm': 0.8710605502128601, 'learning_rate': 1.0271274637537764e-06, 'epoch': 2.58} +2025-05-11 12:21:48 - ERROR - stderr - 86%|████████▌ | 3213/3741 [18:55:54<3:10:37, 21.66s/it] +2025-05-11 12:22:07 - ERROR - stderr - 86%|████████▌ | 3214/3741 [18:56:14<3:04:57, 21.06s/it] +2025-05-11 12:22:07 - ERROR - stderr - +2025-05-11 12:22:07 - ERROR - stderr - +2025-05-11 12:22:07 - INFO - stdout - {'loss': 0.4821, 'grad_norm': 0.9004622101783752, 'learning_rate': 1.0233082067124266e-06, 'epoch': 2.58} +2025-05-11 12:22:07 - ERROR - stderr - 86%|████████▌ | 3214/3741 [18:56:14<3:04:57, 21.06s/it] +2025-05-11 12:22:30 - ERROR - stderr - 86%|████████▌ | 3215/3741 [18:56:36<3:08:07, 21.46s/it] +2025-05-11 12:22:30 - ERROR - stderr - +2025-05-11 12:22:30 - ERROR - stderr - +2025-05-11 12:22:30 - INFO - stdout - {'loss': 0.4919, 'grad_norm': 0.9231355786323547, 'learning_rate': 1.0194956807002965e-06, 'epoch': 2.58} +2025-05-11 12:22:30 - ERROR - stderr - 86%|████████▌ | 3215/3741 [18:56:36<3:08:07, 21.46s/it] +2025-05-11 12:22:49 - ERROR - stderr - 86%|████████▌ | 3216/3741 [18:56:56<3:02:44, 20.88s/it] +2025-05-11 12:22:49 - ERROR - stderr - +2025-05-11 12:22:49 - ERROR - stderr - +2025-05-11 12:22:49 - INFO - stdout - {'loss': 0.4442, 'grad_norm': 0.8449090719223022, 'learning_rate': 1.015689888576149e-06, 'epoch': 2.58} +2025-05-11 12:22:49 - ERROR - stderr - 86%|████████▌ | 3216/3741 [18:56:56<3:02:44, 20.88s/it] +2025-05-11 12:23:11 - ERROR - stderr - 86%|████████▌ | 3217/3741 [18:57:18<3:05:32, 21.25s/it] +2025-05-11 12:23:11 - ERROR - stderr - +2025-05-11 12:23:11 - ERROR - stderr - +2025-05-11 12:23:11 - INFO - stdout - {'loss': 0.4931, 'grad_norm': 0.9300955533981323, 'learning_rate': 1.0118908331936915e-06, 'epoch': 2.58} +2025-05-11 12:23:11 - ERROR - stderr - 86%|████████▌ | 3217/3741 [18:57:18<3:05:32, 21.25s/it] +2025-05-11 12:23:31 - ERROR - stderr - 86%|████████▌ | 3218/3741 [18:57:37<3:00:36, 20.72s/it] +2025-05-11 12:23:31 - ERROR - stderr - +2025-05-11 12:23:31 - ERROR - stderr - +2025-05-11 12:23:31 - INFO - stdout - {'loss': 0.4689, 'grad_norm': 0.8253883719444275, 'learning_rate': 1.0080985174015901e-06, 'epoch': 2.58} +2025-05-11 12:23:31 - ERROR - stderr - 86%|████████▌ | 3218/3741 [18:57:37<3:00:36, 20.72s/it] +2025-05-11 12:23:53 - ERROR - stderr - 86%|████████▌ | 3219/3741 [18:58:00<3:05:03, 21.27s/it] +2025-05-11 12:23:53 - ERROR - stderr - +2025-05-11 12:23:53 - ERROR - stderr - +2025-05-11 12:23:53 - INFO - stdout - {'loss': 0.4745, 'grad_norm': 0.9097961783409119, 'learning_rate': 1.0043129440434496e-06, 'epoch': 2.58} +2025-05-11 12:23:53 - ERROR - stderr - 86%|████████▌ | 3219/3741 [18:58:00<3:05:03, 21.27s/it] +2025-05-11 12:24:13 - ERROR - stderr - 86%|████████▌ | 3220/3741 [18:58:20<3:01:03, 20.85s/it] +2025-05-11 12:24:13 - ERROR - stderr - +2025-05-11 12:24:13 - ERROR - stderr - +2025-05-11 12:24:13 - INFO - stdout - {'loss': 0.4759, 'grad_norm': 0.8564082384109497, 'learning_rate': 1.000534115957823e-06, 'epoch': 2.58} +2025-05-11 12:24:13 - ERROR - stderr - 86%|████████▌ | 3220/3741 [18:58:20<3:01:03, 20.85s/it] +2025-05-11 12:24:33 - ERROR - stderr - 86%|████████▌ | 3221/3741 [18:58:40<2:58:28, 20.59s/it] +2025-05-11 12:24:33 - ERROR - stderr - +2025-05-11 12:24:33 - ERROR - stderr - +2025-05-11 12:24:33 - INFO - stdout - {'loss': 0.4745, 'grad_norm': 0.8668890595436096, 'learning_rate': 9.96762035978206e-07, 'epoch': 2.58} +2025-05-11 12:24:33 - ERROR - stderr - 86%|████████▌ | 3221/3741 [18:58:40<2:58:28, 20.59s/it] +2025-05-11 12:24:53 - ERROR - stderr - 86%|████████▌ | 3222/3741 [18:59:00<2:56:51, 20.45s/it] +2025-05-11 12:24:53 - ERROR - stderr - +2025-05-11 12:24:53 - ERROR - stderr - +2025-05-11 12:24:53 - INFO - stdout - {'loss': 0.4765, 'grad_norm': 0.8841666579246521, 'learning_rate': 9.929967069330282e-07, 'epoch': 2.58} +2025-05-11 12:24:53 - ERROR - stderr - 86%|████████▌ | 3222/3741 [18:59:00<2:56:51, 20.45s/it] +2025-05-11 12:25:13 - ERROR - stderr - 86%|████████▌ | 3223/3741 [18:59:20<2:54:53, 20.26s/it] +2025-05-11 12:25:13 - ERROR - stderr - +2025-05-11 12:25:13 - ERROR - stderr - +2025-05-11 12:25:13 - INFO - stdout - {'loss': 0.4634, 'grad_norm': 0.8739838600158691, 'learning_rate': 9.892381316456656e-07, 'epoch': 2.58} +2025-05-11 12:25:13 - ERROR - stderr - 86%|████████▌ | 3223/3741 [18:59:20<2:54:53, 20.26s/it] +2025-05-11 12:25:33 - ERROR - stderr - 86%|████████▌ | 3224/3741 [18:59:39<2:53:23, 20.12s/it] +2025-05-11 12:25:33 - ERROR - stderr - +2025-05-11 12:25:33 - ERROR - stderr - +2025-05-11 12:25:33 - INFO - stdout - {'loss': 0.4719, 'grad_norm': 0.8660984039306641, 'learning_rate': 9.854863129344229e-07, 'epoch': 2.59} +2025-05-11 12:25:33 - ERROR - stderr - 86%|████████▌ | 3224/3741 [18:59:39<2:53:23, 20.12s/it] +2025-05-11 12:25:53 - ERROR - stderr - 86%|████████▌ | 3225/3741 [18:59:59<2:51:59, 20.00s/it] +2025-05-11 12:25:53 - ERROR - stderr - +2025-05-11 12:25:53 - ERROR - stderr - +2025-05-11 12:25:53 - INFO - stdout - {'loss': 0.479, 'grad_norm': 0.8962447643280029, 'learning_rate': 9.817412536125449e-07, 'epoch': 2.59} +2025-05-11 12:25:53 - ERROR - stderr - 86%|████████▌ | 3225/3741 [18:59:59<2:51:59, 20.00s/it] +2025-05-11 12:26:13 - ERROR - stderr - 86%|████████▌ | 3226/3741 [19:00:19<2:52:20, 20.08s/it] +2025-05-11 12:26:13 - ERROR - stderr - +2025-05-11 12:26:13 - ERROR - stderr - +2025-05-11 12:26:13 - INFO - stdout - {'loss': 0.4506, 'grad_norm': 0.8654753565788269, 'learning_rate': 9.780029564882032e-07, 'epoch': 2.59} +2025-05-11 12:26:13 - ERROR - stderr - 86%|████████▌ | 3226/3741 [19:00:19<2:52:20, 20.08s/it] +2025-05-11 12:26:33 - ERROR - stderr - 86%|████████▋ | 3227/3741 [19:00:39<2:51:13, 19.99s/it] +2025-05-11 12:26:33 - ERROR - stderr - +2025-05-11 12:26:33 - ERROR - stderr - +2025-05-11 12:26:33 - INFO - stdout - {'loss': 0.459, 'grad_norm': 0.8893976211547852, 'learning_rate': 9.74271424364498e-07, 'epoch': 2.59} +2025-05-11 12:26:33 - ERROR - stderr - 86%|████████▋ | 3227/3741 [19:00:39<2:51:13, 19.99s/it] +2025-05-11 12:26:55 - ERROR - stderr - 86%|████████▋ | 3228/3741 [19:01:01<2:56:22, 20.63s/it] +2025-05-11 12:26:55 - ERROR - stderr - +2025-05-11 12:26:55 - ERROR - stderr - +2025-05-11 12:26:55 - INFO - stdout - {'loss': 0.4895, 'grad_norm': 0.9372872710227966, 'learning_rate': 9.70546660039462e-07, 'epoch': 2.59} +2025-05-11 12:26:55 - ERROR - stderr - 86%|████████▋ | 3228/3741 [19:01:01<2:56:22, 20.63s/it] +2025-05-11 12:27:15 - ERROR - stderr - 86%|████████▋ | 3229/3741 [19:01:21<2:53:53, 20.38s/it] +2025-05-11 12:27:15 - ERROR - stderr - +2025-05-11 12:27:15 - ERROR - stderr - +2025-05-11 12:27:15 - INFO - stdout - {'loss': 0.4948, 'grad_norm': 0.8435534238815308, 'learning_rate': 9.66828666306049e-07, 'epoch': 2.59} +2025-05-11 12:27:15 - ERROR - stderr - 86%|████████▋ | 3229/3741 [19:01:21<2:53:53, 20.38s/it] +2025-05-11 12:27:37 - ERROR - stderr - 86%|████████▋ | 3230/3741 [19:01:43<2:58:53, 21.00s/it] +2025-05-11 12:27:37 - ERROR - stderr - +2025-05-11 12:27:37 - ERROR - stderr - +2025-05-11 12:27:37 - INFO - stdout - {'loss': 0.4559, 'grad_norm': 0.8981894850730896, 'learning_rate': 9.631174459521398e-07, 'epoch': 2.59} +2025-05-11 12:27:37 - ERROR - stderr - 86%|████████▋ | 3230/3741 [19:01:43<2:58:53, 21.00s/it] +2025-05-11 12:27:57 - ERROR - stderr - 86%|████████▋ | 3231/3741 [19:02:04<2:56:15, 20.74s/it] +2025-05-11 12:27:57 - ERROR - stderr - +2025-05-11 12:27:57 - ERROR - stderr - +2025-05-11 12:27:57 - INFO - stdout - {'loss': 0.4704, 'grad_norm': 0.8509783744812012, 'learning_rate': 9.594130017605296e-07, 'epoch': 2.59} +2025-05-11 12:27:57 - ERROR - stderr - 86%|████████▋ | 3231/3741 [19:02:04<2:56:15, 20.74s/it] +2025-05-11 12:28:19 - ERROR - stderr - 86%|████████▋ | 3232/3741 [19:02:26<2:59:21, 21.14s/it] +2025-05-11 12:28:19 - ERROR - stderr - +2025-05-11 12:28:19 - ERROR - stderr - +2025-05-11 12:28:19 - INFO - stdout - {'loss': 0.4753, 'grad_norm': 0.8438981175422668, 'learning_rate': 9.5571533650894e-07, 'epoch': 2.59} +2025-05-11 12:28:19 - ERROR - stderr - 86%|████████▋ | 3232/3741 [19:02:26<2:59:21, 21.14s/it] +2025-05-11 12:28:39 - ERROR - stderr - 86%|████████▋ | 3233/3741 [19:02:46<2:56:26, 20.84s/it] +2025-05-11 12:28:39 - ERROR - stderr - +2025-05-11 12:28:39 - ERROR - stderr - +2025-05-11 12:28:39 - INFO - stdout - {'loss': 0.4596, 'grad_norm': 0.8406031131744385, 'learning_rate': 9.520244529700041e-07, 'epoch': 2.59} +2025-05-11 12:28:39 - ERROR - stderr - 86%|████████▋ | 3233/3741 [19:02:46<2:56:26, 20.84s/it] +2025-05-11 12:29:02 - ERROR - stderr - 86%|████████▋ | 3234/3741 [19:03:08<2:59:17, 21.22s/it] +2025-05-11 12:29:02 - ERROR - stderr - +2025-05-11 12:29:02 - ERROR - stderr - +2025-05-11 12:29:02 - INFO - stdout - {'loss': 0.4481, 'grad_norm': 0.8662395477294922, 'learning_rate': 9.483403539112735e-07, 'epoch': 2.59} +2025-05-11 12:29:02 - ERROR - stderr - 86%|████████▋ | 3234/3741 [19:03:08<2:59:17, 21.22s/it] +2025-05-11 12:29:21 - ERROR - stderr - 86%|████████▋ | 3235/3741 [19:03:28<2:55:03, 20.76s/it] +2025-05-11 12:29:21 - ERROR - stderr - +2025-05-11 12:29:21 - ERROR - stderr - +2025-05-11 12:29:21 - INFO - stdout - {'loss': 0.4611, 'grad_norm': 0.843547523021698, 'learning_rate': 9.44663042095213e-07, 'epoch': 2.59} +2025-05-11 12:29:21 - ERROR - stderr - 86%|████████▋ | 3235/3741 [19:03:28<2:55:03, 20.76s/it] +2025-05-11 12:29:43 - ERROR - stderr - 87%|████████▋ | 3236/3741 [19:03:49<2:56:52, 21.02s/it] +2025-05-11 12:29:43 - ERROR - stderr - +2025-05-11 12:29:43 - ERROR - stderr - +2025-05-11 12:29:43 - INFO - stdout - {'loss': 0.4842, 'grad_norm': 0.8858677744865417, 'learning_rate': 9.409925202791925e-07, 'epoch': 2.6} +2025-05-11 12:29:43 - ERROR - stderr - 87%|████████▋ | 3236/3741 [19:03:49<2:56:52, 21.02s/it] +2025-05-11 12:30:03 - ERROR - stderr - 87%|████████▋ | 3237/3741 [19:04:09<2:53:18, 20.63s/it] +2025-05-11 12:30:03 - ERROR - stderr - +2025-05-11 12:30:03 - ERROR - stderr - +2025-05-11 12:30:03 - INFO - stdout - {'loss': 0.462, 'grad_norm': 0.8714892864227295, 'learning_rate': 9.37328791215496e-07, 'epoch': 2.6} +2025-05-11 12:30:03 - ERROR - stderr - 87%|████████▋ | 3237/3741 [19:04:09<2:53:18, 20.63s/it] +2025-05-11 12:30:24 - ERROR - stderr - 87%|████████▋ | 3238/3741 [19:04:31<2:55:58, 20.99s/it] +2025-05-11 12:30:24 - ERROR - stderr - +2025-05-11 12:30:24 - ERROR - stderr - +2025-05-11 12:30:24 - INFO - stdout - {'loss': 0.4721, 'grad_norm': 0.8578234910964966, 'learning_rate': 9.336718576513127e-07, 'epoch': 2.6} +2025-05-11 12:30:24 - ERROR - stderr - 87%|████████▋ | 3238/3741 [19:04:31<2:55:58, 20.99s/it] +2025-05-11 12:30:44 - ERROR - stderr - 87%|████████▋ | 3239/3741 [19:04:51<2:52:32, 20.62s/it] +2025-05-11 12:30:44 - ERROR - stderr - +2025-05-11 12:30:44 - ERROR - stderr - +2025-05-11 12:30:44 - INFO - stdout - {'loss': 0.4791, 'grad_norm': 0.8844730257987976, 'learning_rate': 9.300217223287345e-07, 'epoch': 2.6} +2025-05-11 12:30:44 - ERROR - stderr - 87%|████████▋ | 3239/3741 [19:04:51<2:52:32, 20.62s/it] +2025-05-11 12:31:06 - ERROR - stderr - 87%|████████▋ | 3240/3741 [19:05:13<2:56:23, 21.12s/it] +2025-05-11 12:31:07 - ERROR - stderr - +2025-05-11 12:31:07 - ERROR - stderr - +2025-05-11 12:31:07 - INFO - stdout - {'loss': 0.4751, 'grad_norm': 0.8658874034881592, 'learning_rate': 9.263783879847599e-07, 'epoch': 2.6} +2025-05-11 12:31:07 - ERROR - stderr - 87%|████████▋ | 3240/3741 [19:05:13<2:56:23, 21.12s/it] +2025-05-11 12:31:26 - ERROR - stderr - 87%|████████▋ | 3241/3741 [19:05:33<2:52:36, 20.71s/it] +2025-05-11 12:31:26 - ERROR - stderr - +2025-05-11 12:31:26 - ERROR - stderr - +2025-05-11 12:31:26 - INFO - stdout - {'loss': 0.4736, 'grad_norm': 0.8326420187950134, 'learning_rate': 9.227418573512825e-07, 'epoch': 2.6} +2025-05-11 12:31:26 - ERROR - stderr - 87%|████████▋ | 3241/3741 [19:05:33<2:52:36, 20.71s/it] +2025-05-11 12:31:49 - ERROR - stderr - 87%|████████▋ | 3242/3741 [19:05:55<2:57:29, 21.34s/it] +2025-05-11 12:31:49 - ERROR - stderr - +2025-05-11 12:31:49 - ERROR - stderr - +2025-05-11 12:31:49 - INFO - stdout - {'loss': 0.4664, 'grad_norm': 0.8331664800643921, 'learning_rate': 9.191121331550967e-07, 'epoch': 2.6} +2025-05-11 12:31:49 - ERROR - stderr - 87%|████████▋ | 3242/3741 [19:05:55<2:57:29, 21.34s/it] +2025-05-11 12:32:09 - ERROR - stderr - 87%|████████▋ | 3243/3741 [19:06:15<2:53:02, 20.85s/it] +2025-05-11 12:32:09 - ERROR - stderr - +2025-05-11 12:32:09 - ERROR - stderr - +2025-05-11 12:32:09 - INFO - stdout - {'loss': 0.4962, 'grad_norm': 0.915407657623291, 'learning_rate': 9.154892181178954e-07, 'epoch': 2.6} +2025-05-11 12:32:09 - ERROR - stderr - 87%|████████▋ | 3243/3741 [19:06:15<2:53:02, 20.85s/it] +2025-05-11 12:32:32 - ERROR - stderr - 87%|████████▋ | 3244/3741 [19:06:38<2:58:59, 21.61s/it] +2025-05-11 12:32:32 - ERROR - stderr - +2025-05-11 12:32:32 - ERROR - stderr - +2025-05-11 12:32:32 - INFO - stdout - {'loss': 0.4601, 'grad_norm': 0.8679154515266418, 'learning_rate': 9.11873114956261e-07, 'epoch': 2.6} +2025-05-11 12:32:32 - ERROR - stderr - 87%|���███████▋ | 3244/3741 [19:06:38<2:58:59, 21.61s/it] +2025-05-11 12:32:52 - ERROR - stderr - 87%|████████▋ | 3245/3741 [19:06:58<2:53:47, 21.02s/it] +2025-05-11 12:32:52 - ERROR - stderr - +2025-05-11 12:32:52 - ERROR - stderr - +2025-05-11 12:32:52 - INFO - stdout - {'loss': 0.4632, 'grad_norm': 0.8816805481910706, 'learning_rate': 9.082638263816756e-07, 'epoch': 2.6} +2025-05-11 12:32:52 - ERROR - stderr - 87%|████████▋ | 3245/3741 [19:06:58<2:53:47, 21.02s/it] +2025-05-11 12:33:14 - ERROR - stderr - 87%|████████▋ | 3246/3741 [19:07:20<2:56:49, 21.43s/it] +2025-05-11 12:33:14 - ERROR - stderr - +2025-05-11 12:33:14 - ERROR - stderr - +2025-05-11 12:33:14 - INFO - stdout - {'loss': 0.4638, 'grad_norm': 0.8480551838874817, 'learning_rate': 9.046613551005012e-07, 'epoch': 2.6} +2025-05-11 12:33:14 - ERROR - stderr - 87%|████████▋ | 3246/3741 [19:07:21<2:56:49, 21.43s/it] +2025-05-11 12:33:34 - ERROR - stderr - 87%|████████▋ | 3247/3741 [19:07:40<2:52:24, 20.94s/it] +2025-05-11 12:33:34 - ERROR - stderr - +2025-05-11 12:33:34 - ERROR - stderr - +2025-05-11 12:33:34 - INFO - stdout - {'loss': 0.4933, 'grad_norm': 0.8449149131774902, 'learning_rate': 9.010657038139947e-07, 'epoch': 2.6} +2025-05-11 12:33:34 - ERROR - stderr - 87%|████████▋ | 3247/3741 [19:07:40<2:52:24, 20.94s/it] +2025-05-11 12:33:58 - ERROR - stderr - 87%|████████▋ | 3248/3741 [19:08:04<2:59:26, 21.84s/it] +2025-05-11 12:33:58 - ERROR - stderr - +2025-05-11 12:33:58 - ERROR - stderr - +2025-05-11 12:33:58 - INFO - stdout - {'loss': 0.4751, 'grad_norm': 0.8377960920333862, 'learning_rate': 8.974768752183016e-07, 'epoch': 2.6} +2025-05-11 12:33:58 - ERROR - stderr - 87%|████████▋ | 3248/3741 [19:08:04<2:59:26, 21.84s/it] +2025-05-11 12:34:18 - ERROR - stderr - 87%|████████▋ | 3249/3741 [19:08:24<2:54:51, 21.32s/it] +2025-05-11 12:34:18 - ERROR - stderr - +2025-05-11 12:34:18 - ERROR - stderr - +2025-05-11 12:34:18 - INFO - stdout - {'loss': 0.4731, 'grad_norm': 0.8451635837554932, 'learning_rate': 8.938948720044416e-07, 'epoch': 2.61} +2025-05-11 12:34:18 - ERROR - stderr - 87%|████████▋ | 3249/3741 [19:08:24<2:54:51, 21.32s/it] +2025-05-11 12:34:41 - ERROR - stderr - 87%|████████▋ | 3250/3741 [19:08:48<2:59:16, 21.91s/it] +2025-05-11 12:34:41 - ERROR - stderr - +2025-05-11 12:34:41 - ERROR - stderr - +2025-05-11 12:34:41 - INFO - stdout - {'loss': 0.4893, 'grad_norm': 0.8990350365638733, 'learning_rate': 8.903196968583295e-07, 'epoch': 2.61} +2025-05-11 12:34:41 - ERROR - stderr - 87%|████████▋ | 3250/3741 [19:08:48<2:59:16, 21.91s/it] +2025-05-11 12:35:02 - ERROR - stderr - 87%|████████▋ | 3251/3741 [19:09:08<2:55:34, 21.50s/it] +2025-05-11 12:35:02 - ERROR - stderr - +2025-05-11 12:35:02 - ERROR - stderr - +2025-05-11 12:35:02 - INFO - stdout - {'loss': 0.4816, 'grad_norm': 0.8640419244766235, 'learning_rate': 8.867513524607485e-07, 'epoch': 2.61} +2025-05-11 12:35:02 - ERROR - stderr - 87%|████████▋ | 3251/3741 [19:09:08<2:55:34, 21.50s/it] +2025-05-11 12:35:24 - ERROR - stderr - 87%|████████▋ | 3252/3741 [19:09:31<2:57:44, 21.81s/it] +2025-05-11 12:35:24 - ERROR - stderr - +2025-05-11 12:35:24 - ERROR - stderr - +2025-05-11 12:35:24 - INFO - stdout - {'loss': 0.4691, 'grad_norm': 0.8659276366233826, 'learning_rate': 8.831898414873663e-07, 'epoch': 2.61} +2025-05-11 12:35:24 - ERROR - stderr - 87%|████████▋ | 3252/3741 [19:09:31<2:57:44, 21.81s/it] +2025-05-11 12:35:45 - ERROR - stderr - 87%|████████▋ | 3253/3741 [19:09:51<2:53:41, 21.36s/it] +2025-05-11 12:35:45 - ERROR - stderr - +2025-05-11 12:35:45 - ERROR - stderr - +2025-05-11 12:35:45 - INFO - stdout - {'loss': 0.4584, 'grad_norm': 0.8834055066108704, 'learning_rate': 8.796351666087266e-07, 'epoch': 2.61} +2025-05-11 12:35:45 - ERROR - stderr - 87%|████████▋ | 3253/3741 [19:09:51<2:53:41, 21.36s/it] +2025-05-11 12:36:08 - ERROR - stderr - 87%|████████▋ | 3254/3741 [19:10:15<2:58:41, 22.02s/it] +2025-05-11 12:36:08 - ERROR - stderr - +2025-05-11 12:36:08 - ERROR - stderr - +2025-05-11 12:36:08 - INFO - stdout - {'loss': 0.448, 'grad_norm': 0.8305429220199585, 'learning_rate': 8.760873304902406e-07, 'epoch': 2.61} +2025-05-11 12:36:08 - ERROR - stderr - 87%|████████▋ | 3254/3741 [19:10:15<2:58:41, 22.02s/it] +2025-05-11 12:36:29 - ERROR - stderr - 87%|████████▋ | 3255/3741 [19:10:35<2:54:20, 21.52s/it] +2025-05-11 12:36:29 - ERROR - stderr - +2025-05-11 12:36:29 - ERROR - stderr - +2025-05-11 12:36:29 - INFO - stdout - {'loss': 0.4716, 'grad_norm': 0.8624115586280823, 'learning_rate': 8.725463357922037e-07, 'epoch': 2.61} +2025-05-11 12:36:29 - ERROR - stderr - 87%|████████▋ | 3255/3741 [19:10:35<2:54:20, 21.52s/it] +2025-05-11 12:36:52 - ERROR - stderr - 87%|████████▋ | 3256/3741 [19:10:58<2:58:02, 22.03s/it] +2025-05-11 12:36:52 - ERROR - stderr - +2025-05-11 12:36:52 - ERROR - stderr - +2025-05-11 12:36:52 - INFO - stdout - {'loss': 0.4564, 'grad_norm': 0.838100016117096, 'learning_rate': 8.690121851697697e-07, 'epoch': 2.61} +2025-05-11 12:36:52 - ERROR - stderr - 87%|████████▋ | 3256/3741 [19:10:58<2:58:02, 22.03s/it] +2025-05-11 12:37:11 - ERROR - stderr - 87%|████████▋ | 3257/3741 [19:11:18<2:51:29, 21.26s/it] +2025-05-11 12:37:11 - ERROR - stderr - +2025-05-11 12:37:11 - ERROR - stderr - +2025-05-11 12:37:11 - INFO - stdout - {'loss': 0.4695, 'grad_norm': 0.8911872506141663, 'learning_rate': 8.654848812729655e-07, 'epoch': 2.61} +2025-05-11 12:37:11 - ERROR - stderr - 87%|████████▋ | 3257/3741 [19:11:18<2:51:29, 21.26s/it] +2025-05-11 12:37:33 - ERROR - stderr - 87%|████████▋ | 3258/3741 [19:11:40<2:52:59, 21.49s/it] +2025-05-11 12:37:33 - ERROR - stderr - +2025-05-11 12:37:33 - ERROR - stderr - +2025-05-11 12:37:33 - INFO - stdout - {'loss': 0.487, 'grad_norm': 0.9087029099464417, 'learning_rate': 8.619644267466876e-07, 'epoch': 2.61} +2025-05-11 12:37:33 - ERROR - stderr - 87%|████████▋ | 3258/3741 [19:11:40<2:52:59, 21.49s/it] +2025-05-11 12:37:53 - ERROR - stderr - 87%|████████▋ | 3259/3741 [19:11:59<2:48:05, 20.92s/it] +2025-05-11 12:37:53 - ERROR - stderr - +2025-05-11 12:37:53 - ERROR - stderr - +2025-05-11 12:37:53 - INFO - stdout - {'loss': 0.4856, 'grad_norm': 0.8725029826164246, 'learning_rate': 8.584508242306844e-07, 'epoch': 2.61} +2025-05-11 12:37:53 - ERROR - stderr - 87%|████████▋ | 3259/3741 [19:11:59<2:48:05, 20.92s/it] +2025-05-11 12:38:15 - ERROR - stderr - 87%|████████▋ | 3260/3741 [19:12:21<2:50:01, 21.21s/it] +2025-05-11 12:38:15 - ERROR - stderr - +2025-05-11 12:38:15 - ERROR - stderr - +2025-05-11 12:38:15 - INFO - stdout - {'loss': 0.4754, 'grad_norm': 0.9196523427963257, 'learning_rate': 8.549440763595851e-07, 'epoch': 2.61} +2025-05-11 12:38:15 - ERROR - stderr - 87%|████████▋ | 3260/3741 [19:12:21<2:50:01, 21.21s/it] +2025-05-11 12:38:35 - ERROR - stderr - 87%|████████▋ | 3261/3741 [19:12:41<2:46:11, 20.77s/it] +2025-05-11 12:38:35 - ERROR - stderr - +2025-05-11 12:38:35 - ERROR - stderr - +2025-05-11 12:38:35 - INFO - stdout - {'loss': 0.4519, 'grad_norm': 0.7980285286903381, 'learning_rate': 8.514441857628619e-07, 'epoch': 2.62} +2025-05-11 12:38:35 - ERROR - stderr - 87%|████████▋ | 3261/3741 [19:12:41<2:46:11, 20.77s/it] +2025-05-11 12:38:55 - ERROR - stderr - 87%|████████▋ | 3262/3741 [19:13:01<2:44:26, 20.60s/it] +2025-05-11 12:38:55 - ERROR - stderr - +2025-05-11 12:38:55 - ERROR - stderr - +2025-05-11 12:38:55 - INFO - stdout - {'loss': 0.4633, 'grad_norm': 0.8673176169395447, 'learning_rate': 8.479511550648512e-07, 'epoch': 2.62} +2025-05-11 12:38:55 - ERROR - stderr - 87%|████████▋ | 3262/3741 [19:13:01<2:44:26, 20.60s/it] +2025-05-11 12:39:14 - ERROR - stderr - 87%|████████▋ | 3263/3741 [19:13:21<2:41:43, 20.30s/it] +2025-05-11 12:39:14 - ERROR - stderr - +2025-05-11 12:39:14 - ERROR - stderr - +2025-05-11 12:39:14 - INFO - stdout - {'loss': 0.4848, 'grad_norm': 0.8644046783447266, 'learning_rate': 8.44464986884751e-07, 'epoch': 2.62} +2025-05-11 12:39:14 - ERROR - stderr - 87%|████████▋ | 3263/3741 [19:13:21<2:41:43, 20.30s/it] +2025-05-11 12:39:34 - ERROR - stderr - 87%|████████▋ | 3264/3741 [19:13:40<2:39:55, 20.12s/it] +2025-05-11 12:39:34 - ERROR - stderr - +2025-05-11 12:39:34 - ERROR - stderr - +2025-05-11 12:39:34 - INFO - stdout - {'loss': 0.4959, 'grad_norm': 0.8942599296569824, 'learning_rate': 8.40985683836606e-07, 'epoch': 2.62} +2025-05-11 12:39:34 - ERROR - stderr - 87%|████████▋ | 3264/3741 [19:13:40<2:39:55, 20.12s/it] +2025-05-11 12:39:54 - ERROR - stderr - 87%|████████▋ | 3265/3741 [19:14:00<2:39:00, 20.04s/it] +2025-05-11 12:39:54 - ERROR - stderr - +2025-05-11 12:39:54 - ERROR - stderr - +2025-05-11 12:39:54 - INFO - stdout - {'loss': 0.4744, 'grad_norm': 0.8743693232536316, 'learning_rate': 8.375132485293158e-07, 'epoch': 2.62} +2025-05-11 12:39:54 - ERROR - stderr - 87%|████████▋ | 3265/3741 [19:14:00<2:39:00, 20.04s/it] +2025-05-11 12:40:14 - ERROR - stderr - 87%|████████▋ | 3266/3741 [19:14:20<2:38:44, 20.05s/it] +2025-05-11 12:40:14 - ERROR - stderr - +2025-05-11 12:40:14 - ERROR - stderr - +2025-05-11 12:40:14 - INFO - stdout - {'loss': 0.4812, 'grad_norm': 0.852287232875824, 'learning_rate': 8.340476835666345e-07, 'epoch': 2.62} +2025-05-11 12:40:14 - ERROR - stderr - 87%|████████▋ | 3266/3741 [19:14:20<2:38:44, 20.05s/it] +2025-05-11 12:40:34 - ERROR - stderr - 87%|████████▋ | 3267/3741 [19:14:40<2:37:32, 19.94s/it] +2025-05-11 12:40:34 - ERROR - stderr - +2025-05-11 12:40:34 - ERROR - stderr - +2025-05-11 12:40:34 - INFO - stdout - {'loss': 0.4776, 'grad_norm': 0.8935552835464478, 'learning_rate': 8.305889915471532e-07, 'epoch': 2.62} +2025-05-11 12:40:34 - ERROR - stderr - 87%|████████▋ | 3267/3741 [19:14:40<2:37:32, 19.94s/it] +2025-05-11 12:40:53 - ERROR - stderr - 87%|████████▋ | 3268/3741 [19:15:00<2:36:50, 19.89s/it] +2025-05-11 12:40:53 - ERROR - stderr - +2025-05-11 12:40:53 - ERROR - stderr - +2025-05-11 12:40:53 - INFO - stdout - {'loss': 0.4794, 'grad_norm': 0.8734551072120667, 'learning_rate': 8.271371750643265e-07, 'epoch': 2.62} +2025-05-11 12:40:53 - ERROR - stderr - 87%|████████▋ | 3268/3741 [19:15:00<2:36:50, 19.89s/it] +2025-05-11 12:41:14 - ERROR - stderr - 87%|████████▋ | 3269/3741 [19:15:20<2:37:14, 19.99s/it] +2025-05-11 12:41:14 - ERROR - stderr - +2025-05-11 12:41:14 - ERROR - stderr - +2025-05-11 12:41:14 - INFO - stdout - {'loss': 0.4691, 'grad_norm': 0.8658239245414734, 'learning_rate': 8.236922367064359e-07, 'epoch': 2.62} +2025-05-11 12:41:14 - ERROR - stderr - 87%|████████▋ | 3269/3741 [19:15:20<2:37:14, 19.99s/it] +2025-05-11 12:41:33 - ERROR - stderr - 87%|████████▋ | 3270/3741 [19:15:40<2:35:59, 19.87s/it] +2025-05-11 12:41:33 - ERROR - stderr - +2025-05-11 12:41:33 - ERROR - stderr - +2025-05-11 12:41:33 - INFO - stdout - {'loss': 0.4987, 'grad_norm': 0.8544544577598572, 'learning_rate': 8.202541790566176e-07, 'epoch': 2.62} +2025-05-11 12:41:33 - ERROR - stderr - 87%|████████▋ | 3270/3741 [19:15:40<2:35:59, 19.87s/it] +2025-05-11 12:41:54 - ERROR - stderr - 87%|████████▋ | 3271/3741 [19:16:01<2:38:55, 20.29s/it] +2025-05-11 12:41:55 - ERROR - stderr - +2025-05-11 12:41:55 - ERROR - stderr - +2025-05-11 12:41:55 - INFO - stdout - {'loss': 0.4704, 'grad_norm': 0.8634313344955444, 'learning_rate': 8.16823004692845e-07, 'epoch': 2.62} +2025-05-11 12:41:55 - ERROR - stderr - 87%|████████▋ | 3271/3741 [19:16:01<2:38:55, 20.29s/it] +2025-05-11 12:42:14 - ERROR - stderr - 87%|████████▋ | 3272/3741 [19:16:20<2:37:08, 20.10s/it] +2025-05-11 12:42:14 - ERROR - stderr - +2025-05-11 12:42:14 - ERROR - stderr - +2025-05-11 12:42:14 - INFO - stdout - {'loss': 0.4827, 'grad_norm': 0.9538077116012573, 'learning_rate': 8.133987161879231e-07, 'epoch': 2.62} +2025-05-11 12:42:14 - ERROR - stderr - 87%|████████▋ | 3272/3741 [19:16:21<2:37:08, 20.10s/it] +2025-05-11 12:42:36 - ERROR - stderr - 87%|████████▋ | 3273/3741 [19:16:43<2:41:52, 20.75s/it] +2025-05-11 12:42:36 - ERROR - stderr - +2025-05-11 12:42:36 - ERROR - stderr - +2025-05-11 12:42:36 - INFO - stdout - {'loss': 0.4685, 'grad_norm': 0.8885698914527893, 'learning_rate': 8.099813161095094e-07, 'epoch': 2.62} +2025-05-11 12:42:36 - ERROR - stderr - 87%|████████▋ | 3273/3741 [19:16:43<2:41:52, 20.75s/it] +2025-05-11 12:42:56 - ERROR - stderr - 88%|████████▊ | 3274/3741 [19:17:03<2:39:52, 20.54s/it] +2025-05-11 12:42:56 - ERROR - stderr - +2025-05-11 12:42:56 - ERROR - stderr - +2025-05-11 12:42:56 - INFO - stdout - {'loss': 0.4699, 'grad_norm': 0.8560696840286255, 'learning_rate': 8.065708070200806e-07, 'epoch': 2.63} +2025-05-11 12:42:56 - ERROR - stderr - 88%|████████▊ | 3274/3741 [19:17:03<2:39:52, 20.54s/it] +2025-05-11 12:43:19 - ERROR - stderr - 88%|████████▊ | 3275/3741 [19:17:25<2:44:07, 21.13s/it] +2025-05-11 12:43:19 - ERROR - stderr - +2025-05-11 12:43:19 - ERROR - stderr - +2025-05-11 12:43:19 - INFO - stdout - {'loss': 0.4734, 'grad_norm': 0.8704147934913635, 'learning_rate': 8.031671914769545e-07, 'epoch': 2.63} +2025-05-11 12:43:19 - ERROR - stderr - 88%|████████▊ | 3275/3741 [19:17:25<2:44:07, 21.13s/it] +2025-05-11 12:43:20 - INFO - stdout - WARNING: tokenization mismatch: 3183 vs. 3209. (ignored) +2025-05-11 12:43:39 - ERROR - stderr - 88%|████████▊ | 3276/3741 [19:17:45<2:40:39, 20.73s/it] +2025-05-11 12:43:39 - ERROR - stderr - +2025-05-11 12:43:39 - ERROR - stderr - +2025-05-11 12:43:39 - INFO - stdout - {'loss': 0.4737, 'grad_norm': 0.8830083608627319, 'learning_rate': 7.997704720322785e-07, 'epoch': 2.63} +2025-05-11 12:43:39 - ERROR - stderr - 88%|████████▊ | 3276/3741 [19:17:45<2:40:39, 20.73s/it] +2025-05-11 12:44:01 - ERROR - stderr - 88%|████████▊ | 3277/3741 [19:18:08<2:44:32, 21.28s/it] +2025-05-11 12:44:01 - ERROR - stderr - +2025-05-11 12:44:01 - ERROR - stderr - +2025-05-11 12:44:01 - INFO - stdout - {'loss': 0.4699, 'grad_norm': 0.8642050623893738, 'learning_rate': 7.963806512330275e-07, 'epoch': 2.63} +2025-05-11 12:44:01 - ERROR - stderr - 88%|████████▊ | 3277/3741 [19:18:08<2:44:32, 21.28s/it] +2025-05-11 12:44:21 - ERROR - stderr - 88%|████████▊ | 3278/3741 [19:18:27<2:40:02, 20.74s/it] +2025-05-11 12:44:21 - ERROR - stderr - +2025-05-11 12:44:21 - ERROR - stderr - +2025-05-11 12:44:21 - INFO - stdout - {'loss': 0.4592, 'grad_norm': 0.8747822642326355, 'learning_rate': 7.929977316210036e-07, 'epoch': 2.63} +2025-05-11 12:44:21 - ERROR - stderr - 88%|████████▊ | 3278/3741 [19:18:27<2:40:02, 20.74s/it] +2025-05-11 12:44:43 - ERROR - stderr - 88%|████████▊ | 3279/3741 [19:18:49<2:41:57, 21.03s/it] +2025-05-11 12:44:43 - ERROR - stderr - +2025-05-11 12:44:43 - ERROR - stderr - +2025-05-11 12:44:43 - INFO - stdout - {'loss': 0.4706, 'grad_norm': 0.8815390467643738, 'learning_rate': 7.896217157328357e-07, 'epoch': 2.63} +2025-05-11 12:44:43 - ERROR - stderr - 88%|████████▊ | 3279/3741 [19:18:49<2:41:57, 21.03s/it] +2025-05-11 12:45:03 - ERROR - stderr - 88%|████████▊ | 3280/3741 [19:19:09<2:40:17, 20.86s/it] +2025-05-11 12:45:03 - ERROR - stderr - +2025-05-11 12:45:03 - ERROR - stderr - +2025-05-11 12:45:03 - INFO - stdout - {'loss': 0.4753, 'grad_norm': 0.8473573327064514, 'learning_rate': 7.862526060999775e-07, 'epoch': 2.63} +2025-05-11 12:45:03 - ERROR - stderr - 88%|████████▊ | 3280/3741 [19:19:09<2:40:17, 20.86s/it] +2025-05-11 12:45:25 - ERROR - stderr - 88%|████████▊ | 3281/3741 [19:19:31<2:42:03, 21.14s/it] +2025-05-11 12:45:25 - ERROR - stderr - +2025-05-11 12:45:25 - ERROR - stderr - +2025-05-11 12:45:25 - INFO - stdout - {'loss': 0.467, 'grad_norm': 0.8307991623878479, 'learning_rate': 7.828904052487019e-07, 'epoch': 2.63} +2025-05-11 12:45:25 - ERROR - stderr - 88%|████████▊ | 3281/3741 [19:19:31<2:42:03, 21.14s/it] +2025-05-11 12:45:44 - ERROR - stderr - 88%|████████▊ | 3282/3741 [19:19:51<2:37:47, 20.63s/it] +2025-05-11 12:45:44 - ERROR - stderr - +2025-05-11 12:45:44 - ERROR - stderr - +2025-05-11 12:45:44 - INFO - stdout - {'loss': 0.4571, 'grad_norm': 0.8285624384880066, 'learning_rate': 7.795351157000986e-07, 'epoch': 2.63} +2025-05-11 12:45:44 - ERROR - stderr - 88%|████████▊ | 3282/3741 [19:19:51<2:37:47, 20.63s/it] +2025-05-11 12:46:07 - ERROR - stderr - 88%|████████▊ | 3283/3741 [19:20:13<2:41:57, 21.22s/it] +2025-05-11 12:46:07 - ERROR - stderr - +2025-05-11 12:46:07 - ERROR - stderr - +2025-05-11 12:46:07 - INFO - stdout - {'loss': 0.485, 'grad_norm': 0.8307252526283264, 'learning_rate': 7.761867399700796e-07, 'epoch': 2.63} +2025-05-11 12:46:07 - ERROR - stderr - 88%|████████▊ | 3283/3741 [19:20:13<2:41:57, 21.22s/it] +2025-05-11 12:46:26 - ERROR - stderr - 88%|████████▊ | 3284/3741 [19:20:33<2:38:02, 20.75s/it] +2025-05-11 12:46:26 - ERROR - stderr - +2025-05-11 12:46:26 - ERROR - stderr - +2025-05-11 12:46:26 - INFO - stdout - {'loss': 0.4594, 'grad_norm': 0.8082962036132812, 'learning_rate': 7.72845280569372e-07, 'epoch': 2.63} +2025-05-11 12:46:26 - ERROR - stderr - 88%|████████▊ | 3284/3741 [19:20:33<2:38:02, 20.75s/it] +2025-05-11 12:46:49 - ERROR - stderr - 88%|████████▊ | 3285/3741 [19:20:55<2:41:31, 21.25s/it] +2025-05-11 12:46:49 - ERROR - stderr - +2025-05-11 12:46:49 - ERROR - stderr - +2025-05-11 12:46:49 - INFO - stdout - {'loss': 0.4566, 'grad_norm': 0.8508808016777039, 'learning_rate': 7.69510740003514e-07, 'epoch': 2.63} +2025-05-11 12:46:49 - ERROR - stderr - 88%|████████▊ | 3285/3741 [19:20:55<2:41:31, 21.25s/it] +2025-05-11 12:47:09 - ERROR - stderr - 88%|████████▊ | 3286/3741 [19:21:15<2:37:41, 20.79s/it] +2025-05-11 12:47:09 - ERROR - stderr - +2025-05-11 12:47:09 - ERROR - stderr - +2025-05-11 12:47:09 - INFO - stdout - {'loss': 0.4669, 'grad_norm': 0.8629611134529114, 'learning_rate': 7.66183120772862e-07, 'epoch': 2.64} +2025-05-11 12:47:09 - ERROR - stderr - 88%|████████▊ | 3286/3741 [19:21:15<2:37:41, 20.79s/it] +2025-05-11 12:47:31 - ERROR - stderr - 88%|████████▊ | 3287/3741 [19:21:38<2:41:42, 21.37s/it] +2025-05-11 12:47:31 - ERROR - stderr - +2025-05-11 12:47:31 - ERROR - stderr - +2025-05-11 12:47:31 - INFO - stdout - {'loss': 0.4717, 'grad_norm': 0.8811891078948975, 'learning_rate': 7.628624253725725e-07, 'epoch': 2.64} +2025-05-11 12:47:31 - ERROR - stderr - 88%|████████▊ | 3287/3741 [19:21:38<2:41:42, 21.37s/it] +2025-05-11 12:47:51 - ERROR - stderr - 88%|████████▊ | 3288/3741 [19:21:57<2:37:29, 20.86s/it] +2025-05-11 12:47:51 - ERROR - stderr - +2025-05-11 12:47:51 - ERROR - stderr - +2025-05-11 12:47:51 - INFO - stdout - {'loss': 0.4819, 'grad_norm': 0.8686394691467285, 'learning_rate': 7.59548656292618e-07, 'epoch': 2.64} +2025-05-11 12:47:51 - ERROR - stderr - 88%|████████▊ | 3288/3741 [19:21:57<2:37:29, 20.86s/it] +2025-05-11 12:48:13 - ERROR - stderr - 88%|████████▊ | 3289/3741 [19:22:20<2:40:18, 21.28s/it] +2025-05-11 12:48:13 - ERROR - stderr - +2025-05-11 12:48:13 - ERROR - stderr - +2025-05-11 12:48:13 - INFO - stdout - {'loss': 0.4884, 'grad_norm': 0.9036654829978943, 'learning_rate': 7.562418160177765e-07, 'epoch': 2.64} +2025-05-11 12:48:13 - ERROR - stderr - 88%|████████▊ | 3289/3741 [19:22:20<2:40:18, 21.28s/it] +2025-05-11 12:48:33 - ERROR - stderr - 88%|████████▊ | 3290/3741 [19:22:39<2:36:20, 20.80s/it] +2025-05-11 12:48:33 - ERROR - stderr - +2025-05-11 12:48:33 - ERROR - stderr - +2025-05-11 12:48:33 - INFO - stdout - {'loss': 0.4787, 'grad_norm': 0.8560638427734375, 'learning_rate': 7.529419070276312e-07, 'epoch': 2.64} +2025-05-11 12:48:33 - ERROR - stderr - 88%|████████▊ | 3290/3741 [19:22:39<2:36:20, 20.80s/it] +2025-05-11 12:48:55 - ERROR - stderr - 88%|████████▊ | 3291/3741 [19:23:01<2:38:07, 21.08s/it] +2025-05-11 12:48:55 - ERROR - stderr - +2025-05-11 12:48:55 - ERROR - stderr - +2025-05-11 12:48:55 - INFO - stdout - {'loss': 0.4466, 'grad_norm': 0.8455655574798584, 'learning_rate': 7.496489317965616e-07, 'epoch': 2.64} +2025-05-11 12:48:55 - ERROR - stderr - 88%|████████▊ | 3291/3741 [19:23:01<2:38:07, 21.08s/it] +2025-05-11 12:49:14 - ERROR - stderr - 88%|████████▊ | 3292/3741 [19:23:21<2:34:53, 20.70s/it] +2025-05-11 12:49:15 - ERROR - stderr - +2025-05-11 12:49:15 - ERROR - stderr - +2025-05-11 12:49:15 - INFO - stdout - {'loss': 0.5097, 'grad_norm': 0.9102020859718323, 'learning_rate': 7.463628927937549e-07, 'epoch': 2.64} +2025-05-11 12:49:15 - ERROR - stderr - 88%|████████▊ | 3292/3741 [19:23:21<2:34:53, 20.70s/it] +2025-05-11 12:49:36 - ERROR - stderr - 88%|████████▊ | 3293/3741 [19:23:42<2:36:39, 20.98s/it] +2025-05-11 12:49:36 - ERROR - stderr - +2025-05-11 12:49:36 - ERROR - stderr - +2025-05-11 12:49:36 - INFO - stdout - {'loss': 0.4628, 'grad_norm': 0.8347408175468445, 'learning_rate': 7.430837924831958e-07, 'epoch': 2.64} +2025-05-11 12:49:36 - ERROR - stderr - 88%|████████▊ | 3293/3741 [19:23:42<2:36:39, 20.98s/it] +2025-05-11 12:49:56 - ERROR - stderr - 88%|████████▊ | 3294/3741 [19:24:02<2:33:58, 20.67s/it] +2025-05-11 12:49:56 - ERROR - stderr - +2025-05-11 12:49:56 - ERROR - stderr - +2025-05-11 12:49:56 - INFO - stdout - {'loss': 0.4683, 'grad_norm': 0.8465882539749146, 'learning_rate': 7.398116333236638e-07, 'epoch': 2.64} +2025-05-11 12:49:56 - ERROR - stderr - 88%|████████▊ | 3294/3741 [19:24:02<2:33:58, 20.67s/it] +2025-05-11 12:50:16 - ERROR - stderr - 88%|████████▊ | 3295/3741 [19:24:22<2:32:23, 20.50s/it] +2025-05-11 12:50:16 - ERROR - stderr - +2025-05-11 12:50:16 - ERROR - stderr - +2025-05-11 12:50:16 - INFO - stdout - {'loss': 0.4939, 'grad_norm': 0.8932040929794312, 'learning_rate': 7.365464177687387e-07, 'epoch': 2.64} +2025-05-11 12:50:16 - ERROR - stderr - 88%|████████▊ | 3295/3741 [19:24:23<2:32:23, 20.50s/it] +2025-05-11 12:50:36 - ERROR - stderr - 88%|████████▊ | 3296/3741 [19:24:42<2:30:47, 20.33s/it] +2025-05-11 12:50:36 - ERROR - stderr - +2025-05-11 12:50:36 - ERROR - stderr - +2025-05-11 12:50:36 - INFO - stdout - {'loss': 0.4709, 'grad_norm': 0.8394157290458679, 'learning_rate': 7.332881482667853e-07, 'epoch': 2.64} +2025-05-11 12:50:36 - ERROR - stderr - 88%|████████▊ | 3296/3741 [19:24:42<2:30:47, 20.33s/it] +2025-05-11 12:50:56 - ERROR - stderr - 88%|████████▊ | 3297/3741 [19:25:02<2:29:07, 20.15s/it] +2025-05-11 12:50:56 - ERROR - stderr - +2025-05-11 12:50:56 - ERROR - stderr - +2025-05-11 12:50:56 - INFO - stdout - {'loss': 0.4766, 'grad_norm': 0.8661454319953918, 'learning_rate': 7.300368272609692e-07, 'epoch': 2.64} +2025-05-11 12:50:56 - ERROR - stderr - 88%|████████▊ | 3297/3741 [19:25:02<2:29:07, 20.15s/it] +2025-05-11 12:51:16 - ERROR - stderr - 88%|████████▊ | 3298/3741 [19:25:22<2:28:14, 20.08s/it] +2025-05-11 12:51:16 - ERROR - stderr - +2025-05-11 12:51:16 - ERROR - stderr - +2025-05-11 12:51:16 - INFO - stdout - {'loss': 0.4845, 'grad_norm': 0.85788893699646, 'learning_rate': 7.267924571892382e-07, 'epoch': 2.64} +2025-05-11 12:51:16 - ERROR - stderr - 88%|████████▊ | 3298/3741 [19:25:22<2:28:14, 20.08s/it] +2025-05-11 12:51:36 - ERROR - stderr - 88%|████████▊ | 3299/3741 [19:25:42<2:27:32, 20.03s/it] +2025-05-11 12:51:36 - ERROR - stderr - +2025-05-11 12:51:36 - ERROR - stderr - +2025-05-11 12:51:36 - INFO - stdout - {'loss': 0.4652, 'grad_norm': 0.8850892186164856, 'learning_rate': 7.23555040484335e-07, 'epoch': 2.65} +2025-05-11 12:51:36 - ERROR - stderr - 88%|████████▊ | 3299/3741 [19:25:42<2:27:32, 20.03s/it] +2025-05-11 12:51:56 - ERROR - stderr - 88%|████████▊ | 3300/3741 [19:26:02<2:27:05, 20.01s/it] +2025-05-11 12:51:56 - ERROR - stderr - +2025-05-11 12:51:56 - ERROR - stderr - +2025-05-11 12:51:56 - INFO - stdout - {'loss': 0.4986, 'grad_norm': 0.9400017857551575, 'learning_rate': 7.203245795737834e-07, 'epoch': 2.65} +2025-05-11 12:51:56 - ERROR - stderr - 88%|████████▊ | 3300/3741 [19:26:02<2:27:05, 20.01s/it] +2025-05-11 12:52:16 - ERROR - stderr - 88%|████████▊ | 3301/3741 [19:26:22<2:26:43, 20.01s/it] +2025-05-11 12:52:16 - ERROR - stderr - +2025-05-11 12:52:16 - ERROR - stderr - +2025-05-11 12:52:16 - INFO - stdout - {'loss': 0.4876, 'grad_norm': 0.8622959852218628, 'learning_rate': 7.171010768798925e-07, 'epoch': 2.65} +2025-05-11 12:52:16 - ERROR - stderr - 88%|████████▊ | 3301/3741 [19:26:22<2:26:43, 20.01s/it] +2025-05-11 12:52:37 - ERROR - stderr - 88%|████████▊ | 3302/3741 [19:26:43<2:29:06, 20.38s/it] +2025-05-11 12:52:37 - ERROR - stderr - +2025-05-11 12:52:37 - ERROR - stderr - +2025-05-11 12:52:37 - INFO - stdout - {'loss': 0.4665, 'grad_norm': 0.8049087524414062, 'learning_rate': 7.138845348197532e-07, 'epoch': 2.65} +2025-05-11 12:52:37 - ERROR - stderr - 88%|████████▊ | 3302/3741 [19:26:43<2:29:06, 20.38s/it] +2025-05-11 12:52:57 - ERROR - stderr - 88%|████████▊ | 3303/3741 [19:27:03<2:27:36, 20.22s/it] +2025-05-11 12:52:57 - ERROR - stderr - +2025-05-11 12:52:57 - ERROR - stderr - +2025-05-11 12:52:57 - INFO - stdout - {'loss': 0.4727, 'grad_norm': 0.8678800463676453, 'learning_rate': 7.106749558052428e-07, 'epoch': 2.65} +2025-05-11 12:52:57 - ERROR - stderr - 88%|████████▊ | 3303/3741 [19:27:03<2:27:36, 20.22s/it] +2025-05-11 12:53:20 - ERROR - stderr - 88%|████████▊ | 3304/3741 [19:27:26<2:32:51, 20.99s/it] +2025-05-11 12:53:20 - ERROR - stderr - +2025-05-11 12:53:20 - ERROR - stderr - +2025-05-11 12:53:20 - INFO - stdout - {'loss': 0.4743, 'grad_norm': 0.8367797136306763, 'learning_rate': 7.074723422430052e-07, 'epoch': 2.65} +2025-05-11 12:53:20 - ERROR - stderr - 88%|████████▊ | 3304/3741 [19:27:26<2:32:51, 20.99s/it] +2025-05-11 12:53:39 - ERROR - stderr - 88%|████████▊ | 3305/3741 [19:27:46<2:30:06, 20.66s/it] +2025-05-11 12:53:39 - ERROR - stderr - +2025-05-11 12:53:39 - ERROR - stderr - +2025-05-11 12:53:39 - INFO - stdout - {'loss': 0.4498, 'grad_norm': 0.8551909327507019, 'learning_rate': 7.042766965344782e-07, 'epoch': 2.65} +2025-05-11 12:53:39 - ERROR - stderr - 88%|████████▊ | 3305/3741 [19:27:46<2:30:06, 20.66s/it] +2025-05-11 12:54:02 - ERROR - stderr - 88%|████████▊ | 3306/3741 [19:28:08<2:33:22, 21.16s/it] +2025-05-11 12:54:02 - ERROR - stderr - +2025-05-11 12:54:02 - ERROR - stderr - +2025-05-11 12:54:02 - INFO - stdout - {'loss': 0.4905, 'grad_norm': 0.8910350203514099, 'learning_rate': 7.010880210758597e-07, 'epoch': 2.65} +2025-05-11 12:54:02 - ERROR - stderr - 88%|████████▊ | 3306/3741 [19:28:08<2:33:22, 21.16s/it] +2025-05-11 12:54:22 - ERROR - stderr - 88%|████████▊ | 3307/3741 [19:28:28<2:30:08, 20.76s/it] +2025-05-11 12:54:22 - ERROR - stderr - +2025-05-11 12:54:22 - ERROR - stderr - +2025-05-11 12:54:22 - INFO - stdout - {'loss': 0.4695, 'grad_norm': 1.0603433847427368, 'learning_rate': 6.979063182581291e-07, 'epoch': 2.65} +2025-05-11 12:54:22 - ERROR - stderr - 88%|████████▊ | 3307/3741 [19:28:28<2:30:08, 20.76s/it] +2025-05-11 12:54:44 - ERROR - stderr - 88%|████████▊ | 3308/3741 [19:28:50<2:33:42, 21.30s/it] +2025-05-11 12:54:44 - ERROR - stderr - +2025-05-11 12:54:44 - ERROR - stderr - +2025-05-11 12:54:44 - INFO - stdout - {'loss': 0.4916, 'grad_norm': 0.8967297673225403, 'learning_rate': 6.94731590467036e-07, 'epoch': 2.65} +2025-05-11 12:54:44 - ERROR - stderr - 88%|████████▊ | 3308/3741 [19:28:50<2:33:42, 21.30s/it] +2025-05-11 12:55:04 - ERROR - stderr - 88%|████████▊ | 3309/3741 [19:29:10<2:30:40, 20.93s/it] +2025-05-11 12:55:04 - ERROR - stderr - +2025-05-11 12:55:04 - ERROR - stderr - +2025-05-11 12:55:04 - INFO - stdout - {'loss': 0.4965, 'grad_norm': 0.9202722311019897, 'learning_rate': 6.915638400830959e-07, 'epoch': 2.65} +2025-05-11 12:55:04 - ERROR - stderr - 88%|████████▊ | 3309/3741 [19:29:11<2:30:40, 20.93s/it] +2025-05-11 12:55:28 - ERROR - stderr - 88%|████████▊ | 3310/3741 [19:29:34<2:35:52, 21.70s/it] +2025-05-11 12:55:28 - ERROR - stderr - +2025-05-11 12:55:28 - ERROR - stderr - +2025-05-11 12:55:28 - INFO - stdout - {'loss': 0.4634, 'grad_norm': 0.8630130290985107, 'learning_rate': 6.884030694816024e-07, 'epoch': 2.65} +2025-05-11 12:55:28 - ERROR - stderr - 88%|████████▊ | 3310/3741 [19:29:34<2:35:52, 21.70s/it] +2025-05-11 12:55:48 - ERROR - stderr - 89%|████████▊ | 3311/3741 [19:29:54<2:31:41, 21.17s/it] +2025-05-11 12:55:48 - ERROR - stderr - +2025-05-11 12:55:48 - ERROR - stderr - +2025-05-11 12:55:48 - INFO - stdout - {'loss': 0.463, 'grad_norm': 0.8898342847824097, 'learning_rate': 6.852492810326028e-07, 'epoch': 2.66} +2025-05-11 12:55:48 - ERROR - stderr - 89%|████████▊ | 3311/3741 [19:29:54<2:31:41, 21.17s/it] +2025-05-11 12:56:11 - ERROR - stderr - 89%|████████▊ | 3312/3741 [19:30:17<2:35:19, 21.72s/it] +2025-05-11 12:56:11 - ERROR - stderr - +2025-05-11 12:56:11 - ERROR - stderr - +2025-05-11 12:56:11 - INFO - stdout - {'loss': 0.4686, 'grad_norm': 0.8422430753707886, 'learning_rate': 6.821024771009188e-07, 'epoch': 2.66} +2025-05-11 12:56:11 - ERROR - stderr - 89%|████████▊ | 3312/3741 [19:30:17<2:35:19, 21.72s/it] +2025-05-11 12:56:30 - ERROR - stderr - 89%|████████▊ | 3313/3741 [19:30:37<2:30:56, 21.16s/it] +2025-05-11 12:56:30 - ERROR - stderr - +2025-05-11 12:56:30 - ERROR - stderr - +2025-05-11 12:56:30 - INFO - stdout - {'loss': 0.4703, 'grad_norm': 0.8274664282798767, 'learning_rate': 6.789626600461307e-07, 'epoch': 2.66} +2025-05-11 12:56:30 - ERROR - stderr - 89%|████████▊ | 3313/3741 [19:30:37<2:30:56, 21.16s/it] +2025-05-11 12:56:53 - ERROR - stderr - 89%|████████▊ | 3314/3741 [19:30:59<2:33:13, 21.53s/it] +2025-05-11 12:56:53 - ERROR - stderr - +2025-05-11 12:56:53 - ERROR - stderr - +2025-05-11 12:56:53 - INFO - stdout - {'loss': 0.4771, 'grad_norm': 0.8955613970756531, 'learning_rate': 6.758298322225765e-07, 'epoch': 2.66} +2025-05-11 12:56:53 - ERROR - stderr - 89%|████████▊ | 3314/3741 [19:30:59<2:33:13, 21.53s/it] +2025-05-11 12:57:13 - ERROR - stderr - 89%|████████▊ | 3315/3741 [19:31:19<2:29:09, 21.01s/it] +2025-05-11 12:57:13 - ERROR - stderr - +2025-05-11 12:57:13 - ERROR - stderr - +2025-05-11 12:57:13 - INFO - stdout - {'loss': 0.4513, 'grad_norm': 0.8348634243011475, 'learning_rate': 6.727039959793635e-07, 'epoch': 2.66} +2025-05-11 12:57:13 - ERROR - stderr - 89%|████████▊ | 3315/3741 [19:31:19<2:29:09, 21.01s/it] +2025-05-11 12:57:35 - ERROR - stderr - 89%|████████▊ | 3316/3741 [19:31:41<2:31:41, 21.41s/it] +2025-05-11 12:57:35 - ERROR - stderr - +2025-05-11 12:57:35 - ERROR - stderr - +2025-05-11 12:57:35 - INFO - stdout - {'loss': 0.454, 'grad_norm': 0.8191150426864624, 'learning_rate': 6.69585153660347e-07, 'epoch': 2.66} +2025-05-11 12:57:35 - ERROR - stderr - 89%|████████▊ | 3316/3741 [19:31:41<2:31:41, 21.41s/it] +2025-05-11 12:57:54 - ERROR - stderr - 89%|████████▊ | 3317/3741 [19:32:01<2:26:56, 20.79s/it] +2025-05-11 12:57:54 - ERROR - stderr - +2025-05-11 12:57:54 - ERROR - stderr - +2025-05-11 12:57:54 - INFO - stdout - {'loss': 0.4835, 'grad_norm': 0.8912159204483032, 'learning_rate': 6.664733076041374e-07, 'epoch': 2.66} +2025-05-11 12:57:54 - ERROR - stderr - 89%|████████▊ | 3317/3741 [19:32:01<2:26:56, 20.79s/it] +2025-05-11 12:58:17 - ERROR - stderr - 89%|████████▊ | 3318/3741 [19:32:24<2:31:01, 21.42s/it] +2025-05-11 12:58:17 - ERROR - stderr - +2025-05-11 12:58:17 - ERROR - stderr - +2025-05-11 12:58:17 - INFO - stdout - {'loss': 0.4868, 'grad_norm': 0.8224286437034607, 'learning_rate': 6.633684601441092e-07, 'epoch': 2.66} +2025-05-11 12:58:17 - ERROR - stderr - 89%|████████▊ | 3318/3741 [19:32:24<2:31:01, 21.42s/it] +2025-05-11 12:58:37 - ERROR - stderr - 89%|████████▊ | 3319/3741 [19:32:43<2:27:22, 20.95s/it] +2025-05-11 12:58:37 - ERROR - stderr - +2025-05-11 12:58:37 - ERROR - stderr - +2025-05-11 12:58:37 - INFO - stdout - {'loss': 0.4837, 'grad_norm': 0.8863853812217712, 'learning_rate': 6.602706136083792e-07, 'epoch': 2.66} +2025-05-11 12:58:37 - ERROR - stderr - 89%|████████▊ | 3319/3741 [19:32:43<2:27:22, 20.95s/it] +2025-05-11 12:59:00 - ERROR - stderr - 89%|████████▊ | 3320/3741 [19:33:06<2:31:10, 21.54s/it] +2025-05-11 12:59:00 - ERROR - stderr - +2025-05-11 12:59:00 - ERROR - stderr - +2025-05-11 12:59:00 - INFO - stdout - {'loss': 0.4754, 'grad_norm': 0.9140603542327881, 'learning_rate': 6.57179770319819e-07, 'epoch': 2.66} +2025-05-11 12:59:00 - ERROR - stderr - 89%|████████▊ | 3320/3741 [19:33:06<2:31:10, 21.54s/it] +2025-05-11 12:59:20 - ERROR - stderr - 89%|████████▉ | 3321/3741 [19:33:26<2:27:19, 21.05s/it] +2025-05-11 12:59:20 - ERROR - stderr - +2025-05-11 12:59:20 - ERROR - stderr - +2025-05-11 12:59:20 - INFO - stdout - {'loss': 0.4739, 'grad_norm': 0.8462338447570801, 'learning_rate': 6.540959325960494e-07, 'epoch': 2.66} +2025-05-11 12:59:20 - ERROR - stderr - 89%|████████▉ | 3321/3741 [19:33:26<2:27:19, 21.05s/it] +2025-05-11 12:59:40 - ERROR - stderr - 89%|████████▉ | 3322/3741 [19:33:47<2:25:26, 20.83s/it] +2025-05-11 12:59:40 - ERROR - stderr - +2025-05-11 12:59:40 - ERROR - stderr - +2025-05-11 12:59:40 - INFO - stdout - {'loss': 0.4652, 'grad_norm': 0.90171217918396, 'learning_rate': 6.510191027494339e-07, 'epoch': 2.66} +2025-05-11 12:59:40 - ERROR - stderr - 89%|████████▉ | 3322/3741 [19:33:47<2:25:26, 20.83s/it] +2025-05-11 13:00:00 - ERROR - stderr - 89%|████████▉ | 3323/3741 [19:34:06<2:22:41, 20.48s/it] +2025-05-11 13:00:00 - ERROR - stderr - +2025-05-11 13:00:00 - ERROR - stderr - +2025-05-11 13:00:00 - INFO - stdout - {'loss': 0.4673, 'grad_norm': 0.878610372543335, 'learning_rate': 6.479492830870881e-07, 'epoch': 2.66} +2025-05-11 13:00:00 - ERROR - stderr - 89%|████████▉ | 3323/3741 [19:34:06<2:22:41, 20.48s/it] +2025-05-11 13:00:20 - ERROR - stderr - 89%|████████▉ | 3324/3741 [19:34:26<2:20:56, 20.28s/it] +2025-05-11 13:00:20 - ERROR - stderr - +2025-05-11 13:00:20 - ERROR - stderr - +2025-05-11 13:00:20 - INFO - stdout - {'loss': 0.4864, 'grad_norm': 0.8632857799530029, 'learning_rate': 6.448864759108642e-07, 'epoch': 2.67} +2025-05-11 13:00:20 - ERROR - stderr - 89%|████████▉ | 3324/3741 [19:34:26<2:20:56, 20.28s/it] +2025-05-11 13:00:40 - ERROR - stderr - 89%|████████▉ | 3325/3741 [19:34:46<2:20:11, 20.22s/it] +2025-05-11 13:00:40 - ERROR - stderr - +2025-05-11 13:00:40 - ERROR - stderr - +2025-05-11 13:00:40 - INFO - stdout - {'loss': 0.5048, 'grad_norm': 0.8719748258590698, 'learning_rate': 6.418306835173605e-07, 'epoch': 2.67} +2025-05-11 13:00:40 - ERROR - stderr - 89%|████████▉ | 3325/3741 [19:34:46<2:20:11, 20.22s/it] +2025-05-11 13:01:00 - ERROR - stderr - 89%|████████▉ | 3326/3741 [19:35:07<2:20:21, 20.29s/it] +2025-05-11 13:01:00 - ERROR - stderr - +2025-05-11 13:01:00 - ERROR - stderr - +2025-05-11 13:01:00 - INFO - stdout - {'loss': 0.4702, 'grad_norm': 0.8836221098899841, 'learning_rate': 6.387819081979163e-07, 'epoch': 2.67} +2025-05-11 13:01:00 - ERROR - stderr - 89%|████████▉ | 3326/3741 [19:35:07<2:20:21, 20.29s/it] +2025-05-11 13:01:21 - ERROR - stderr - 89%|████████▉ | 3327/3741 [19:35:27<2:20:35, 20.38s/it] +2025-05-11 13:01:21 - ERROR - stderr - +2025-05-11 13:01:21 - ERROR - stderr - +2025-05-11 13:01:21 - INFO - stdout - {'loss': 0.4492, 'grad_norm': 0.8506346940994263, 'learning_rate': 6.35740152238602e-07, 'epoch': 2.67} +2025-05-11 13:01:21 - ERROR - stderr - 89%|████████▉ | 3327/3741 [19:35:27<2:20:35, 20.38s/it] +2025-05-11 13:01:42 - ERROR - stderr - 89%|████████▉ | 3328/3741 [19:35:48<2:21:10, 20.51s/it] +2025-05-11 13:01:42 - ERROR - stderr - +2025-05-11 13:01:42 - ERROR - stderr - +2025-05-11 13:01:42 - INFO - stdout - {'loss': 0.4562, 'grad_norm': 0.9193611145019531, 'learning_rate': 6.327054179202352e-07, 'epoch': 2.67} +2025-05-11 13:01:42 - ERROR - stderr - 89%|████████▉ | 3328/3741 [19:35:48<2:21:10, 20.51s/it] +2025-05-11 13:02:03 - ERROR - stderr - 89%|████████▉ | 3329/3741 [19:36:10<2:23:15, 20.86s/it] +2025-05-11 13:02:03 - ERROR - stderr - +2025-05-11 13:02:03 - ERROR - stderr - +2025-05-11 13:02:03 - INFO - stdout - {'loss': 0.4693, 'grad_norm': 0.8532218933105469, 'learning_rate': 6.296777075183602e-07, 'epoch': 2.67} +2025-05-11 13:02:03 - ERROR - stderr - 89%|████████▉ | 3329/3741 [19:36:10<2:23:15, 20.86s/it] +2025-05-11 13:02:24 - ERROR - stderr - 89%|████████▉ | 3330/3741 [19:36:30<2:21:48, 20.70s/it] +2025-05-11 13:02:24 - ERROR - stderr - +2025-05-11 13:02:24 - ERROR - stderr - +2025-05-11 13:02:24 - INFO - stdout - {'loss': 0.4745, 'grad_norm': 0.8159738779067993, 'learning_rate': 6.266570233032576e-07, 'epoch': 2.67} +2025-05-11 13:02:24 - ERROR - stderr - 89%|████████▉ | 3330/3741 [19:36:30<2:21:48, 20.70s/it] +2025-05-11 13:02:46 - ERROR - stderr - 89%|████████▉ | 3331/3741 [19:36:53<2:25:26, 21.28s/it] +2025-05-11 13:02:46 - ERROR - stderr - +2025-05-11 13:02:46 - ERROR - stderr - +2025-05-11 13:02:46 - INFO - stdout - {'loss': 0.4801, 'grad_norm': 0.8848310112953186, 'learning_rate': 6.236433675399412e-07, 'epoch': 2.67} +2025-05-11 13:02:46 - ERROR - stderr - 89%|████████▉ | 3331/3741 [19:36:53<2:25:26, 21.28s/it] +2025-05-11 13:03:06 - ERROR - stderr - 89%|████████▉ | 3332/3741 [19:37:13<2:22:39, 20.93s/it] +2025-05-11 13:03:06 - ERROR - stderr - +2025-05-11 13:03:06 - ERROR - stderr - +2025-05-11 13:03:06 - INFO - stdout - {'loss': 0.4469, 'grad_norm': 0.80049067735672, 'learning_rate': 6.206367424881487e-07, 'epoch': 2.67} +2025-05-11 13:03:06 - ERROR - stderr - 89%|████████▉ | 3332/3741 [19:37:13<2:22:39, 20.93s/it] +2025-05-11 13:03:29 - ERROR - stderr - 89%|████████▉ | 3333/3741 [19:37:35<2:25:39, 21.42s/it] +2025-05-11 13:03:29 - ERROR - stderr - +2025-05-11 13:03:29 - ERROR - stderr - +2025-05-11 13:03:29 - INFO - stdout - {'loss': 0.4731, 'grad_norm': 0.8821406960487366, 'learning_rate': 6.176371504023537e-07, 'epoch': 2.67} +2025-05-11 13:03:29 - ERROR - stderr - 89%|████████▉ | 3333/3741 [19:37:35<2:25:39, 21.42s/it] +2025-05-11 13:03:49 - ERROR - stderr - 89%|████████▉ | 3334/3741 [19:37:55<2:22:22, 20.99s/it] +2025-05-11 13:03:49 - ERROR - stderr - +2025-05-11 13:03:49 - ERROR - stderr - +2025-05-11 13:03:49 - INFO - stdout - {'loss': 0.467, 'grad_norm': 0.8810633420944214, 'learning_rate': 6.146445935317502e-07, 'epoch': 2.67} +2025-05-11 13:03:49 - ERROR - stderr - 89%|████████▉ | 3334/3741 [19:37:55<2:22:22, 20.99s/it] +2025-05-11 13:04:12 - ERROR - stderr - 89%|████████▉ | 3335/3741 [19:38:19<2:26:42, 21.68s/it] +2025-05-11 13:04:12 - ERROR - stderr - +2025-05-11 13:04:12 - ERROR - stderr - +2025-05-11 13:04:12 - INFO - stdout - {'loss': 0.4911, 'grad_norm': 0.8649298548698425, 'learning_rate': 6.116590741202611e-07, 'epoch': 2.67} +2025-05-11 13:04:12 - ERROR - stderr - 89%|████████▉ | 3335/3741 [19:38:19<2:26:42, 21.68s/it] +2025-05-11 13:04:32 - ERROR - stderr - 89%|████████▉ | 3336/3741 [19:38:39<2:23:14, 21.22s/it] +2025-05-11 13:04:32 - ERROR - stderr - +2025-05-11 13:04:32 - ERROR - stderr - +2025-05-11 13:04:32 - INFO - stdout - {'loss': 0.4564, 'grad_norm': 0.8359307646751404, 'learning_rate': 6.08680594406531e-07, 'epoch': 2.68} +2025-05-11 13:04:32 - ERROR - stderr - 89%|████████▉ | 3336/3741 [19:38:39<2:23:14, 21.22s/it] +2025-05-11 13:04:56 - ERROR - stderr - 89%|████████▉ | 3337/3741 [19:39:03<2:28:22, 22.04s/it] +2025-05-11 13:04:56 - ERROR - stderr - +2025-05-11 13:04:56 - ERROR - stderr - +2025-05-11 13:04:56 - INFO - stdout - {'loss': 0.4856, 'grad_norm': 0.876586377620697, 'learning_rate': 6.057091566239226e-07, 'epoch': 2.68} +2025-05-11 13:04:56 - ERROR - stderr - 89%|████████▉ | 3337/3741 [19:39:03<2:28:22, 22.04s/it] +2025-05-11 13:05:17 - ERROR - stderr - 89%|████████▉ | 3338/3741 [19:39:23<2:24:43, 21.55s/it] +2025-05-11 13:05:17 - ERROR - stderr - +2025-05-11 13:05:17 - ERROR - stderr - +2025-05-11 13:05:17 - INFO - stdout - {'loss': 0.5089, 'grad_norm': 0.9136984944343567, 'learning_rate': 6.027447630005234e-07, 'epoch': 2.68} +2025-05-11 13:05:17 - ERROR - stderr - 89%|████████▉ | 3338/3741 [19:39:23<2:24:43, 21.55s/it] +2025-05-11 13:05:41 - ERROR - stderr - 89%|████████▉ | 3339/3741 [19:39:47<2:29:28, 22.31s/it] +2025-05-11 13:05:41 - ERROR - stderr - +2025-05-11 13:05:41 - ERROR - stderr - +2025-05-11 13:05:41 - INFO - stdout - {'loss': 0.4879, 'grad_norm': 0.899994432926178, 'learning_rate': 5.997874157591344e-07, 'epoch': 2.68} +2025-05-11 13:05:41 - ERROR - stderr - 89%|████████▉ | 3339/3741 [19:39:47<2:29:28, 22.31s/it] +2025-05-11 13:06:01 - ERROR - stderr - 89%|████████▉ | 3340/3741 [19:40:08<2:25:45, 21.81s/it] +2025-05-11 13:06:01 - ERROR - stderr - +2025-05-11 13:06:01 - ERROR - stderr - +2025-05-11 13:06:01 - INFO - stdout - {'loss': 0.4718, 'grad_norm': 0.8839572072029114, 'learning_rate': 5.968371171172782e-07, 'epoch': 2.68} +2025-05-11 13:06:01 - ERROR - stderr - 89%|████████▉ | 3340/3741 [19:40:08<2:25:45, 21.81s/it] +2025-05-11 13:06:26 - ERROR - stderr - 89%|████████▉ | 3341/3741 [19:40:32<2:29:56, 22.49s/it] +2025-05-11 13:06:26 - ERROR - stderr - +2025-05-11 13:06:26 - ERROR - stderr - +2025-05-11 13:06:26 - INFO - stdout - {'loss': 0.4668, 'grad_norm': 0.8831132650375366, 'learning_rate': 5.938938692871887e-07, 'epoch': 2.68} +2025-05-11 13:06:26 - ERROR - stderr - 89%|████████▉ | 3341/3741 [19:40:32<2:29:56, 22.49s/it] +2025-05-11 13:06:46 - ERROR - stderr - 89%|████████▉ | 3342/3741 [19:40:52<2:24:51, 21.78s/it] +2025-05-11 13:06:46 - ERROR - stderr - +2025-05-11 13:06:46 - ERROR - stderr - +2025-05-11 13:06:46 - INFO - stdout - {'loss': 0.4733, 'grad_norm': 0.9043929576873779, 'learning_rate': 5.909576744758117e-07, 'epoch': 2.68} +2025-05-11 13:06:46 - ERROR - stderr - 89%|████████▉ | 3342/3741 [19:40:52<2:24:51, 21.78s/it] +2025-05-11 13:07:10 - ERROR - stderr - 89%|████████▉ | 3343/3741 [19:41:16<2:29:31, 22.54s/it] +2025-05-11 13:07:10 - ERROR - stderr - +2025-05-11 13:07:10 - ERROR - stderr - +2025-05-11 13:07:10 - INFO - stdout - {'loss': 0.478, 'grad_norm': 0.8958361744880676, 'learning_rate': 5.880285348848069e-07, 'epoch': 2.68} +2025-05-11 13:07:10 - ERROR - stderr - 89%|████████▉ | 3343/3741 [19:41:16<2:29:31, 22.54s/it] +2025-05-11 13:07:30 - ERROR - stderr - 89%|████████▉ | 3344/3741 [19:41:36<2:24:08, 21.79s/it] +2025-05-11 13:07:30 - ERROR - stderr - +2025-05-11 13:07:30 - ERROR - stderr - +2025-05-11 13:07:30 - INFO - stdout - {'loss': 0.4736, 'grad_norm': 0.8278128504753113, 'learning_rate': 5.851064527105421e-07, 'epoch': 2.68} +2025-05-11 13:07:30 - ERROR - stderr - 89%|████████▉ | 3344/3741 [19:41:36<2:24:08, 21.79s/it] +2025-05-11 13:07:53 - ERROR - stderr - 89%|████████▉ | 3345/3741 [19:41:59<2:26:05, 22.14s/it] +2025-05-11 13:07:53 - ERROR - stderr - +2025-05-11 13:07:53 - ERROR - stderr - +2025-05-11 13:07:53 - INFO - stdout - {'loss': 0.4548, 'grad_norm': 0.8567398190498352, 'learning_rate': 5.821914301440956e-07, 'epoch': 2.68} +2025-05-11 13:07:53 - ERROR - stderr - 89%|████████▉ | 3345/3741 [19:41:59<2:26:05, 22.14s/it] +2025-05-11 13:08:13 - ERROR - stderr - 89%|████████▉ | 3346/3741 [19:42:19<2:21:21, 21.47s/it] +2025-05-11 13:08:13 - ERROR - stderr - +2025-05-11 13:08:13 - ERROR - stderr - +2025-05-11 13:08:13 - INFO - stdout - {'loss': 0.4774, 'grad_norm': 0.9090203046798706, 'learning_rate': 5.792834693712502e-07, 'epoch': 2.68} +2025-05-11 13:08:13 - ERROR - stderr - 89%|████████▉ | 3346/3741 [19:42:19<2:21:21, 21.47s/it] +2025-05-11 13:08:36 - ERROR - stderr - 89%|████████▉ | 3347/3741 [19:42:42<2:24:11, 21.96s/it] +2025-05-11 13:08:36 - ERROR - stderr - +2025-05-11 13:08:36 - ERROR - stderr - +2025-05-11 13:08:36 - INFO - stdout - {'loss': 0.4796, 'grad_norm': 0.8930779099464417, 'learning_rate': 5.763825725724925e-07, 'epoch': 2.68} +2025-05-11 13:08:36 - ERROR - stderr - 89%|████████▉ | 3347/3741 [19:42:42<2:24:11, 21.96s/it] +2025-05-11 13:08:56 - ERROR - stderr - 89%|████████▉ | 3348/3741 [19:43:02<2:20:13, 21.41s/it] +2025-05-11 13:08:56 - ERROR - stderr - +2025-05-11 13:08:56 - ERROR - stderr - +2025-05-11 13:08:56 - INFO - stdout - {'loss': 0.4736, 'grad_norm': 0.8618937730789185, 'learning_rate': 5.734887419230151e-07, 'epoch': 2.68} +2025-05-11 13:08:56 - ERROR - stderr - 89%|████████▉ | 3348/3741 [19:43:02<2:20:13, 21.41s/it] +2025-05-11 13:09:20 - ERROR - stderr - 90%|████████▉ | 3349/3741 [19:43:26<2:24:19, 22.09s/it] +2025-05-11 13:09:20 - ERROR - stderr - +2025-05-11 13:09:20 - ERROR - stderr - +2025-05-11 13:09:20 - INFO - stdout - {'loss': 0.4666, 'grad_norm': 0.8632767796516418, 'learning_rate': 5.70601979592711e-07, 'epoch': 2.69} +2025-05-11 13:09:20 - ERROR - stderr - 90%|████████▉ | 3349/3741 [19:43:26<2:24:19, 22.09s/it] +2025-05-11 13:09:40 - ERROR - stderr - 90%|████████▉ | 3350/3741 [19:43:46<2:20:03, 21.49s/it] +2025-05-11 13:09:40 - ERROR - stderr - +2025-05-11 13:09:40 - ERROR - stderr - +2025-05-11 13:09:40 - INFO - stdout - {'loss': 0.4702, 'grad_norm': 0.8496332764625549, 'learning_rate': 5.67722287746173e-07, 'epoch': 2.69} +2025-05-11 13:09:40 - ERROR - stderr - 90%|████████▉ | 3350/3741 [19:43:46<2:20:03, 21.49s/it] +2025-05-11 13:10:03 - ERROR - stderr - 90%|████████▉ | 3351/3741 [19:44:10<2:23:28, 22.07s/it] +2025-05-11 13:10:03 - ERROR - stderr - +2025-05-11 13:10:03 - ERROR - stderr - +2025-05-11 13:10:03 - INFO - stdout - {'loss': 0.5128, 'grad_norm': 0.9124326705932617, 'learning_rate': 5.648496685426908e-07, 'epoch': 2.69} +2025-05-11 13:10:03 - ERROR - stderr - 90%|████████▉ | 3351/3741 [19:44:10<2:23:28, 22.07s/it] +2025-05-11 13:10:23 - ERROR - stderr - 90%|████████▉ | 3352/3741 [19:44:29<2:18:19, 21.33s/it] +2025-05-11 13:10:23 - ERROR - stderr - +2025-05-11 13:10:23 - ERROR - stderr - +2025-05-11 13:10:23 - INFO - stdout - {'loss': 0.4635, 'grad_norm': 0.8609637022018433, 'learning_rate': 5.619841241362522e-07, 'epoch': 2.69} +2025-05-11 13:10:23 - ERROR - stderr - 90%|████████▉ | 3352/3741 [19:44:29<2:18:19, 21.33s/it] +2025-05-11 13:10:44 - ERROR - stderr - 90%|████████▉ | 3353/3741 [19:44:50<2:16:37, 21.13s/it] +2025-05-11 13:10:44 - ERROR - stderr - +2025-05-11 13:10:44 - ERROR - stderr - +2025-05-11 13:10:44 - INFO - stdout - {'loss': 0.4702, 'grad_norm': 0.8587467074394226, 'learning_rate': 5.591256566755399e-07, 'epoch': 2.69} +2025-05-11 13:10:44 - ERROR - stderr - 90%|████████▉ | 3353/3741 [19:44:50<2:16:37, 21.13s/it] +2025-05-11 13:11:03 - ERROR - stderr - 90%|████████▉ | 3354/3741 [19:45:10<2:13:56, 20.77s/it] +2025-05-11 13:11:04 - ERROR - stderr - +2025-05-11 13:11:04 - ERROR - stderr - +2025-05-11 13:11:04 - INFO - stdout - {'loss': 0.5009, 'grad_norm': 0.8724468350410461, 'learning_rate': 5.562742683039313e-07, 'epoch': 2.69} +2025-05-11 13:11:04 - ERROR - stderr - 90%|████████▉ | 3354/3741 [19:45:10<2:13:56, 20.77s/it] +2025-05-11 13:11:23 - ERROR - stderr - 90%|████████▉ | 3355/3741 [19:45:30<2:12:02, 20.53s/it] +2025-05-11 13:11:23 - ERROR - stderr - +2025-05-11 13:11:23 - ERROR - stderr - +2025-05-11 13:11:23 - INFO - stdout - {'loss': 0.462, 'grad_norm': 0.8646350502967834, 'learning_rate': 5.534299611594962e-07, 'epoch': 2.69} +2025-05-11 13:11:23 - ERROR - stderr - 90%|████████▉ | 3355/3741 [19:45:30<2:12:02, 20.53s/it] +2025-05-11 13:11:43 - ERROR - stderr - 90%|████████▉ | 3356/3741 [19:45:50<2:10:19, 20.31s/it] +2025-05-11 13:11:43 - ERROR - stderr - +2025-05-11 13:11:43 - ERROR - stderr - +2025-05-11 13:11:43 - INFO - stdout - {'loss': 0.4507, 'grad_norm': 0.8256325721740723, 'learning_rate': 5.505927373749887e-07, 'epoch': 2.69} +2025-05-11 13:11:43 - ERROR - stderr - 90%|████████▉ | 3356/3741 [19:45:50<2:10:19, 20.31s/it] +2025-05-11 13:12:03 - ERROR - stderr - 90%|████████▉ | 3357/3741 [19:46:10<2:09:49, 20.29s/it] +2025-05-11 13:12:04 - ERROR - stderr - +2025-05-11 13:12:04 - ERROR - stderr - +2025-05-11 13:12:04 - INFO - stdout - {'loss': 0.4719, 'grad_norm': 0.8692865371704102, 'learning_rate': 5.477625990778579e-07, 'epoch': 2.69} +2025-05-11 13:12:04 - ERROR - stderr - 90%|████████▉ | 3357/3741 [19:46:10<2:09:49, 20.29s/it] +2025-05-11 13:12:25 - ERROR - stderr - 90%|████████▉ | 3358/3741 [19:46:31<2:11:48, 20.65s/it] +2025-05-11 13:12:25 - ERROR - stderr - +2025-05-11 13:12:25 - ERROR - stderr - +2025-05-11 13:12:25 - INFO - stdout - {'loss': 0.4955, 'grad_norm': 0.8937420845031738, 'learning_rate': 5.449395483902376e-07, 'epoch': 2.69} +2025-05-11 13:12:25 - ERROR - stderr - 90%|████████▉ | 3358/3741 [19:46:31<2:11:48, 20.65s/it] +2025-05-11 13:12:45 - ERROR - stderr - 90%|████████▉ | 3359/3741 [19:46:51<2:10:11, 20.45s/it] +2025-05-11 13:12:45 - ERROR - stderr - +2025-05-11 13:12:45 - ERROR - stderr - +2025-05-11 13:12:45 - INFO - stdout - {'loss': 0.4784, 'grad_norm': 0.8685393333435059, 'learning_rate': 5.421235874289488e-07, 'epoch': 2.69} +2025-05-11 13:12:45 - ERROR - stderr - 90%|████████▉ | 3359/3741 [19:46:51<2:10:11, 20.45s/it] +2025-05-11 13:13:07 - ERROR - stderr - 90%|████████▉ | 3360/3741 [19:47:13<2:12:27, 20.86s/it] +2025-05-11 13:13:07 - ERROR - stderr - +2025-05-11 13:13:07 - ERROR - stderr - +2025-05-11 13:13:07 - INFO - stdout - {'loss': 0.4807, 'grad_norm': 0.8677978515625, 'learning_rate': 5.393147183054936e-07, 'epoch': 2.69} +2025-05-11 13:13:07 - ERROR - stderr - 90%|████████▉ | 3360/3741 [19:47:13<2:12:27, 20.86s/it] +2025-05-11 13:13:26 - ERROR - stderr - 90%|████████▉ | 3361/3741 [19:47:32<2:09:16, 20.41s/it] +2025-05-11 13:13:26 - ERROR - stderr - +2025-05-11 13:13:26 - ERROR - stderr - +2025-05-11 13:13:26 - INFO - stdout - {'loss': 0.4603, 'grad_norm': 0.8556413650512695, 'learning_rate': 5.365129431260574e-07, 'epoch': 2.7} +2025-05-11 13:13:26 - ERROR - stderr - 90%|████████▉ | 3361/3741 [19:47:32<2:09:16, 20.41s/it] +2025-05-11 13:13:49 - ERROR - stderr - 90%|████████▉ | 3362/3741 [19:47:56<2:14:16, 21.26s/it] +2025-05-11 13:13:49 - ERROR - stderr - +2025-05-11 13:13:49 - ERROR - stderr - +2025-05-11 13:13:49 - INFO - stdout - {'loss': 0.4935, 'grad_norm': 0.9038963913917542, 'learning_rate': 5.337182639915073e-07, 'epoch': 2.7} +2025-05-11 13:13:49 - ERROR - stderr - 90%|████████▉ | 3362/3741 [19:47:56<2:14:16, 21.26s/it] +2025-05-11 13:14:09 - ERROR - stderr - 90%|████████▉ | 3363/3741 [19:48:15<2:10:46, 20.76s/it] +2025-05-11 13:14:09 - ERROR - stderr - +2025-05-11 13:14:09 - ERROR - stderr - +2025-05-11 13:14:09 - INFO - stdout - {'loss': 0.4893, 'grad_norm': 0.8712881803512573, 'learning_rate': 5.309306829973892e-07, 'epoch': 2.7} +2025-05-11 13:14:09 - ERROR - stderr - 90%|████████▉ | 3363/3741 [19:48:15<2:10:46, 20.76s/it] +2025-05-11 13:14:32 - ERROR - stderr - 90%|████████▉ | 3364/3741 [19:48:38<2:14:20, 21.38s/it] +2025-05-11 13:14:32 - ERROR - stderr - +2025-05-11 13:14:32 - ERROR - stderr - +2025-05-11 13:14:32 - INFO - stdout - {'loss': 0.4682, 'grad_norm': 0.8390231728553772, 'learning_rate': 5.281502022339236e-07, 'epoch': 2.7} +2025-05-11 13:14:32 - ERROR - stderr - 90%|████████▉ | 3364/3741 [19:48:38<2:14:20, 21.38s/it] +2025-05-11 13:14:51 - ERROR - stderr - 90%|████████▉ | 3365/3741 [19:48:57<2:09:53, 20.73s/it] +2025-05-11 13:14:51 - ERROR - stderr - +2025-05-11 13:14:51 - ERROR - stderr - +2025-05-11 13:14:51 - INFO - stdout - {'loss': 0.4695, 'grad_norm': 0.8879370093345642, 'learning_rate': 5.253768237860146e-07, 'epoch': 2.7} +2025-05-11 13:14:51 - ERROR - stderr - 90%|████████▉ | 3365/3741 [19:48:57<2:09:53, 20.73s/it] +2025-05-11 13:15:12 - ERROR - stderr - 90%|████████▉ | 3366/3741 [19:49:18<2:09:18, 20.69s/it] +2025-05-11 13:15:12 - ERROR - stderr - +2025-05-11 13:15:12 - ERROR - stderr - +2025-05-11 13:15:12 - INFO - stdout - {'loss': 0.4811, 'grad_norm': 0.9008282423019409, 'learning_rate': 5.226105497332323e-07, 'epoch': 2.7} +2025-05-11 13:15:12 - ERROR - stderr - 90%|████████▉ | 3366/3741 [19:49:18<2:09:18, 20.69s/it] +2025-05-11 13:15:31 - ERROR - stderr - 90%|█████████ | 3367/3741 [19:49:38<2:07:14, 20.41s/it] +2025-05-11 13:15:31 - ERROR - stderr - +2025-05-11 13:15:31 - ERROR - stderr - +2025-05-11 13:15:31 - INFO - stdout - {'loss': 0.4956, 'grad_norm': 0.9389668107032776, 'learning_rate': 5.19851382149823e-07, 'epoch': 2.7} +2025-05-11 13:15:31 - ERROR - stderr - 90%|█████████ | 3367/3741 [19:49:38<2:07:14, 20.41s/it] +2025-05-11 13:15:51 - ERROR - stderr - 90%|█████████ | 3368/3741 [19:49:57<2:05:06, 20.13s/it] +2025-05-11 13:15:51 - ERROR - stderr - +2025-05-11 13:15:51 - ERROR - stderr - +2025-05-11 13:15:51 - INFO - stdout - {'loss': 0.4591, 'grad_norm': 0.8313438296318054, 'learning_rate': 5.170993231047072e-07, 'epoch': 2.7} +2025-05-11 13:15:51 - ERROR - stderr - 90%|█████████ | 3368/3741 [19:49:57<2:05:06, 20.13s/it] +2025-05-11 13:16:10 - ERROR - stderr - 90%|█████████ | 3369/3741 [19:50:17<2:03:34, 19.93s/it] +2025-05-11 13:16:10 - ERROR - stderr - +2025-05-11 13:16:10 - ERROR - stderr - +2025-05-11 13:16:10 - INFO - stdout - {'loss': 0.4968, 'grad_norm': 0.8891331553459167, 'learning_rate': 5.143543746614688e-07, 'epoch': 2.7} +2025-05-11 13:16:10 - ERROR - stderr - 90%|█████████ | 3369/3741 [19:50:17<2:03:34, 19.93s/it] +2025-05-11 13:16:30 - ERROR - stderr - 90%|█████████ | 3370/3741 [19:50:36<2:01:51, 19.71s/it] +2025-05-11 13:16:30 - ERROR - stderr - +2025-05-11 13:16:30 - ERROR - stderr - +2025-05-11 13:16:30 - INFO - stdout - {'loss': 0.4716, 'grad_norm': 0.861348569393158, 'learning_rate': 5.116165388783678e-07, 'epoch': 2.7} +2025-05-11 13:16:30 - ERROR - stderr - 90%|█████████ | 3370/3741 [19:50:36<2:01:51, 19.71s/it] +2025-05-11 13:16:51 - ERROR - stderr - 90%|█████████ | 3371/3741 [19:50:58<2:05:38, 20.37s/it] +2025-05-11 13:16:51 - ERROR - stderr - +2025-05-11 13:16:51 - ERROR - stderr - +2025-05-11 13:16:51 - INFO - stdout - {'loss': 0.487, 'grad_norm': 0.8426232933998108, 'learning_rate': 5.088858178083223e-07, 'epoch': 2.7} +2025-05-11 13:16:51 - ERROR - stderr - 90%|█████████ | 3371/3741 [19:50:58<2:05:38, 20.37s/it] +2025-05-11 13:17:11 - ERROR - stderr - 90%|█████████ | 3372/3741 [19:51:17<2:03:50, 20.14s/it] +2025-05-11 13:17:11 - ERROR - stderr - +2025-05-11 13:17:11 - ERROR - stderr - +2025-05-11 13:17:11 - INFO - stdout - {'loss': 0.4549, 'grad_norm': 0.8665948510169983, 'learning_rate': 5.06162213498923e-07, 'epoch': 2.7} +2025-05-11 13:17:11 - ERROR - stderr - 90%|█████████ | 3372/3741 [19:51:17<2:03:50, 20.14s/it] +2025-05-11 13:17:34 - ERROR - stderr - 90%|█████████ | 3373/3741 [19:51:40<2:07:58, 20.86s/it] +2025-05-11 13:17:34 - ERROR - stderr - +2025-05-11 13:17:34 - ERROR - stderr - +2025-05-11 13:17:34 - INFO - stdout - {'loss': 0.4989, 'grad_norm': 0.8754244446754456, 'learning_rate': 5.034457279924221e-07, 'epoch': 2.7} +2025-05-11 13:17:34 - ERROR - stderr - 90%|█████████ | 3373/3741 [19:51:40<2:07:58, 20.86s/it] +2025-05-11 13:17:53 - ERROR - stderr - 90%|█████████ | 3374/3741 [19:52:00<2:05:31, 20.52s/it] +2025-05-11 13:17:53 - ERROR - stderr - +2025-05-11 13:17:53 - ERROR - stderr - +2025-05-11 13:17:53 - INFO - stdout - {'loss': 0.4558, 'grad_norm': 0.8739911317825317, 'learning_rate': 5.007363633257278e-07, 'epoch': 2.71} +2025-05-11 13:17:53 - ERROR - stderr - 90%|█████████ | 3374/3741 [19:52:00<2:05:31, 20.52s/it] +2025-05-11 13:18:17 - ERROR - stderr - 90%|█████████ | 3375/3741 [19:52:23<2:10:35, 21.41s/it] +2025-05-11 13:18:17 - ERROR - stderr - +2025-05-11 13:18:17 - ERROR - stderr - +2025-05-11 13:18:17 - INFO - stdout - {'loss': 0.4608, 'grad_norm': 0.8367862701416016, 'learning_rate': 4.980341215304196e-07, 'epoch': 2.71} +2025-05-11 13:18:17 - ERROR - stderr - 90%|█████████ | 3375/3741 [19:52:23<2:10:35, 21.41s/it] +2025-05-11 13:18:37 - ERROR - stderr - 90%|█████████ | 3376/3741 [19:52:43<2:07:24, 20.94s/it] +2025-05-11 13:18:37 - ERROR - stderr - +2025-05-11 13:18:37 - ERROR - stderr - +2025-05-11 13:18:37 - INFO - stdout - {'loss': 0.4706, 'grad_norm': 0.8726009130477905, 'learning_rate': 4.953390046327278e-07, 'epoch': 2.71} +2025-05-11 13:18:37 - ERROR - stderr - 90%|█████████ | 3376/3741 [19:52:43<2:07:24, 20.94s/it] +2025-05-11 13:18:58 - ERROR - stderr - 90%|█████████ | 3377/3741 [19:53:04<2:07:23, 21.00s/it] +2025-05-11 13:18:58 - ERROR - stderr - +2025-05-11 13:18:58 - ERROR - stderr - +2025-05-11 13:18:58 - INFO - stdout - {'loss': 0.4618, 'grad_norm': 0.8462924361228943, 'learning_rate': 4.926510146535434e-07, 'epoch': 2.71} +2025-05-11 13:18:58 - ERROR - stderr - 90%|█████████ | 3377/3741 [19:53:04<2:07:23, 21.00s/it] +2025-05-11 13:19:18 - ERROR - stderr - 90%|█████████ | 3378/3741 [19:53:24<2:05:23, 20.73s/it] +2025-05-11 13:19:18 - ERROR - stderr - +2025-05-11 13:19:18 - ERROR - stderr - +2025-05-11 13:19:18 - INFO - stdout - {'loss': 0.4875, 'grad_norm': 0.9273150563240051, 'learning_rate': 4.899701536084134e-07, 'epoch': 2.71} +2025-05-11 13:19:18 - ERROR - stderr - 90%|█████████ | 3378/3741 [19:53:24<2:05:23, 20.73s/it] +2025-05-11 13:19:38 - ERROR - stderr - 90%|█████████ | 3379/3741 [19:53:44<2:03:30, 20.47s/it] +2025-05-11 13:19:38 - ERROR - stderr - +2025-05-11 13:19:38 - ERROR - stderr - +2025-05-11 13:19:38 - INFO - stdout - {'loss': 0.4932, 'grad_norm': 0.8953900933265686, 'learning_rate': 4.872964235075361e-07, 'epoch': 2.71} +2025-05-11 13:19:38 - ERROR - stderr - 90%|█████████ | 3379/3741 [19:53:44<2:03:30, 20.47s/it] +2025-05-11 13:20:00 - ERROR - stderr - 90%|█████████ | 3380/3741 [19:54:07<2:06:50, 21.08s/it] +2025-05-11 13:20:00 - ERROR - stderr - +2025-05-11 13:20:00 - ERROR - stderr - +2025-05-11 13:20:00 - INFO - stdout - {'loss': 0.4835, 'grad_norm': 0.8775522708892822, 'learning_rate': 4.846298263557681e-07, 'epoch': 2.71} +2025-05-11 13:20:00 - ERROR - stderr - 90%|█████████ | 3380/3741 [19:54:07<2:06:50, 21.08s/it] +2025-05-11 13:20:20 - ERROR - stderr - 90%|█████████ | 3381/3741 [19:54:26<2:03:30, 20.59s/it] +2025-05-11 13:20:20 - ERROR - stderr - +2025-05-11 13:20:20 - ERROR - stderr - +2025-05-11 13:20:20 - INFO - stdout - {'loss': 0.4748, 'grad_norm': 0.8793307542800903, 'learning_rate': 4.819703641526141e-07, 'epoch': 2.71} +2025-05-11 13:20:20 - ERROR - stderr - 90%|█████████ | 3381/3741 [19:54:26<2:03:30, 20.59s/it] +2025-05-11 13:20:43 - ERROR - stderr - 90%|█████████ | 3382/3741 [19:54:50<2:08:42, 21.51s/it] +2025-05-11 13:20:43 - ERROR - stderr - +2025-05-11 13:20:43 - ERROR - stderr - +2025-05-11 13:20:43 - INFO - stdout - {'loss': 0.4906, 'grad_norm': 0.8890559673309326, 'learning_rate': 4.793180388922292e-07, 'epoch': 2.71} +2025-05-11 13:20:43 - ERROR - stderr - 90%|█████████ | 3382/3741 [19:54:50<2:08:42, 21.51s/it] +2025-05-11 13:21:03 - ERROR - stderr - 90%|█████████ | 3383/3741 [19:55:09<2:05:18, 21.00s/it] +2025-05-11 13:21:03 - ERROR - stderr - +2025-05-11 13:21:03 - ERROR - stderr - +2025-05-11 13:21:03 - INFO - stdout - {'loss': 0.466, 'grad_norm': 0.8522745370864868, 'learning_rate': 4.766728525634179e-07, 'epoch': 2.71} +2025-05-11 13:21:03 - ERROR - stderr - 90%|█████████ | 3383/3741 [19:55:09<2:05:18, 21.00s/it] +2025-05-11 13:21:26 - ERROR - stderr - 90%|█████████ | 3384/3741 [19:55:33<2:09:04, 21.69s/it] +2025-05-11 13:21:26 - ERROR - stderr - +2025-05-11 13:21:26 - ERROR - stderr - +2025-05-11 13:21:26 - INFO - stdout - {'loss': 0.4565, 'grad_norm': 0.8627252578735352, 'learning_rate': 4.7403480714963037e-07, 'epoch': 2.71} +2025-05-11 13:21:26 - ERROR - stderr - 90%|█████████ | 3384/3741 [19:55:33<2:09:04, 21.69s/it] +2025-05-11 13:21:46 - ERROR - stderr - 90%|█████████ | 3385/3741 [19:55:52<2:04:24, 20.97s/it] +2025-05-11 13:21:46 - ERROR - stderr - +2025-05-11 13:21:46 - ERROR - stderr - +2025-05-11 13:21:46 - INFO - stdout - {'loss': 0.4707, 'grad_norm': 0.8617047071456909, 'learning_rate': 4.71403904628962e-07, 'epoch': 2.71} +2025-05-11 13:21:46 - ERROR - stderr - 90%|█████████ | 3385/3741 [19:55:52<2:04:24, 20.97s/it] +2025-05-11 13:22:06 - ERROR - stderr - 91%|█████████ | 3386/3741 [19:56:12<2:02:30, 20.71s/it] +2025-05-11 13:22:06 - ERROR - stderr - +2025-05-11 13:22:06 - ERROR - stderr - +2025-05-11 13:22:06 - INFO - stdout - {'loss': 0.4742, 'grad_norm': 0.8587998747825623, 'learning_rate': 4.6878014697415374e-07, 'epoch': 2.72} +2025-05-11 13:22:06 - ERROR - stderr - 91%|█████████ | 3386/3741 [19:56:12<2:02:30, 20.71s/it] +2025-05-11 13:22:25 - ERROR - stderr - 91%|█████████ | 3387/3741 [19:56:32<2:00:01, 20.34s/it] +2025-05-11 13:22:25 - ERROR - stderr - +2025-05-11 13:22:25 - ERROR - stderr - +2025-05-11 13:22:25 - INFO - stdout - {'loss': 0.4557, 'grad_norm': 0.8326482176780701, 'learning_rate': 4.661635361525885e-07, 'epoch': 2.72} +2025-05-11 13:22:25 - ERROR - stderr - 91%|█████████ | 3387/3741 [19:56:32<2:00:01, 20.34s/it] +2025-05-11 13:22:45 - ERROR - stderr - 91%|█████████ | 3388/3741 [19:56:51<1:58:07, 20.08s/it] +2025-05-11 13:22:45 - ERROR - stderr - +2025-05-11 13:22:45 - ERROR - stderr - +2025-05-11 13:22:45 - INFO - stdout - {'loss': 0.4722, 'grad_norm': 0.8801354765892029, 'learning_rate': 4.635540741262923e-07, 'epoch': 2.72} +2025-05-11 13:22:45 - ERROR - stderr - 91%|█████████ | 3388/3741 [19:56:51<1:58:07, 20.08s/it] +2025-05-11 13:23:06 - ERROR - stderr - 91%|█████████ | 3389/3741 [19:57:13<2:00:10, 20.48s/it] +2025-05-11 13:23:06 - ERROR - stderr - +2025-05-11 13:23:06 - ERROR - stderr - +2025-05-11 13:23:06 - INFO - stdout - {'loss': 0.4689, 'grad_norm': 0.8579322695732117, 'learning_rate': 4.6095176285192556e-07, 'epoch': 2.72} +2025-05-11 13:23:06 - ERROR - stderr - 91%|█████████ | 3389/3741 [19:57:13<2:00:10, 20.48s/it] +2025-05-11 13:23:26 - ERROR - stderr - 91%|█████████ | 3390/3741 [19:57:32<1:58:30, 20.26s/it] +2025-05-11 13:23:26 - ERROR - stderr - +2025-05-11 13:23:26 - ERROR - stderr - +2025-05-11 13:23:26 - INFO - stdout - {'loss': 0.4769, 'grad_norm': 0.9005702137947083, 'learning_rate': 4.583566042807908e-07, 'epoch': 2.72} +2025-05-11 13:23:26 - ERROR - stderr - 91%|█████████ | 3390/3741 [19:57:32<1:58:30, 20.26s/it] +2025-05-11 13:23:48 - ERROR - stderr - 91%|█████████ | 3391/3741 [19:57:55<2:01:50, 20.89s/it] +2025-05-11 13:23:48 - ERROR - stderr - +2025-05-11 13:23:48 - ERROR - stderr - +2025-05-11 13:23:48 - INFO - stdout - {'loss': 0.4771, 'grad_norm': 0.8632087707519531, 'learning_rate': 4.557686003588269e-07, 'epoch': 2.72} +2025-05-11 13:23:48 - ERROR - stderr - 91%|█████████ | 3391/3741 [19:57:55<2:01:50, 20.89s/it] +2025-05-11 13:24:08 - ERROR - stderr - 91%|█████████ | 3392/3741 [19:58:14<1:59:21, 20.52s/it] +2025-05-11 13:24:08 - ERROR - stderr - +2025-05-11 13:24:08 - ERROR - stderr - +2025-05-11 13:24:08 - INFO - stdout - {'loss': 0.4509, 'grad_norm': 0.8352670669555664, 'learning_rate': 4.531877530266071e-07, 'epoch': 2.72} +2025-05-11 13:24:08 - ERROR - stderr - 91%|█████████ | 3392/3741 [19:58:14<1:59:21, 20.52s/it] +2025-05-11 13:24:31 - ERROR - stderr - 91%|█████████ | 3393/3741 [19:58:37<2:03:39, 21.32s/it] +2025-05-11 13:24:31 - ERROR - stderr - +2025-05-11 13:24:31 - ERROR - stderr - +2025-05-11 13:24:31 - INFO - stdout - {'loss': 0.4455, 'grad_norm': 0.8583428263664246, 'learning_rate': 4.506140642193391e-07, 'epoch': 2.72} +2025-05-11 13:24:31 - ERROR - stderr - 91%|█████████ | 3393/3741 [19:58:37<2:03:39, 21.32s/it] +2025-05-11 13:24:51 - ERROR - stderr - 91%|█████████ | 3394/3741 [19:58:57<2:00:02, 20.76s/it] +2025-05-11 13:24:51 - ERROR - stderr - +2025-05-11 13:24:51 - ERROR - stderr - +2025-05-11 13:24:51 - INFO - stdout - {'loss': 0.4665, 'grad_norm': 0.9356517195701599, 'learning_rate': 4.4804753586686013e-07, 'epoch': 2.72} +2025-05-11 13:24:51 - ERROR - stderr - 91%|█████████ | 3394/3741 [19:58:57<2:00:02, 20.76s/it] +2025-05-11 13:25:14 - ERROR - stderr - 91%|█████████ | 3395/3741 [19:59:20<2:03:57, 21.50s/it] +2025-05-11 13:25:14 - ERROR - stderr - +2025-05-11 13:25:14 - ERROR - stderr - +2025-05-11 13:25:14 - INFO - stdout - {'loss': 0.4485, 'grad_norm': 0.8645097017288208, 'learning_rate': 4.454881698936431e-07, 'epoch': 2.72} +2025-05-11 13:25:14 - ERROR - stderr - 91%|█████████ | 3395/3741 [19:59:20<2:03:57, 21.50s/it] +2025-05-11 13:25:34 - ERROR - stderr - 91%|█████████ | 3396/3741 [19:59:40<2:00:36, 20.98s/it] +2025-05-11 13:25:34 - ERROR - stderr - +2025-05-11 13:25:34 - ERROR - stderr - +2025-05-11 13:25:34 - INFO - stdout - {'loss': 0.4833, 'grad_norm': 0.902538537979126, 'learning_rate': 4.4293596821878613e-07, 'epoch': 2.72} +2025-05-11 13:25:34 - ERROR - stderr - 91%|█████████ | 3396/3741 [19:59:40<2:00:36, 20.98s/it] +2025-05-11 13:25:56 - ERROR - stderr - 91%|█████████ | 3397/3741 [20:00:02<2:02:07, 21.30s/it] +2025-05-11 13:25:56 - ERROR - stderr - +2025-05-11 13:25:56 - ERROR - stderr - +2025-05-11 13:25:56 - INFO - stdout - {'loss': 0.4663, 'grad_norm': 0.8527434468269348, 'learning_rate': 4.403909327560207e-07, 'epoch': 2.72} +2025-05-11 13:25:56 - ERROR - stderr - 91%|█████████ | 3397/3741 [20:00:02<2:02:07, 21.30s/it] +2025-05-11 13:26:16 - ERROR - stderr - 91%|█████████ | 3398/3741 [20:00:22<1:59:37, 20.93s/it] +2025-05-11 13:26:16 - ERROR - stderr - +2025-05-11 13:26:16 - ERROR - stderr - +2025-05-11 13:26:16 - INFO - stdout - {'loss': 0.4596, 'grad_norm': 0.8391342759132385, 'learning_rate': 4.378530654136948e-07, 'epoch': 2.72} +2025-05-11 13:26:16 - ERROR - stderr - 91%|█████████ | 3398/3741 [20:00:22<1:59:37, 20.93s/it] +2025-05-11 13:26:38 - ERROR - stderr - 91%|█████████ | 3399/3741 [20:00:44<2:01:17, 21.28s/it] +2025-05-11 13:26:38 - ERROR - stderr - +2025-05-11 13:26:38 - ERROR - stderr - +2025-05-11 13:26:38 - INFO - stdout - {'loss': 0.461, 'grad_norm': 0.8753255009651184, 'learning_rate': 4.3532236809479265e-07, 'epoch': 2.73} +2025-05-11 13:26:38 - ERROR - stderr - 91%|█████████ | 3399/3741 [20:00:44<2:01:17, 21.28s/it] +2025-05-11 13:26:58 - ERROR - stderr - 91%|█████████ | 3400/3741 [20:01:04<1:58:48, 20.90s/it] +2025-05-11 13:26:58 - ERROR - stderr - +2025-05-11 13:26:58 - ERROR - stderr - +2025-05-11 13:26:58 - INFO - stdout - {'loss': 0.4777, 'grad_norm': 0.8954243659973145, 'learning_rate': 4.327988426969154e-07, 'epoch': 2.73} +2025-05-11 13:26:58 - ERROR - stderr - 91%|█████████ | 3400/3741 [20:01:04<1:58:48, 20.90s/it] +2025-05-11 13:27:20 - ERROR - stderr - 91%|█████████ | 3401/3741 [20:01:27<2:01:07, 21.37s/it] +2025-05-11 13:27:20 - ERROR - stderr - +2025-05-11 13:27:20 - ERROR - stderr - +2025-05-11 13:27:20 - INFO - stdout - {'loss': 0.5002, 'grad_norm': 0.8503953218460083, 'learning_rate': 4.3028249111228824e-07, 'epoch': 2.73} +2025-05-11 13:27:20 - ERROR - stderr - 91%|█████████ | 3401/3741 [20:01:27<2:01:07, 21.37s/it] +2025-05-11 13:27:40 - ERROR - stderr - 91%|█████████ | 3402/3741 [20:01:46<1:58:06, 20.90s/it] +2025-05-11 13:27:40 - ERROR - stderr - +2025-05-11 13:27:40 - ERROR - stderr - +2025-05-11 13:27:40 - INFO - stdout - {'loss': 0.4702, 'grad_norm': 0.8919215798377991, 'learning_rate': 4.277733152277597e-07, 'epoch': 2.73} +2025-05-11 13:27:40 - ERROR - stderr - 91%|█████████ | 3402/3741 [20:01:46<1:58:06, 20.90s/it] +2025-05-11 13:28:00 - ERROR - stderr - 91%|█████████ | 3403/3741 [20:02:06<1:56:21, 20.66s/it] +2025-05-11 13:28:00 - ERROR - stderr - +2025-05-11 13:28:00 - ERROR - stderr - +2025-05-11 13:28:00 - INFO - stdout - {'loss': 0.457, 'grad_norm': 0.8524766564369202, 'learning_rate': 4.2527131692479127e-07, 'epoch': 2.73} +2025-05-11 13:28:00 - ERROR - stderr - 91%|█████████ | 3403/3741 [20:02:07<1:56:21, 20.66s/it] +2025-05-11 13:28:20 - ERROR - stderr - 91%|█████████ | 3404/3741 [20:02:26<1:54:30, 20.39s/it] +2025-05-11 13:28:20 - ERROR - stderr - +2025-05-11 13:28:20 - ERROR - stderr - +2025-05-11 13:28:20 - INFO - stdout - {'loss': 0.4827, 'grad_norm': 0.8316718935966492, 'learning_rate': 4.227764980794691e-07, 'epoch': 2.73} +2025-05-11 13:28:20 - ERROR - stderr - 91%|█████████ | 3404/3741 [20:02:26<1:54:30, 20.39s/it] +2025-05-11 13:28:39 - ERROR - stderr - 91%|█████████ | 3405/3741 [20:02:46<1:52:43, 20.13s/it] +2025-05-11 13:28:39 - ERROR - stderr - +2025-05-11 13:28:39 - ERROR - stderr - +2025-05-11 13:28:39 - INFO - stdout - {'loss': 0.4885, 'grad_norm': 0.9017997980117798, 'learning_rate': 4.202888605624944e-07, 'epoch': 2.73} +2025-05-11 13:28:39 - ERROR - stderr - 91%|█████████ | 3405/3741 [20:02:46<1:52:43, 20.13s/it] +2025-05-11 13:28:59 - ERROR - stderr - 91%|█████████ | 3406/3741 [20:03:06<1:51:49, 20.03s/it] +2025-05-11 13:28:59 - ERROR - stderr - +2025-05-11 13:28:59 - ERROR - stderr - +2025-05-11 13:28:59 - INFO - stdout - {'loss': 0.4698, 'grad_norm': 0.8500910401344299, 'learning_rate': 4.178084062391774e-07, 'epoch': 2.73} +2025-05-11 13:28:59 - ERROR - stderr - 91%|█████████ | 3406/3741 [20:03:06<1:51:49, 20.03s/it] +2025-05-11 13:29:19 - ERROR - stderr - 91%|█████████ | 3407/3741 [20:03:25<1:50:59, 19.94s/it] +2025-05-11 13:29:19 - ERROR - stderr - +2025-05-11 13:29:19 - ERROR - stderr - +2025-05-11 13:29:19 - INFO - stdout - {'loss': 0.4843, 'grad_norm': 0.8581136465072632, 'learning_rate': 4.153351369694536e-07, 'epoch': 2.73} +2025-05-11 13:29:19 - ERROR - stderr - 91%|█████████ | 3407/3741 [20:03:25<1:50:59, 19.94s/it] +2025-05-11 13:29:39 - ERROR - stderr - 91%|█████████ | 3408/3741 [20:03:46<1:51:32, 20.10s/it] +2025-05-11 13:29:39 - ERROR - stderr - +2025-05-11 13:29:39 - ERROR - stderr - +2025-05-11 13:29:39 - INFO - stdout - {'loss': 0.4624, 'grad_norm': 0.8434959053993225, 'learning_rate': 4.128690546078606e-07, 'epoch': 2.73} +2025-05-11 13:29:39 - ERROR - stderr - 91%|█████████ | 3408/3741 [20:03:46<1:51:32, 20.10s/it] +2025-05-11 13:29:59 - ERROR - stderr - 91%|█████████ | 3409/3741 [20:04:05<1:50:26, 19.96s/it] +2025-05-11 13:29:59 - ERROR - stderr - +2025-05-11 13:29:59 - ERROR - stderr - +2025-05-11 13:29:59 - INFO - stdout - {'loss': 0.4558, 'grad_norm': 0.9332457184791565, 'learning_rate': 4.104101610035527e-07, 'epoch': 2.73} +2025-05-11 13:29:59 - ERROR - stderr - 91%|█████████ | 3409/3741 [20:04:05<1:50:26, 19.96s/it] +2025-05-11 13:30:21 - ERROR - stderr - 91%|█████████ | 3410/3741 [20:04:27<1:53:29, 20.57s/it] +2025-05-11 13:30:21 - ERROR - stderr - +2025-05-11 13:30:21 - ERROR - stderr - +2025-05-11 13:30:21 - INFO - stdout - {'loss': 0.4803, 'grad_norm': 0.8714927434921265, 'learning_rate': 4.0795845800029156e-07, 'epoch': 2.73} +2025-05-11 13:30:21 - ERROR - stderr - 91%|█████████ | 3410/3741 [20:04:27<1:53:29, 20.57s/it] +2025-05-11 13:30:41 - ERROR - stderr - 91%|█████████ | 3411/3741 [20:04:47<1:51:54, 20.35s/it] +2025-05-11 13:30:41 - ERROR - stderr - +2025-05-11 13:30:41 - ERROR - stderr - +2025-05-11 13:30:41 - INFO - stdout - {'loss': 0.4536, 'grad_norm': 0.8897708654403687, 'learning_rate': 4.055139474364456e-07, 'epoch': 2.74} +2025-05-11 13:30:41 - ERROR - stderr - 91%|█████████ | 3411/3741 [20:04:47<1:51:54, 20.35s/it] +2025-05-11 13:31:03 - ERROR - stderr - 91%|█████████ | 3412/3741 [20:05:09<1:54:29, 20.88s/it] +2025-05-11 13:31:03 - ERROR - stderr - +2025-05-11 13:31:03 - ERROR - stderr - +2025-05-11 13:31:03 - INFO - stdout - {'loss': 0.4663, 'grad_norm': 0.8492273688316345, 'learning_rate': 4.030766311449952e-07, 'epoch': 2.74} +2025-05-11 13:31:03 - ERROR - stderr - 91%|█████████ | 3412/3741 [20:05:09<1:54:29, 20.88s/it] +2025-05-11 13:31:23 - ERROR - stderr - 91%|█████████ | 3413/3741 [20:05:29<1:52:42, 20.62s/it] +2025-05-11 13:31:23 - ERROR - stderr - +2025-05-11 13:31:23 - ERROR - stderr - +2025-05-11 13:31:23 - INFO - stdout - {'loss': 0.4609, 'grad_norm': 0.8852546811103821, 'learning_rate': 4.006465109535218e-07, 'epoch': 2.74} +2025-05-11 13:31:23 - ERROR - stderr - 91%|█████████ | 3413/3741 [20:05:29<1:52:42, 20.62s/it] +2025-05-11 13:31:46 - ERROR - stderr - 91%|█████████▏| 3414/3741 [20:05:52<1:55:37, 21.21s/it] +2025-05-11 13:31:46 - ERROR - stderr - +2025-05-11 13:31:46 - ERROR - stderr - +2025-05-11 13:31:46 - INFO - stdout - {'loss': 0.4631, 'grad_norm': 0.8187031149864197, 'learning_rate': 3.9822358868421116e-07, 'epoch': 2.74} +2025-05-11 13:31:46 - ERROR - stderr - 91%|█████████▏| 3414/3741 [20:05:52<1:55:37, 21.21s/it] +2025-05-11 13:32:06 - ERROR - stderr - 91%|█████████▏| 3415/3741 [20:06:12<1:53:46, 20.94s/it] +2025-05-11 13:32:06 - ERROR - stderr - +2025-05-11 13:32:06 - ERROR - stderr - +2025-05-11 13:32:06 - INFO - stdout - {'loss': 0.4694, 'grad_norm': 0.8688458204269409, 'learning_rate': 3.958078661538567e-07, 'epoch': 2.74} +2025-05-11 13:32:06 - ERROR - stderr - 91%|█████████▏| 3415/3741 [20:06:12<1:53:46, 20.94s/it] +2025-05-11 13:32:28 - ERROR - stderr - 91%|█████████▏| 3416/3741 [20:06:34<1:54:43, 21.18s/it] +2025-05-11 13:32:28 - ERROR - stderr - +2025-05-11 13:32:28 - ERROR - stderr - +2025-05-11 13:32:28 - INFO - stdout - {'loss': 0.4959, 'grad_norm': 0.97969651222229, 'learning_rate': 3.933993451738427e-07, 'epoch': 2.74} +2025-05-11 13:32:28 - ERROR - stderr - 91%|█████████▏| 3416/3741 [20:06:34<1:54:43, 21.18s/it] +2025-05-11 13:32:47 - ERROR - stderr - 91%|█████████▏| 3417/3741 [20:06:54<1:51:47, 20.70s/it] +2025-05-11 13:32:47 - ERROR - stderr - +2025-05-11 13:32:47 - ERROR - stderr - +2025-05-11 13:32:47 - INFO - stdout - {'loss': 0.4829, 'grad_norm': 0.8926424384117126, 'learning_rate': 3.909980275501679e-07, 'epoch': 2.74} +2025-05-11 13:32:47 - ERROR - stderr - 91%|█████████▏| 3417/3741 [20:06:54<1:51:47, 20.70s/it] +2025-05-11 13:33:10 - ERROR - stderr - 91%|█████████▏| 3418/3741 [20:07:16<1:53:57, 21.17s/it] +2025-05-11 13:33:10 - ERROR - stderr - +2025-05-11 13:33:10 - ERROR - stderr - +2025-05-11 13:33:10 - INFO - stdout - {'loss': 0.4633, 'grad_norm': 0.8417994379997253, 'learning_rate': 3.8860391508341754e-07, 'epoch': 2.74} +2025-05-11 13:33:10 - ERROR - stderr - 91%|█████████▏| 3418/3741 [20:07:16<1:53:57, 21.17s/it] +2025-05-11 13:33:29 - ERROR - stderr - 91%|█████████▏| 3419/3741 [20:07:35<1:50:57, 20.67s/it] +2025-05-11 13:33:29 - ERROR - stderr - +2025-05-11 13:33:29 - ERROR - stderr - +2025-05-11 13:33:29 - INFO - stdout - {'loss': 0.4935, 'grad_norm': 0.9168954491615295, 'learning_rate': 3.8621700956877784e-07, 'epoch': 2.74} +2025-05-11 13:33:29 - ERROR - stderr - 91%|█████████▏| 3419/3741 [20:07:35<1:50:57, 20.67s/it] +2025-05-11 13:33:51 - ERROR - stderr - 91%|█████████▏| 3420/3741 [20:07:57<1:52:14, 20.98s/it] +2025-05-11 13:33:51 - ERROR - stderr - +2025-05-11 13:33:51 - ERROR - stderr - +2025-05-11 13:33:51 - INFO - stdout - {'loss': 0.4714, 'grad_norm': 0.8628125786781311, 'learning_rate': 3.8383731279603597e-07, 'epoch': 2.74} +2025-05-11 13:33:51 - ERROR - stderr - 91%|█████████▏| 3420/3741 [20:07:57<1:52:14, 20.98s/it] +2025-05-11 13:34:10 - ERROR - stderr - 91%|█████████▏| 3421/3741 [20:08:17<1:49:39, 20.56s/it] +2025-05-11 13:34:10 - ERROR - stderr - +2025-05-11 13:34:10 - ERROR - stderr - +2025-05-11 13:34:10 - INFO - stdout - {'loss': 0.4796, 'grad_norm': 0.8950872421264648, 'learning_rate': 3.8146482654956574e-07, 'epoch': 2.74} +2025-05-11 13:34:10 - ERROR - stderr - 91%|█████████▏| 3421/3741 [20:08:17<1:49:39, 20.56s/it] +2025-05-11 13:34:32 - ERROR - stderr - 91%|█████████▏| 3422/3741 [20:08:38<1:50:39, 20.81s/it] +2025-05-11 13:34:32 - ERROR - stderr - +2025-05-11 13:34:32 - ERROR - stderr - +2025-05-11 13:34:32 - INFO - stdout - {'loss': 0.4808, 'grad_norm': 0.8650218844413757, 'learning_rate': 3.7909955260833966e-07, 'epoch': 2.74} +2025-05-11 13:34:32 - ERROR - stderr - 91%|█████████▏| 3422/3741 [20:08:38<1:50:39, 20.81s/it] +2025-05-11 13:34:51 - ERROR - stderr - 91%|█████████▏| 3423/3741 [20:08:58<1:48:23, 20.45s/it] +2025-05-11 13:34:51 - ERROR - stderr - +2025-05-11 13:34:51 - ERROR - stderr - +2025-05-11 13:34:51 - INFO - stdout - {'loss': 0.4782, 'grad_norm': 0.8490626215934753, 'learning_rate': 3.767414927459223e-07, 'epoch': 2.74} +2025-05-11 13:34:51 - ERROR - stderr - 91%|█████████▏| 3423/3741 [20:08:58<1:48:23, 20.45s/it] +2025-05-11 13:35:13 - ERROR - stderr - 92%|█████████▏| 3424/3741 [20:09:20<1:50:41, 20.95s/it] +2025-05-11 13:35:13 - ERROR - stderr - +2025-05-11 13:35:13 - ERROR - stderr - +2025-05-11 13:35:13 - INFO - stdout - {'loss': 0.4578, 'grad_norm': 0.8351301550865173, 'learning_rate': 3.743906487304627e-07, 'epoch': 2.75} +2025-05-11 13:35:13 - ERROR - stderr - 92%|█████████▏| 3424/3741 [20:09:20<1:50:41, 20.95s/it] +2025-05-11 13:35:34 - ERROR - stderr - 92%|█████████▏| 3425/3741 [20:09:40<1:49:02, 20.71s/it] +2025-05-11 13:35:34 - ERROR - stderr - +2025-05-11 13:35:34 - ERROR - stderr - +2025-05-11 13:35:34 - INFO - stdout - {'loss': 0.4564, 'grad_norm': 0.8559272289276123, 'learning_rate': 3.720470223247097e-07, 'epoch': 2.75} +2025-05-11 13:35:34 - ERROR - stderr - 92%|█████████▏| 3425/3741 [20:09:40<1:49:02, 20.71s/it] +2025-05-11 13:35:54 - ERROR - stderr - 92%|█████████▏| 3426/3741 [20:10:00<1:47:46, 20.53s/it] +2025-05-11 13:35:54 - ERROR - stderr - +2025-05-11 13:35:54 - ERROR - stderr - +2025-05-11 13:35:54 - INFO - stdout - {'loss': 0.4932, 'grad_norm': 0.8744351267814636, 'learning_rate': 3.697106152859886e-07, 'epoch': 2.75} +2025-05-11 13:35:54 - ERROR - stderr - 92%|█████████▏| 3426/3741 [20:10:00<1:47:46, 20.53s/it] +2025-05-11 13:36:13 - ERROR - stderr - 92%|█████████▏| 3427/3741 [20:10:20<1:46:06, 20.27s/it] +2025-05-11 13:36:13 - ERROR - stderr - +2025-05-11 13:36:13 - ERROR - stderr - +2025-05-11 13:36:13 - INFO - stdout - {'loss': 0.4939, 'grad_norm': 0.8866355419158936, 'learning_rate': 3.6738142936622035e-07, 'epoch': 2.75} +2025-05-11 13:36:13 - ERROR - stderr - 92%|█████████▏| 3427/3741 [20:10:20<1:46:06, 20.27s/it] +2025-05-11 13:36:33 - ERROR - stderr - 92%|█████████▏| 3428/3741 [20:10:39<1:44:25, 20.02s/it] +2025-05-11 13:36:33 - ERROR - stderr - +2025-05-11 13:36:33 - ERROR - stderr - +2025-05-11 13:36:33 - INFO - stdout - {'loss': 0.4924, 'grad_norm': 0.8977624177932739, 'learning_rate': 3.650594663119089e-07, 'epoch': 2.75} +2025-05-11 13:36:33 - ERROR - stderr - 92%|█████████▏| 3428/3741 [20:10:39<1:44:25, 20.02s/it] +2025-05-11 13:36:53 - ERROR - stderr - 92%|█████████▏| 3429/3741 [20:10:59<1:43:54, 19.98s/it] +2025-05-11 13:36:53 - ERROR - stderr - +2025-05-11 13:36:53 - ERROR - stderr - +2025-05-11 13:36:53 - INFO - stdout - {'loss': 0.4753, 'grad_norm': 0.8765467405319214, 'learning_rate': 3.6274472786413605e-07, 'epoch': 2.75} +2025-05-11 13:36:53 - ERROR - stderr - 92%|█████████▏| 3429/3741 [20:10:59<1:43:54, 19.98s/it] +2025-05-11 13:37:12 - ERROR - stderr - 92%|█████████▏| 3430/3741 [20:11:19<1:43:16, 19.93s/it] +2025-05-11 13:37:13 - ERROR - stderr - +2025-05-11 13:37:13 - ERROR - stderr - +2025-05-11 13:37:13 - INFO - stdout - {'loss': 0.4566, 'grad_norm': 0.8574792146682739, 'learning_rate': 3.604372157585767e-07, 'epoch': 2.75} +2025-05-11 13:37:13 - ERROR - stderr - 92%|█████████▏| 3430/3741 [20:11:19<1:43:16, 19.93s/it] +2025-05-11 13:37:32 - ERROR - stderr - 92%|█████████▏| 3431/3741 [20:11:38<1:42:31, 19.84s/it] +2025-05-11 13:37:32 - ERROR - stderr - +2025-05-11 13:37:32 - ERROR - stderr - +2025-05-11 13:37:32 - INFO - stdout - {'loss': 0.4866, 'grad_norm': 0.9319592714309692, 'learning_rate': 3.5813693172548016e-07, 'epoch': 2.75} +2025-05-11 13:37:32 - ERROR - stderr - 92%|█████████▏| 3431/3741 [20:11:38<1:42:31, 19.84s/it] +2025-05-11 13:37:52 - ERROR - stderr - 92%|█████████▏| 3432/3741 [20:11:58<1:41:29, 19.71s/it] +2025-05-11 13:37:52 - ERROR - stderr - +2025-05-11 13:37:52 - ERROR - stderr - +2025-05-11 13:37:52 - INFO - stdout - {'loss': 0.4619, 'grad_norm': 0.8465300798416138, 'learning_rate': 3.5584387748967665e-07, 'epoch': 2.75} +2025-05-11 13:37:52 - ERROR - stderr - 92%|█████████▏| 3432/3741 [20:11:58<1:41:29, 19.71s/it] +2025-05-11 13:38:11 - ERROR - stderr - 92%|█████████▏| 3433/3741 [20:12:18<1:41:05, 19.69s/it] +2025-05-11 13:38:11 - ERROR - stderr - +2025-05-11 13:38:11 - ERROR - stderr - +2025-05-11 13:38:11 - INFO - stdout - {'loss': 0.4739, 'grad_norm': 0.8767644166946411, 'learning_rate': 3.535580547705797e-07, 'epoch': 2.75} +2025-05-11 13:38:11 - ERROR - stderr - 92%|█████████▏| 3433/3741 [20:12:18<1:41:05, 19.69s/it] +2025-05-11 13:38:31 - ERROR - stderr - 92%|█████████▏| 3434/3741 [20:12:37<1:40:43, 19.69s/it] +2025-05-11 13:38:31 - ERROR - stderr - +2025-05-11 13:38:31 - ERROR - stderr - +2025-05-11 13:38:31 - INFO - stdout - {'loss': 0.478, 'grad_norm': 0.8457480072975159, 'learning_rate': 3.512794652821716e-07, 'epoch': 2.75} +2025-05-11 13:38:31 - ERROR - stderr - 92%|█████████▏| 3434/3741 [20:12:37<1:40:43, 19.69s/it] +2025-05-11 13:38:52 - ERROR - stderr - 92%|█████████▏| 3435/3741 [20:12:58<1:42:39, 20.13s/it] +2025-05-11 13:38:52 - ERROR - stderr - +2025-05-11 13:38:52 - ERROR - stderr - +2025-05-11 13:38:52 - INFO - stdout - {'loss': 0.4537, 'grad_norm': 0.871969997882843, 'learning_rate': 3.490081107330223e-07, 'epoch': 2.75} +2025-05-11 13:38:52 - ERROR - stderr - 92%|█████████▏| 3435/3741 [20:12:58<1:42:39, 20.13s/it] +2025-05-11 13:39:12 - ERROR - stderr - 92%|█████████▏| 3436/3741 [20:13:18<1:41:33, 19.98s/it] +2025-05-11 13:39:12 - ERROR - stderr - +2025-05-11 13:39:12 - ERROR - stderr - +2025-05-11 13:39:12 - INFO - stdout - {'loss': 0.4704, 'grad_norm': 0.8666412234306335, 'learning_rate': 3.4674399282626616e-07, 'epoch': 2.76} +2025-05-11 13:39:12 - ERROR - stderr - 92%|█████████▏| 3436/3741 [20:13:18<1:41:33, 19.98s/it] +2025-05-11 13:39:34 - ERROR - stderr - 92%|█████████▏| 3437/3741 [20:13:40<1:44:42, 20.67s/it] +2025-05-11 13:39:34 - ERROR - stderr - +2025-05-11 13:39:34 - ERROR - stderr - +2025-05-11 13:39:34 - INFO - stdout - {'loss': 0.4902, 'grad_norm': 0.8694742918014526, 'learning_rate': 3.4448711325961834e-07, 'epoch': 2.76} +2025-05-11 13:39:34 - ERROR - stderr - 92%|█████████▏| 3437/3741 [20:13:40<1:44:42, 20.67s/it] +2025-05-11 13:39:53 - ERROR - stderr - 92%|█████████▏| 3438/3741 [20:14:00<1:42:31, 20.30s/it] +2025-05-11 13:39:53 - ERROR - stderr - +2025-05-11 13:39:53 - ERROR - stderr - +2025-05-11 13:39:53 - INFO - stdout - {'loss': 0.4604, 'grad_norm': 0.8497282266616821, 'learning_rate': 3.422374737253642e-07, 'epoch': 2.76} +2025-05-11 13:39:53 - ERROR - stderr - 92%|█████████▏| 3438/3741 [20:14:00<1:42:31, 20.30s/it] +2025-05-11 13:40:15 - ERROR - stderr - 92%|█████████▏| 3439/3741 [20:14:21<1:44:17, 20.72s/it] +2025-05-11 13:40:15 - ERROR - stderr - +2025-05-11 13:40:15 - ERROR - stderr - +2025-05-11 13:40:15 - INFO - stdout - {'loss': 0.4865, 'grad_norm': 0.9280922412872314, 'learning_rate': 3.399950759103576e-07, 'epoch': 2.76} +2025-05-11 13:40:15 - ERROR - stderr - 92%|█████████▏| 3439/3741 [20:14:21<1:44:17, 20.72s/it] +2025-05-11 13:40:35 - ERROR - stderr - 92%|█████████▏| 3440/3741 [20:14:42<1:43:01, 20.54s/it] +2025-05-11 13:40:35 - ERROR - stderr - +2025-05-11 13:40:35 - ERROR - stderr - +2025-05-11 13:40:35 - INFO - stdout - {'loss': 0.477, 'grad_norm': 0.8891953229904175, 'learning_rate': 3.37759921496027e-07, 'epoch': 2.76} +2025-05-11 13:40:35 - ERROR - stderr - 92%|█████████▏| 3440/3741 [20:14:42<1:43:01, 20.54s/it] +2025-05-11 13:40:57 - ERROR - stderr - 92%|█████████▏| 3441/3741 [20:15:03<1:44:03, 20.81s/it] +2025-05-11 13:40:57 - ERROR - stderr - +2025-05-11 13:40:57 - ERROR - stderr - +2025-05-11 13:40:57 - INFO - stdout - {'loss': 0.4823, 'grad_norm': 0.9152265787124634, 'learning_rate': 3.355320121583672e-07, 'epoch': 2.76} +2025-05-11 13:40:57 - ERROR - stderr - 92%|█████████▏| 3441/3741 [20:15:03<1:44:03, 20.81s/it] +2025-05-11 13:41:17 - ERROR - stderr - 92%|█████████▏| 3442/3741 [20:15:23<1:42:23, 20.55s/it] +2025-05-11 13:41:17 - ERROR - stderr - +2025-05-11 13:41:17 - ERROR - stderr - +2025-05-11 13:41:17 - INFO - stdout - {'loss': 0.474, 'grad_norm': 0.8602423071861267, 'learning_rate': 3.3331134956793965e-07, 'epoch': 2.76} +2025-05-11 13:41:17 - ERROR - stderr - 92%|█████████▏| 3442/3741 [20:15:23<1:42:23, 20.55s/it] +2025-05-11 13:41:39 - ERROR - stderr - 92%|█████████▏| 3443/3741 [20:15:45<1:44:26, 21.03s/it] +2025-05-11 13:41:39 - ERROR - stderr - +2025-05-11 13:41:39 - ERROR - stderr - +2025-05-11 13:41:39 - INFO - stdout - {'loss': 0.4873, 'grad_norm': 0.8670658469200134, 'learning_rate': 3.3109793538987356e-07, 'epoch': 2.76} +2025-05-11 13:41:39 - ERROR - stderr - 92%|█████████▏| 3443/3741 [20:15:45<1:44:26, 21.03s/it] +2025-05-11 13:41:59 - ERROR - stderr - 92%|█████████▏| 3444/3741 [20:16:05<1:42:56, 20.80s/it] +2025-05-11 13:41:59 - ERROR - stderr - +2025-05-11 13:41:59 - ERROR - stderr - +2025-05-11 13:41:59 - INFO - stdout - {'loss': 0.4697, 'grad_norm': 0.9038890600204468, 'learning_rate': 3.288917712838613e-07, 'epoch': 2.76} +2025-05-11 13:41:59 - ERROR - stderr - 92%|█████████▏| 3444/3741 [20:16:05<1:42:56, 20.80s/it] +2025-05-11 13:42:22 - ERROR - stderr - 92%|█████████▏| 3445/3741 [20:16:28<1:45:27, 21.38s/it] +2025-05-11 13:42:22 - ERROR - stderr - +2025-05-11 13:42:22 - ERROR - stderr - +2025-05-11 13:42:22 - INFO - stdout - {'loss': 0.4837, 'grad_norm': 0.9339286684989929, 'learning_rate': 3.266928589041607e-07, 'epoch': 2.76} +2025-05-11 13:42:22 - ERROR - stderr - 92%|█████████▏| 3445/3741 [20:16:28<1:45:27, 21.38s/it] +2025-05-11 13:42:42 - ERROR - stderr - 92%|█████████▏| 3446/3741 [20:16:48<1:43:20, 21.02s/it] +2025-05-11 13:42:42 - ERROR - stderr - +2025-05-11 13:42:42 - ERROR - stderr - +2025-05-11 13:42:42 - INFO - stdout - {'loss': 0.4526, 'grad_norm': 0.8574177622795105, 'learning_rate': 3.2450119989959064e-07, 'epoch': 2.76} +2025-05-11 13:42:42 - ERROR - stderr - 92%|█████████▏| 3446/3741 [20:16:48<1:43:20, 21.02s/it] +2025-05-11 13:43:04 - ERROR - stderr - 92%|█████████▏| 3447/3741 [20:17:10<1:44:38, 21.35s/it] +2025-05-11 13:43:04 - ERROR - stderr - +2025-05-11 13:43:04 - ERROR - stderr - +2025-05-11 13:43:04 - INFO - stdout - {'loss': 0.4726, 'grad_norm': 0.8637518882751465, 'learning_rate': 3.2231679591353203e-07, 'epoch': 2.76} +2025-05-11 13:43:04 - ERROR - stderr - 92%|█████████▏| 3447/3741 [20:17:10<1:44:38, 21.35s/it] +2025-05-11 13:43:23 - ERROR - stderr - 92%|█████████▏| 3448/3741 [20:17:30<1:41:25, 20.77s/it] +2025-05-11 13:43:23 - ERROR - stderr - +2025-05-11 13:43:23 - ERROR - stderr - +2025-05-11 13:43:23 - INFO - stdout - {'loss': 0.4474, 'grad_norm': 0.8453518152236938, 'learning_rate': 3.201396485839259e-07, 'epoch': 2.77} +2025-05-11 13:43:23 - ERROR - stderr - 92%|█████████▏| 3448/3741 [20:17:30<1:41:25, 20.77s/it] +2025-05-11 13:43:45 - ERROR - stderr - 92%|█████████▏| 3449/3741 [20:17:51<1:41:33, 20.87s/it] +2025-05-11 13:43:45 - ERROR - stderr - +2025-05-11 13:43:45 - ERROR - stderr - +2025-05-11 13:43:45 - INFO - stdout - {'loss': 0.4729, 'grad_norm': 0.8873100280761719, 'learning_rate': 3.179697595432707e-07, 'epoch': 2.77} +2025-05-11 13:43:45 - ERROR - stderr - 92%|█████████▏| 3449/3741 [20:17:51<1:41:33, 20.87s/it] +2025-05-11 13:44:04 - ERROR - stderr - 92%|█████████▏| 3450/3741 [20:18:10<1:39:07, 20.44s/it] +2025-05-11 13:44:04 - ERROR - stderr - +2025-05-11 13:44:04 - ERROR - stderr - +2025-05-11 13:44:04 - INFO - stdout - {'loss': 0.4532, 'grad_norm': 0.8434620499610901, 'learning_rate': 3.158071304186228e-07, 'epoch': 2.77} +2025-05-11 13:44:04 - ERROR - stderr - 92%|█████████▏| 3450/3741 [20:18:10<1:39:07, 20.44s/it] +2025-05-11 13:44:24 - ERROR - stderr - 92%|█████████▏| 3451/3741 [20:18:30<1:37:39, 20.21s/it] +2025-05-11 13:44:24 - ERROR - stderr - +2025-05-11 13:44:24 - ERROR - stderr - +2025-05-11 13:44:24 - INFO - stdout - {'loss': 0.4721, 'grad_norm': 0.8255607485771179, 'learning_rate': 3.136517628315949e-07, 'epoch': 2.77} +2025-05-11 13:44:24 - ERROR - stderr - 92%|█████████▏| 3451/3741 [20:18:30<1:37:39, 20.21s/it] +2025-05-11 13:44:43 - ERROR - stderr - 92%|█████████▏| 3452/3741 [20:18:50<1:36:35, 20.05s/it] +2025-05-11 13:44:43 - ERROR - stderr - +2025-05-11 13:44:43 - ERROR - stderr - +2025-05-11 13:44:43 - INFO - stdout - {'loss': 0.4832, 'grad_norm': 0.8967651724815369, 'learning_rate': 3.1150365839835773e-07, 'epoch': 2.77} +2025-05-11 13:44:43 - ERROR - stderr - 92%|█████████▏| 3452/3741 [20:18:50<1:36:35, 20.05s/it] +2025-05-11 13:45:03 - ERROR - stderr - 92%|█████████▏| 3453/3741 [20:19:09<1:35:29, 19.90s/it] +2025-05-11 13:45:03 - ERROR - stderr - +2025-05-11 13:45:03 - ERROR - stderr - +2025-05-11 13:45:03 - INFO - stdout - {'loss': 0.5076, 'grad_norm': 0.9030566811561584, 'learning_rate': 3.093628187296294e-07, 'epoch': 2.77} +2025-05-11 13:45:03 - ERROR - stderr - 92%|█████████▏| 3453/3741 [20:19:09<1:35:29, 19.90s/it] +2025-05-11 13:45:23 - ERROR - stderr - 92%|█████████▏| 3454/3741 [20:19:29<1:35:03, 19.87s/it] +2025-05-11 13:45:23 - ERROR - stderr - +2025-05-11 13:45:23 - ERROR - stderr - +2025-05-11 13:45:23 - INFO - stdout - {'loss': 0.4568, 'grad_norm': 0.8555736541748047, 'learning_rate': 3.0722924543068687e-07, 'epoch': 2.77} +2025-05-11 13:45:23 - ERROR - stderr - 92%|█████████▏| 3454/3741 [20:19:29<1:35:03, 19.87s/it] +2025-05-11 13:45:42 - ERROR - stderr - 92%|█████████▏| 3455/3741 [20:19:49<1:34:31, 19.83s/it] +2025-05-11 13:45:42 - ERROR - stderr - +2025-05-11 13:45:42 - ERROR - stderr - +2025-05-11 13:45:42 - INFO - stdout - {'loss': 0.4755, 'grad_norm': 0.8584993481636047, 'learning_rate': 3.0510294010135387e-07, 'epoch': 2.77} +2025-05-11 13:45:42 - ERROR - stderr - 92%|█████████▏| 3455/3741 [20:19:49<1:34:31, 19.83s/it] +2025-05-11 13:46:04 - ERROR - stderr - 92%|█████████▏| 3456/3741 [20:20:10<1:36:03, 20.22s/it] +2025-05-11 13:46:04 - ERROR - stderr - +2025-05-11 13:46:04 - ERROR - stderr - +2025-05-11 13:46:04 - INFO - stdout - {'loss': 0.494, 'grad_norm': 0.9319979548454285, 'learning_rate': 3.0298390433600945e-07, 'epoch': 2.77} +2025-05-11 13:46:04 - ERROR - stderr - 92%|█████████▏| 3456/3741 [20:20:10<1:36:03, 20.22s/it] +2025-05-11 13:46:23 - ERROR - stderr - 92%|█████████▏| 3457/3741 [20:20:30<1:34:59, 20.07s/it] +2025-05-11 13:46:23 - ERROR - stderr - +2025-05-11 13:46:23 - ERROR - stderr - +2025-05-11 13:46:23 - INFO - stdout - {'loss': 0.4819, 'grad_norm': 0.8971447348594666, 'learning_rate': 3.008721397235781e-07, 'epoch': 2.77} +2025-05-11 13:46:23 - ERROR - stderr - 92%|█████████▏| 3457/3741 [20:20:30<1:34:59, 20.07s/it] +2025-05-11 13:46:46 - ERROR - stderr - 92%|█████████▏| 3458/3741 [20:20:52<1:38:27, 20.87s/it] +2025-05-11 13:46:46 - ERROR - stderr - +2025-05-11 13:46:46 - ERROR - stderr - +2025-05-11 13:46:46 - INFO - stdout - {'loss': 0.4959, 'grad_norm': 0.8856346011161804, 'learning_rate': 2.9876764784753096e-07, 'epoch': 2.77} +2025-05-11 13:46:46 - ERROR - stderr - 92%|█████████▏| 3458/3741 [20:20:52<1:38:27, 20.87s/it] +2025-05-11 13:47:06 - ERROR - stderr - 92%|█████████▏| 3459/3741 [20:21:12<1:36:26, 20.52s/it] +2025-05-11 13:47:06 - ERROR - stderr - +2025-05-11 13:47:06 - ERROR - stderr - +2025-05-11 13:47:06 - INFO - stdout - {'loss': 0.4604, 'grad_norm': 0.8331868052482605, 'learning_rate': 2.966704302858892e-07, 'epoch': 2.77} +2025-05-11 13:47:06 - ERROR - stderr - 92%|█████████▏| 3459/3741 [20:21:12<1:36:26, 20.52s/it] +2025-05-11 13:47:29 - ERROR - stderr - 92%|█████████▏| 3460/3741 [20:21:35<1:39:19, 21.21s/it] +2025-05-11 13:47:29 - ERROR - stderr - +2025-05-11 13:47:29 - ERROR - stderr - +2025-05-11 13:47:29 - INFO - stdout - {'loss': 0.4757, 'grad_norm': 0.8967451453208923, 'learning_rate': 2.945804886112169e-07, 'epoch': 2.77} +2025-05-11 13:47:29 - ERROR - stderr - 92%|█████████▏| 3460/3741 [20:21:35<1:39:19, 21.21s/it] +2025-05-11 13:47:48 - ERROR - stderr - 93%|█████████▎| 3461/3741 [20:21:54<1:36:35, 20.70s/it] +2025-05-11 13:47:48 - ERROR - stderr - +2025-05-11 13:47:48 - ERROR - stderr - +2025-05-11 13:47:48 - INFO - stdout - {'loss': 0.4762, 'grad_norm': 0.8893353343009949, 'learning_rate': 2.924978243906251e-07, 'epoch': 2.78} +2025-05-11 13:47:48 - ERROR - stderr - 93%|█████████▎| 3461/3741 [20:21:54<1:36:35, 20.70s/it] +2025-05-11 13:48:10 - ERROR - stderr - 93%|█████████▎| 3462/3741 [20:22:17<1:38:25, 21.17s/it] +2025-05-11 13:48:10 - ERROR - stderr - +2025-05-11 13:48:10 - ERROR - stderr - +2025-05-11 13:48:10 - INFO - stdout - {'loss': 0.4907, 'grad_norm': 0.9258598685264587, 'learning_rate': 2.9042243918576574e-07, 'epoch': 2.78} +2025-05-11 13:48:10 - ERROR - stderr - 93%|█████████▎| 3462/3741 [20:22:17<1:38:25, 21.17s/it] +2025-05-11 13:48:30 - ERROR - stderr - 93%|█████████▎| 3463/3741 [20:22:36<1:36:07, 20.75s/it] +2025-05-11 13:48:30 - ERROR - stderr - +2025-05-11 13:48:30 - ERROR - stderr - +2025-05-11 13:48:30 - INFO - stdout - {'loss': 0.4659, 'grad_norm': 0.8462851047515869, 'learning_rate': 2.883543345528328e-07, 'epoch': 2.78} +2025-05-11 13:48:30 - ERROR - stderr - 93%|█████████▎| 3463/3741 [20:22:36<1:36:07, 20.75s/it] +2025-05-11 13:48:51 - ERROR - stderr - 93%|█████████▎| 3464/3741 [20:22:58<1:36:20, 20.87s/it] +2025-05-11 13:48:51 - ERROR - stderr - +2025-05-11 13:48:51 - ERROR - stderr - +2025-05-11 13:48:51 - INFO - stdout - {'loss': 0.4619, 'grad_norm': 0.8520975112915039, 'learning_rate': 2.862935120425614e-07, 'epoch': 2.78} +2025-05-11 13:48:51 - ERROR - stderr - 93%|█████████▎| 3464/3741 [20:22:58<1:36:20, 20.87s/it] +2025-05-11 13:49:11 - ERROR - stderr - 93%|█████████▎| 3465/3741 [20:23:17<1:34:08, 20.47s/it] +2025-05-11 13:49:11 - ERROR - stderr - +2025-05-11 13:49:11 - ERROR - stderr - +2025-05-11 13:49:11 - INFO - stdout - {'loss': 0.4773, 'grad_norm': 0.9390130639076233, 'learning_rate': 2.8423997320022765e-07, 'epoch': 2.78} +2025-05-11 13:49:11 - ERROR - stderr - 93%|█████████▎| 3465/3741 [20:23:17<1:34:08, 20.47s/it] +2025-05-11 13:49:31 - ERROR - stderr - 93%|█████████▎| 3466/3741 [20:23:37<1:32:56, 20.28s/it] +2025-05-11 13:49:31 - ERROR - stderr - +2025-05-11 13:49:31 - ERROR - stderr - +2025-05-11 13:49:31 - INFO - stdout - {'loss': 0.4857, 'grad_norm': 0.9307414293289185, 'learning_rate': 2.821937195656421e-07, 'epoch': 2.78} +2025-05-11 13:49:31 - ERROR - stderr - 93%|█████████▎| 3466/3741 [20:23:37<1:32:56, 20.28s/it] +2025-05-11 13:49:50 - ERROR - stderr - 93%|█████████▎| 3467/3741 [20:23:57<1:31:42, 20.08s/it] +2025-05-11 13:49:50 - ERROR - stderr - +2025-05-11 13:49:50 - ERROR - stderr - +2025-05-11 13:49:50 - INFO - stdout - {'loss': 0.4645, 'grad_norm': 0.9021451473236084, 'learning_rate': 2.801547526731596e-07, 'epoch': 2.78} +2025-05-11 13:49:50 - ERROR - stderr - 93%|█████████▎| 3467/3741 [20:23:57<1:31:42, 20.08s/it] +2025-05-11 13:50:10 - ERROR - stderr - 93%|█████████▎| 3468/3741 [20:24:16<1:30:54, 19.98s/it] +2025-05-11 13:50:10 - ERROR - stderr - +2025-05-11 13:50:10 - ERROR - stderr - +2025-05-11 13:50:10 - INFO - stdout - {'loss': 0.4581, 'grad_norm': 0.8632877469062805, 'learning_rate': 2.781230740516649e-07, 'epoch': 2.78} +2025-05-11 13:50:10 - ERROR - stderr - 93%|█████████▎| 3468/3741 [20:24:16<1:30:54, 19.98s/it] +2025-05-11 13:50:30 - ERROR - stderr - 93%|█████████▎| 3469/3741 [20:24:36<1:30:06, 19.88s/it] +2025-05-11 13:50:30 - ERROR - stderr - +2025-05-11 13:50:30 - ERROR - stderr - +2025-05-11 13:50:30 - INFO - stdout - {'loss': 0.4629, 'grad_norm': 0.8747298717498779, 'learning_rate': 2.760986852245784e-07, 'epoch': 2.78} +2025-05-11 13:50:30 - ERROR - stderr - 93%|█████████▎| 3469/3741 [20:24:36<1:30:06, 19.88s/it] +2025-05-11 13:50:49 - ERROR - stderr - 93%|█████████▎| 3470/3741 [20:24:56<1:29:30, 19.82s/it] +2025-05-11 13:50:49 - ERROR - stderr - +2025-05-11 13:50:49 - ERROR - stderr - +2025-05-11 13:50:49 - INFO - stdout - {'loss': 0.4507, 'grad_norm': 0.8483293652534485, 'learning_rate': 2.7408158770985905e-07, 'epoch': 2.78} +2025-05-11 13:50:49 - ERROR - stderr - 93%|█████████▎| 3470/3741 [20:24:56<1:29:30, 19.82s/it] +2025-05-11 13:51:10 - ERROR - stderr - 93%|█████████▎| 3471/3741 [20:25:16<1:30:11, 20.04s/it] +2025-05-11 13:51:10 - ERROR - stderr - +2025-05-11 13:51:10 - ERROR - stderr - +2025-05-11 13:51:10 - INFO - stdout - {'loss': 0.4687, 'grad_norm': 0.853502094745636, 'learning_rate': 2.720717830199904e-07, 'epoch': 2.78} +2025-05-11 13:51:10 - ERROR - stderr - 93%|█████████▎| 3471/3741 [20:25:16<1:30:11, 20.04s/it] +2025-05-11 13:51:10 - INFO - stdout - WARNING: tokenization mismatch: 3181 vs. 3198. (ignored) +2025-05-11 13:51:30 - ERROR - stderr - 93%|█████████▎| 3472/3741 [20:25:36<1:29:26, 19.95s/it] +2025-05-11 13:51:30 - ERROR - stderr - +2025-05-11 13:51:30 - ERROR - stderr - +2025-05-11 13:51:30 - INFO - stdout - {'loss': 0.5025, 'grad_norm': 0.9311491250991821, 'learning_rate': 2.70069272661998e-07, 'epoch': 2.78} +2025-05-11 13:51:30 - ERROR - stderr - 93%|█████████▎| 3472/3741 [20:25:36<1:29:26, 19.95s/it] +2025-05-11 13:51:51 - ERROR - stderr - 93%|█████████▎| 3473/3741 [20:25:57<1:30:30, 20.26s/it] +2025-05-11 13:51:51 - ERROR - stderr - +2025-05-11 13:51:51 - ERROR - stderr - +2025-05-11 13:51:51 - INFO - stdout - {'loss': 0.4954, 'grad_norm': 0.930825412273407, 'learning_rate': 2.680740581374286e-07, 'epoch': 2.79} +2025-05-11 13:51:51 - ERROR - stderr - 93%|█████████▎| 3473/3741 [20:25:57<1:30:30, 20.26s/it] +2025-05-11 13:52:10 - ERROR - stderr - 93%|█████████▎| 3474/3741 [20:26:17<1:29:29, 20.11s/it] +2025-05-11 13:52:10 - ERROR - stderr - +2025-05-11 13:52:10 - ERROR - stderr - +2025-05-11 13:52:10 - INFO - stdout - {'loss': 0.4692, 'grad_norm': 0.866186797618866, 'learning_rate': 2.6608614094236317e-07, 'epoch': 2.79} +2025-05-11 13:52:10 - ERROR - stderr - 93%|█████████▎| 3474/3741 [20:26:17<1:29:29, 20.11s/it] +2025-05-11 13:52:31 - ERROR - stderr - 93%|█████████▎| 3475/3741 [20:26:37<1:29:47, 20.25s/it] +2025-05-11 13:52:31 - ERROR - stderr - +2025-05-11 13:52:31 - ERROR - stderr - +2025-05-11 13:52:31 - INFO - stdout - {'loss': 0.4558, 'grad_norm': 0.8751170635223389, 'learning_rate': 2.641055225674105e-07, 'epoch': 2.79} +2025-05-11 13:52:31 - ERROR - stderr - 93%|█████████▎| 3475/3741 [20:26:37<1:29:47, 20.25s/it] +2025-05-11 13:52:50 - ERROR - stderr - 93%|█████████▎| 3476/3741 [20:26:57<1:28:33, 20.05s/it] +2025-05-11 13:52:50 - ERROR - stderr - +2025-05-11 13:52:50 - ERROR - stderr - +2025-05-11 13:52:50 - INFO - stdout - {'loss': 0.4459, 'grad_norm': 0.8723040819168091, 'learning_rate': 2.6213220449770373e-07, 'epoch': 2.79} +2025-05-11 13:52:50 - ERROR - stderr - 93%|█████████▎| 3476/3741 [20:26:57<1:28:33, 20.05s/it] +2025-05-11 13:53:11 - ERROR - stderr - 93%|█████████▎| 3477/3741 [20:27:17<1:28:43, 20.16s/it] +2025-05-11 13:53:11 - ERROR - stderr - +2025-05-11 13:53:11 - ERROR - stderr - +2025-05-11 13:53:11 - INFO - stdout - {'loss': 0.4557, 'grad_norm': 0.8558136820793152, 'learning_rate': 2.6016618821290583e-07, 'epoch': 2.79} +2025-05-11 13:53:11 - ERROR - stderr - 93%|█████████▎| 3477/3741 [20:27:17<1:28:43, 20.16s/it] +2025-05-11 13:53:31 - ERROR - stderr - 93%|█████████▎| 3478/3741 [20:27:37<1:27:43, 20.01s/it] +2025-05-11 13:53:31 - ERROR - stderr - +2025-05-11 13:53:31 - ERROR - stderr - +2025-05-11 13:53:31 - INFO - stdout - {'loss': 0.4699, 'grad_norm': 0.8481677770614624, 'learning_rate': 2.5820747518720326e-07, 'epoch': 2.79} +2025-05-11 13:53:31 - ERROR - stderr - 93%|█████████▎| 3478/3741 [20:27:37<1:27:43, 20.01s/it] +2025-05-11 13:53:52 - ERROR - stderr - 93%|█████████▎| 3479/3741 [20:27:59<1:29:31, 20.50s/it] +2025-05-11 13:53:52 - ERROR - stderr - +2025-05-11 13:53:52 - ERROR - stderr - +2025-05-11 13:53:52 - INFO - stdout - {'loss': 0.4521, 'grad_norm': 0.8892841339111328, 'learning_rate': 2.5625606688930107e-07, 'epoch': 2.79} +2025-05-11 13:53:52 - ERROR - stderr - 93%|█████████▎| 3479/3741 [20:27:59<1:29:31, 20.50s/it] +2025-05-11 13:54:12 - ERROR - stderr - 93%|█████████▎| 3480/3741 [20:28:18<1:27:53, 20.21s/it] +2025-05-11 13:54:12 - ERROR - stderr - +2025-05-11 13:54:12 - ERROR - stderr - +2025-05-11 13:54:12 - INFO - stdout - {'loss': 0.4977, 'grad_norm': 0.9031122326850891, 'learning_rate': 2.5431196478243767e-07, 'epoch': 2.79} +2025-05-11 13:54:12 - ERROR - stderr - 93%|█████████▎| 3480/3741 [20:28:18<1:27:53, 20.21s/it] +2025-05-11 13:54:34 - ERROR - stderr - 93%|█████████▎| 3481/3741 [20:28:40<1:29:46, 20.72s/it] +2025-05-11 13:54:34 - ERROR - stderr - +2025-05-11 13:54:34 - ERROR - stderr - +2025-05-11 13:54:34 - INFO - stdout - {'loss': 0.4658, 'grad_norm': 0.8958612680435181, 'learning_rate': 2.5237517032436374e-07, 'epoch': 2.79} +2025-05-11 13:54:34 - ERROR - stderr - 93%|█████████▎| 3481/3741 [20:28:40<1:29:46, 20.72s/it] +2025-05-11 13:54:53 - ERROR - stderr - 93%|█████████▎| 3482/3741 [20:29:00<1:28:01, 20.39s/it] +2025-05-11 13:54:53 - ERROR - stderr - +2025-05-11 13:54:53 - ERROR - stderr - +2025-05-11 13:54:53 - INFO - stdout - {'loss': 0.4532, 'grad_norm': 0.8278775811195374, 'learning_rate': 2.5044568496735534e-07, 'epoch': 2.79} +2025-05-11 13:54:53 - ERROR - stderr - 93%|█████████▎| 3482/3741 [20:29:00<1:28:01, 20.39s/it] +2025-05-11 13:55:15 - ERROR - stderr - 93%|█████████▎| 3483/3741 [20:29:22<1:29:50, 20.89s/it] +2025-05-11 13:55:15 - ERROR - stderr - +2025-05-11 13:55:15 - ERROR - stderr - +2025-05-11 13:55:15 - INFO - stdout - {'loss': 0.4817, 'grad_norm': 0.8773555159568787, 'learning_rate': 2.485235101582051e-07, 'epoch': 2.79} +2025-05-11 13:55:15 - ERROR - stderr - 93%|█████████▎| 3483/3741 [20:29:22<1:29:50, 20.89s/it] +2025-05-11 13:55:35 - ERROR - stderr - 93%|█████████▎| 3484/3741 [20:29:41<1:27:54, 20.52s/it] +2025-05-11 13:55:35 - ERROR - stderr - +2025-05-11 13:55:35 - ERROR - stderr - +2025-05-11 13:55:35 - INFO - stdout - {'loss': 0.4544, 'grad_norm': 0.8488254547119141, 'learning_rate': 2.466086473382234e-07, 'epoch': 2.79} +2025-05-11 13:55:35 - ERROR - stderr - 93%|█████████▎| 3484/3741 [20:29:41<1:27:54, 20.52s/it] +2025-05-11 13:55:57 - ERROR - stderr - 93%|█████████▎| 3485/3741 [20:30:03<1:29:38, 21.01s/it] +2025-05-11 13:55:57 - ERROR - stderr - +2025-05-11 13:55:57 - ERROR - stderr - +2025-05-11 13:55:57 - INFO - stdout - {'loss': 0.4533, 'grad_norm': 0.8559311628341675, 'learning_rate': 2.4470109794324405e-07, 'epoch': 2.79} +2025-05-11 13:55:57 - ERROR - stderr - 93%|█████████▎| 3485/3741 [20:30:03<1:29:38, 21.01s/it] +2025-05-11 13:56:17 - ERROR - stderr - 93%|█████████▎| 3486/3741 [20:30:23<1:27:20, 20.55s/it] +2025-05-11 13:56:17 - ERROR - stderr - +2025-05-11 13:56:17 - ERROR - stderr - +2025-05-11 13:56:17 - INFO - stdout - {'loss': 0.4757, 'grad_norm': 0.9088083505630493, 'learning_rate': 2.4280086340360944e-07, 'epoch': 2.8} +2025-05-11 13:56:17 - ERROR - stderr - 93%|█████████▎| 3486/3741 [20:30:23<1:27:20, 20.55s/it] +2025-05-11 13:56:39 - ERROR - stderr - 93%|█████████▎| 3487/3741 [20:30:45<1:29:00, 21.02s/it] +2025-05-11 13:56:39 - ERROR - stderr - +2025-05-11 13:56:39 - ERROR - stderr - +2025-05-11 13:56:39 - INFO - stdout - {'loss': 0.4424, 'grad_norm': 0.8725008964538574, 'learning_rate': 2.409079451441809e-07, 'epoch': 2.8} +2025-05-11 13:56:39 - ERROR - stderr - 93%|█████████▎| 3487/3741 [20:30:45<1:29:00, 21.02s/it] +2025-05-11 13:56:59 - ERROR - stderr - 93%|█████████▎| 3488/3741 [20:31:05<1:27:10, 20.67s/it] +2025-05-11 13:56:59 - ERROR - stderr - +2025-05-11 13:56:59 - ERROR - stderr - +2025-05-11 13:56:59 - INFO - stdout - {'loss': 0.4581, 'grad_norm': 0.8594610095024109, 'learning_rate': 2.3902234458433315e-07, 'epoch': 2.8} +2025-05-11 13:56:59 - ERROR - stderr - 93%|█████████▎| 3488/3741 [20:31:05<1:27:10, 20.67s/it] +2025-05-11 13:57:21 - ERROR - stderr - 93%|█████████▎| 3489/3741 [20:31:27<1:29:15, 21.25s/it] +2025-05-11 13:57:21 - ERROR - stderr - +2025-05-11 13:57:21 - ERROR - stderr - +2025-05-11 13:57:21 - INFO - stdout - {'loss': 0.4769, 'grad_norm': 0.8405497670173645, 'learning_rate': 2.371440631379529e-07, 'epoch': 2.8} +2025-05-11 13:57:21 - ERROR - stderr - 93%|█████████▎| 3489/3741 [20:31:28<1:29:15, 21.25s/it] +2025-05-11 13:57:41 - ERROR - stderr - 93%|█████████▎| 3490/3741 [20:31:47<1:26:36, 20.70s/it] +2025-05-11 13:57:41 - ERROR - stderr - +2025-05-11 13:57:41 - ERROR - stderr - +2025-05-11 13:57:41 - INFO - stdout - {'loss': 0.4886, 'grad_norm': 0.9169846177101135, 'learning_rate': 2.3527310221344136e-07, 'epoch': 2.8} +2025-05-11 13:57:41 - ERROR - stderr - 93%|█████████▎| 3490/3741 [20:31:47<1:26:36, 20.70s/it] +2025-05-11 13:58:03 - ERROR - stderr - 93%|█████████▎| 3491/3741 [20:32:09<1:28:22, 21.21s/it] +2025-05-11 13:58:03 - ERROR - stderr - +2025-05-11 13:58:03 - ERROR - stderr - +2025-05-11 13:58:03 - INFO - stdout - {'loss': 0.4912, 'grad_norm': 0.9240108132362366, 'learning_rate': 2.334094632137063e-07, 'epoch': 2.8} +2025-05-11 13:58:03 - ERROR - stderr - 93%|█████████▎| 3491/3741 [20:32:09<1:28:22, 21.21s/it] +2025-05-11 13:58:23 - ERROR - stderr - 93%|█████████▎| 3492/3741 [20:32:29<1:25:57, 20.71s/it] +2025-05-11 13:58:23 - ERROR - stderr - +2025-05-11 13:58:23 - ERROR - stderr - +2025-05-11 13:58:23 - INFO - stdout - {'loss': 0.4897, 'grad_norm': 0.853226900100708, 'learning_rate': 2.3155314753616874e-07, 'epoch': 2.8} +2025-05-11 13:58:23 - ERROR - stderr - 93%|█████████▎| 3492/3741 [20:32:29<1:25:57, 20.71s/it] +2025-05-11 13:58:45 - ERROR - stderr - 93%|█████████▎| 3493/3741 [20:32:51<1:27:21, 21.14s/it] +2025-05-11 13:58:45 - ERROR - stderr - +2025-05-11 13:58:45 - ERROR - stderr - +2025-05-11 13:58:45 - INFO - stdout - {'loss': 0.4654, 'grad_norm': 0.863446831703186, 'learning_rate': 2.297041565727598e-07, 'epoch': 2.8} +2025-05-11 13:58:45 - ERROR - stderr - 93%|█████████▎| 3493/3741 [20:32:51<1:27:21, 21.14s/it] +2025-05-11 13:59:04 - ERROR - stderr - 93%|█████████▎| 3494/3741 [20:33:11<1:25:07, 20.68s/it] +2025-05-11 13:59:04 - ERROR - stderr - +2025-05-11 13:59:04 - ERROR - stderr - +2025-05-11 13:59:04 - INFO - stdout - {'loss': 0.4854, 'grad_norm': 0.8915846347808838, 'learning_rate': 2.2786249170991148e-07, 'epoch': 2.8} +2025-05-11 13:59:04 - ERROR - stderr - 93%|█████████▎| 3494/3741 [20:33:11<1:25:07, 20.68s/it] +2025-05-11 13:59:25 - ERROR - stderr - 93%|█████████▎| 3495/3741 [20:33:31<1:24:42, 20.66s/it] +2025-05-11 13:59:25 - ERROR - stderr - +2025-05-11 13:59:25 - ERROR - stderr - +2025-05-11 13:59:25 - INFO - stdout - {'loss': 0.4911, 'grad_norm': 0.8765985369682312, 'learning_rate': 2.260281543285703e-07, 'epoch': 2.8} +2025-05-11 13:59:25 - ERROR - stderr - 93%|█████████▎| 3495/3741 [20:33:31<1:24:42, 20.66s/it] +2025-05-11 13:59:45 - ERROR - stderr - 93%|█████████▎| 3496/3741 [20:33:51<1:23:17, 20.40s/it] +2025-05-11 13:59:45 - ERROR - stderr - +2025-05-11 13:59:45 - ERROR - stderr - +2025-05-11 13:59:45 - INFO - stdout - {'loss': 0.4665, 'grad_norm': 0.8515004515647888, 'learning_rate': 2.2420114580418262e-07, 'epoch': 2.8} +2025-05-11 13:59:45 - ERROR - stderr - 93%|█████████▎| 3496/3741 [20:33:51<1:23:17, 20.40s/it] +2025-05-11 14:00:05 - ERROR - stderr - 93%|█████████▎| 3497/3741 [20:34:11<1:22:16, 20.23s/it] +2025-05-11 14:00:05 - ERROR - stderr - +2025-05-11 14:00:05 - ERROR - stderr - +2025-05-11 14:00:05 - INFO - stdout - {'loss': 0.4608, 'grad_norm': 0.8506413698196411, 'learning_rate': 2.2238146750670264e-07, 'epoch': 2.8} +2025-05-11 14:00:05 - ERROR - stderr - 93%|█████████▎| 3497/3741 [20:34:11<1:22:16, 20.23s/it] +2025-05-11 14:00:25 - ERROR - stderr - 94%|█████████▎| 3498/3741 [20:34:32<1:22:29, 20.37s/it] +2025-05-11 14:00:25 - ERROR - stderr - +2025-05-11 14:00:25 - ERROR - stderr - +2025-05-11 14:00:25 - INFO - stdout - {'loss': 0.4648, 'grad_norm': 0.8681690096855164, 'learning_rate': 2.205691208005889e-07, 'epoch': 2.81} +2025-05-11 14:00:25 - ERROR - stderr - 94%|█████████▎| 3498/3741 [20:34:32<1:22:29, 20.37s/it] +2025-05-11 14:00:45 - ERROR - stderr - 94%|█████████▎| 3499/3741 [20:34:51<1:21:19, 20.16s/it] +2025-05-11 14:00:45 - ERROR - stderr - +2025-05-11 14:00:45 - ERROR - stderr - +2025-05-11 14:00:45 - INFO - stdout - {'loss': 0.4622, 'grad_norm': 0.855263888835907, 'learning_rate': 2.1876410704479767e-07, 'epoch': 2.81} +2025-05-11 14:00:45 - ERROR - stderr - 94%|█████████▎| 3499/3741 [20:34:51<1:21:19, 20.16s/it] +2025-05-11 14:01:05 - ERROR - stderr - 94%|█████████▎| 3500/3741 [20:35:11<1:20:28, 20.04s/it] +2025-05-11 14:01:05 - ERROR - stderr - +2025-05-11 14:01:05 - ERROR - stderr - +2025-05-11 14:01:05 - INFO - stdout - {'loss': 0.4671, 'grad_norm': 0.8927332758903503, 'learning_rate': 2.1696642759279074e-07, 'epoch': 2.81} +2025-05-11 14:01:05 - ERROR - stderr - 94%|█████████▎| 3500/3741 [20:35:11<1:20:28, 20.04s/it] +2025-05-11 14:01:24 - ERROR - stderr - 94%|█████████▎| 3501/3741 [20:35:31<1:19:47, 19.95s/it] +2025-05-11 14:01:24 - ERROR - stderr - +2025-05-11 14:01:24 - ERROR - stderr - +2025-05-11 14:01:24 - INFO - stdout - {'loss': 0.4874, 'grad_norm': 0.8977309465408325, 'learning_rate': 2.1517608379252985e-07, 'epoch': 2.81} +2025-05-11 14:01:24 - ERROR - stderr - 94%|█████████▎| 3501/3741 [20:35:31<1:19:47, 19.95s/it] +2025-05-11 14:01:44 - ERROR - stderr - 94%|█████████▎| 3502/3741 [20:35:50<1:19:07, 19.86s/it] +2025-05-11 14:01:44 - ERROR - stderr - +2025-05-11 14:01:44 - ERROR - stderr - +2025-05-11 14:01:44 - INFO - stdout - {'loss': 0.4645, 'grad_norm': 0.8782375454902649, 'learning_rate': 2.133930769864756e-07, 'epoch': 2.81} +2025-05-11 14:01:44 - ERROR - stderr - 94%|█████████▎| 3502/3741 [20:35:50<1:19:07, 19.86s/it] +2025-05-11 14:02:04 - ERROR - stderr - 94%|█████████▎| 3503/3741 [20:36:10<1:18:45, 19.85s/it] +2025-05-11 14:02:04 - ERROR - stderr - +2025-05-11 14:02:04 - ERROR - stderr - +2025-05-11 14:02:04 - INFO - stdout - {'loss': 0.4574, 'grad_norm': 0.8151038885116577, 'learning_rate': 2.1161740851158742e-07, 'epoch': 2.81} +2025-05-11 14:02:04 - ERROR - stderr - 94%|█████████▎| 3503/3741 [20:36:10<1:18:45, 19.85s/it] +2025-05-11 14:02:26 - ERROR - stderr - 94%|█████████▎| 3504/3741 [20:36:32<1:20:33, 20.39s/it] +2025-05-11 14:02:26 - ERROR - stderr - +2025-05-11 14:02:26 - ERROR - stderr - +2025-05-11 14:02:26 - INFO - stdout - {'loss': 0.4472, 'grad_norm': 0.884947657585144, 'learning_rate': 2.0984907969932134e-07, 'epoch': 2.81} +2025-05-11 14:02:26 - ERROR - stderr - 94%|█████████▎| 3504/3741 [20:36:32<1:20:33, 20.39s/it] +2025-05-11 14:02:45 - ERROR - stderr - 94%|█████████▎| 3505/3741 [20:36:52<1:19:40, 20.26s/it] +2025-05-11 14:02:45 - ERROR - stderr - +2025-05-11 14:02:45 - ERROR - stderr - +2025-05-11 14:02:45 - INFO - stdout - {'loss': 0.4669, 'grad_norm': 0.8524783253669739, 'learning_rate': 2.0808809187563118e-07, 'epoch': 2.81} +2025-05-11 14:02:45 - ERROR - stderr - 94%|█████████▎| 3505/3741 [20:36:52<1:19:40, 20.26s/it] +2025-05-11 14:03:07 - ERROR - stderr - 94%|█████████▎| 3506/3741 [20:37:13<1:20:25, 20.53s/it] +2025-05-11 14:03:07 - ERROR - stderr - +2025-05-11 14:03:07 - ERROR - stderr - +2025-05-11 14:03:07 - INFO - stdout - {'loss': 0.4632, 'grad_norm': 0.8649755120277405, 'learning_rate': 2.063344463609651e-07, 'epoch': 2.81} +2025-05-11 14:03:07 - ERROR - stderr - 94%|█████████▎| 3506/3741 [20:37:13<1:20:25, 20.53s/it] +2025-05-11 14:03:27 - ERROR - stderr - 94%|█████████▎| 3507/3741 [20:37:33<1:19:26, 20.37s/it] +2025-05-11 14:03:27 - ERROR - stderr - +2025-05-11 14:03:27 - ERROR - stderr - +2025-05-11 14:03:27 - INFO - stdout - {'loss': 0.4809, 'grad_norm': 0.849073052406311, 'learning_rate': 2.0458814447026687e-07, 'epoch': 2.81} +2025-05-11 14:03:27 - ERROR - stderr - 94%|█████████▎| 3507/3741 [20:37:33<1:19:26, 20.37s/it] +2025-05-11 14:03:49 - ERROR - stderr - 94%|█████████▍| 3508/3741 [20:37:55<1:21:22, 20.95s/it] +2025-05-11 14:03:49 - ERROR - stderr - +2025-05-11 14:03:49 - ERROR - stderr - +2025-05-11 14:03:49 - INFO - stdout - {'loss': 0.4942, 'grad_norm': 0.872164785861969, 'learning_rate': 2.0284918751297235e-07, 'epoch': 2.81} +2025-05-11 14:03:49 - ERROR - stderr - 94%|█████████▍| 3508/3741 [20:37:55<1:21:22, 20.95s/it] +2025-05-11 14:04:08 - ERROR - stderr - 94%|█████████▍| 3509/3741 [20:38:15<1:19:22, 20.53s/it] +2025-05-11 14:04:09 - ERROR - stderr - +2025-05-11 14:04:09 - ERROR - stderr - +2025-05-11 14:04:09 - INFO - stdout - {'loss': 0.44, 'grad_norm': 0.8174782395362854, 'learning_rate': 2.011175767930118e-07, 'epoch': 2.81} +2025-05-11 14:04:09 - ERROR - stderr - 94%|█████████▍| 3509/3741 [20:38:15<1:19:22, 20.53s/it] +2025-05-11 14:04:31 - ERROR - stderr - 94%|█████████▍| 3510/3741 [20:38:37<1:20:48, 20.99s/it] +2025-05-11 14:04:31 - ERROR - stderr - +2025-05-11 14:04:31 - ERROR - stderr - +2025-05-11 14:04:31 - INFO - stdout - {'loss': 0.4673, 'grad_norm': 0.8372223973274231, 'learning_rate': 1.9939331360880442e-07, 'epoch': 2.81} +2025-05-11 14:04:31 - ERROR - stderr - 94%|█████████▍| 3510/3741 [20:38:37<1:20:48, 20.99s/it] +2025-05-11 14:04:50 - ERROR - stderr - 94%|█████████▍| 3511/3741 [20:38:56<1:18:46, 20.55s/it] +2025-05-11 14:04:50 - ERROR - stderr - +2025-05-11 14:04:50 - ERROR - stderr - +2025-05-11 14:04:50 - INFO - stdout - {'loss': 0.4708, 'grad_norm': 0.8555891513824463, 'learning_rate': 1.9767639925326155e-07, 'epoch': 2.82} +2025-05-11 14:04:50 - ERROR - stderr - 94%|█████████▍| 3511/3741 [20:38:56<1:18:46, 20.55s/it] +2025-05-11 14:05:12 - ERROR - stderr - 94%|█████████▍| 3512/3741 [20:39:18<1:19:47, 20.91s/it] +2025-05-11 14:05:12 - ERROR - stderr - +2025-05-11 14:05:12 - ERROR - stderr - +2025-05-11 14:05:12 - INFO - stdout - {'loss': 0.4717, 'grad_norm': 0.8843154311180115, 'learning_rate': 1.9596683501378666e-07, 'epoch': 2.82} +2025-05-11 14:05:12 - ERROR - stderr - 94%|█████████▍| 3512/3741 [20:39:18<1:19:47, 20.91s/it] +2025-05-11 14:05:31 - ERROR - stderr - 94%|█████████▍| 3513/3741 [20:39:38<1:17:42, 20.45s/it] +2025-05-11 14:05:31 - ERROR - stderr - +2025-05-11 14:05:31 - ERROR - stderr - +2025-05-11 14:05:31 - INFO - stdout - {'loss': 0.4618, 'grad_norm': 0.8556921482086182, 'learning_rate': 1.942646221722655e-07, 'epoch': 2.82} +2025-05-11 14:05:31 - ERROR - stderr - 94%|█████████▍| 3513/3741 [20:39:38<1:17:42, 20.45s/it] +2025-05-11 14:05:53 - ERROR - stderr - 94%|█████████▍| 3514/3741 [20:40:00<1:19:14, 20.95s/it] +2025-05-11 14:05:53 - ERROR - stderr - +2025-05-11 14:05:53 - ERROR - stderr - +2025-05-11 14:05:53 - INFO - stdout - {'loss': 0.4838, 'grad_norm': 0.8737980127334595, 'learning_rate': 1.9256976200507814e-07, 'epoch': 2.82} +2025-05-11 14:05:53 - ERROR - stderr - 94%|█████████▍| 3514/3741 [20:40:00<1:19:14, 20.95s/it] +2025-05-11 14:06:13 - ERROR - stderr - 94%|█████████▍| 3515/3741 [20:40:19<1:17:37, 20.61s/it] +2025-05-11 14:06:13 - ERROR - stderr - +2025-05-11 14:06:13 - ERROR - stderr - +2025-05-11 14:06:13 - INFO - stdout - {'loss': 0.4755, 'grad_norm': 0.8971306085586548, 'learning_rate': 1.9088225578308582e-07, 'epoch': 2.82} +2025-05-11 14:06:13 - ERROR - stderr - 94%|█████████▍| 3515/3741 [20:40:19<1:17:37, 20.61s/it] +2025-05-11 14:06:36 - ERROR - stderr - 94%|█████████▍| 3516/3741 [20:40:42<1:19:47, 21.28s/it] +2025-05-11 14:06:36 - ERROR - stderr - +2025-05-11 14:06:36 - ERROR - stderr - +2025-05-11 14:06:36 - INFO - stdout - {'loss': 0.4624, 'grad_norm': 0.8777005672454834, 'learning_rate': 1.892021047716408e-07, 'epoch': 2.82} +2025-05-11 14:06:36 - ERROR - stderr - 94%|█████████▍| 3516/3741 [20:40:42<1:19:47, 21.28s/it] +2025-05-11 14:06:56 - ERROR - stderr - 94%|█████████▍| 3517/3741 [20:41:03<1:18:23, 21.00s/it] +2025-05-11 14:06:56 - ERROR - stderr - +2025-05-11 14:06:56 - ERROR - stderr - +2025-05-11 14:06:56 - INFO - stdout - {'loss': 0.45, 'grad_norm': 0.844862163066864, 'learning_rate': 1.8752931023057753e-07, 'epoch': 2.82} +2025-05-11 14:06:56 - ERROR - stderr - 94%|█████████▍| 3517/3741 [20:41:03<1:18:23, 21.00s/it] +2025-05-11 14:07:19 - ERROR - stderr - 94%|█████████▍| 3518/3741 [20:41:25<1:19:23, 21.36s/it] +2025-05-11 14:07:19 - ERROR - stderr - +2025-05-11 14:07:19 - ERROR - stderr - +2025-05-11 14:07:19 - INFO - stdout - {'loss': 0.4738, 'grad_norm': 0.8649774789810181, 'learning_rate': 1.858638734142104e-07, 'epoch': 2.82} +2025-05-11 14:07:19 - ERROR - stderr - 94%|█████████▍| 3518/3741 [20:41:25<1:19:23, 21.36s/it] +2025-05-11 14:07:38 - ERROR - stderr - 94%|█████████▍| 3519/3741 [20:41:45<1:17:26, 20.93s/it] +2025-05-11 14:07:38 - ERROR - stderr - +2025-05-11 14:07:38 - ERROR - stderr - +2025-05-11 14:07:38 - INFO - stdout - {'loss': 0.4632, 'grad_norm': 0.8356218934059143, 'learning_rate': 1.842057955713461e-07, 'epoch': 2.82} +2025-05-11 14:07:38 - ERROR - stderr - 94%|█████████▍| 3519/3741 [20:41:45<1:17:26, 20.93s/it] +2025-05-11 14:08:00 - ERROR - stderr - 94%|█████████▍| 3520/3741 [20:42:06<1:17:33, 21.06s/it] +2025-05-11 14:08:00 - ERROR - stderr - +2025-05-11 14:08:00 - ERROR - stderr - +2025-05-11 14:08:00 - INFO - stdout - {'loss': 0.4708, 'grad_norm': 0.8642269968986511, 'learning_rate': 1.8255507794526338e-07, 'epoch': 2.82} +2025-05-11 14:08:00 - ERROR - stderr - 94%|█████████▍| 3520/3741 [20:42:06<1:17:33, 21.06s/it] +2025-05-11 14:08:20 - ERROR - stderr - 94%|█████████▍| 3521/3741 [20:42:26<1:15:48, 20.67s/it] +2025-05-11 14:08:20 - ERROR - stderr - +2025-05-11 14:08:20 - ERROR - stderr - +2025-05-11 14:08:20 - INFO - stdout - {'loss': 0.4649, 'grad_norm': 0.8712006211280823, 'learning_rate': 1.8091172177372994e-07, 'epoch': 2.82} +2025-05-11 14:08:20 - ERROR - stderr - 94%|█████████▍| 3521/3741 [20:42:26<1:15:48, 20.67s/it] +2025-05-11 14:08:41 - ERROR - stderr - 94%|█████████▍| 3522/3741 [20:42:47<1:16:20, 20.92s/it] +2025-05-11 14:08:41 - ERROR - stderr - +2025-05-11 14:08:41 - ERROR - stderr - +2025-05-11 14:08:41 - INFO - stdout - {'loss': 0.4599, 'grad_norm': 0.8947505354881287, 'learning_rate': 1.7927572828898788e-07, 'epoch': 2.82} +2025-05-11 14:08:41 - ERROR - stderr - 94%|█████████▍| 3522/3741 [20:42:47<1:16:20, 20.92s/it] +2025-05-11 14:09:01 - ERROR - stderr - 94%|█████████▍| 3523/3741 [20:43:07<1:14:34, 20.52s/it] +2025-05-11 14:09:01 - ERROR - stderr - +2025-05-11 14:09:01 - ERROR - stderr - +2025-05-11 14:09:01 - INFO - stdout - {'loss': 0.4514, 'grad_norm': 0.8280917406082153, 'learning_rate': 1.776470987177614e-07, 'epoch': 2.83} +2025-05-11 14:09:01 - ERROR - stderr - 94%|█████████▍| 3523/3741 [20:43:07<1:14:34, 20.52s/it] +2025-05-11 14:09:22 - ERROR - stderr - 94%|█████████▍| 3524/3741 [20:43:29<1:15:38, 20.91s/it] +2025-05-11 14:09:23 - ERROR - stderr - +2025-05-11 14:09:23 - ERROR - stderr - +2025-05-11 14:09:23 - INFO - stdout - {'loss': 0.4769, 'grad_norm': 0.8675678968429565, 'learning_rate': 1.7602583428125263e-07, 'epoch': 2.83} +2025-05-11 14:09:23 - ERROR - stderr - 94%|█████████▍| 3524/3741 [20:43:29<1:15:38, 20.91s/it] +2025-05-11 14:09:42 - ERROR - stderr - 94%|█████████▍| 3525/3741 [20:43:48<1:13:38, 20.46s/it] +2025-05-11 14:09:42 - ERROR - stderr - +2025-05-11 14:09:42 - ERROR - stderr - +2025-05-11 14:09:42 - INFO - stdout - {'loss': 0.4635, 'grad_norm': 0.8008211255073547, 'learning_rate': 1.744119361951413e-07, 'epoch': 2.83} +2025-05-11 14:09:42 - ERROR - stderr - 94%|█████████▍| 3525/3741 [20:43:48<1:13:38, 20.46s/it] +2025-05-11 14:10:05 - ERROR - stderr - 94%|█████████▍| 3526/3741 [20:44:12<1:16:41, 21.40s/it] +2025-05-11 14:10:06 - ERROR - stderr - +2025-05-11 14:10:06 - ERROR - stderr - +2025-05-11 14:10:06 - INFO - stdout - {'loss': 0.4531, 'grad_norm': 0.8509759306907654, 'learning_rate': 1.728054056695816e-07, 'epoch': 2.83} +2025-05-11 14:10:06 - ERROR - stderr - 94%|█████████▍| 3526/3741 [20:44:12<1:16:41, 21.40s/it] +2025-05-11 14:10:25 - ERROR - stderr - 94%|█████████▍| 3527/3741 [20:44:31<1:14:23, 20.86s/it] +2025-05-11 14:10:25 - ERROR - stderr - +2025-05-11 14:10:25 - ERROR - stderr - +2025-05-11 14:10:25 - INFO - stdout - {'loss': 0.4622, 'grad_norm': 0.8491147756576538, 'learning_rate': 1.712062439092077e-07, 'epoch': 2.83} +2025-05-11 14:10:25 - ERROR - stderr - 94%|█████████▍| 3527/3741 [20:44:31<1:14:23, 20.86s/it] +2025-05-11 14:10:47 - ERROR - stderr - 94%|█████████▍| 3528/3741 [20:44:54<1:15:40, 21.32s/it] +2025-05-11 14:10:47 - ERROR - stderr - +2025-05-11 14:10:47 - ERROR - stderr - +2025-05-11 14:10:47 - INFO - stdout - {'loss': 0.4772, 'grad_norm': 0.8817112445831299, 'learning_rate': 1.6961445211312265e-07, 'epoch': 2.83} +2025-05-11 14:10:47 - ERROR - stderr - 94%|█████████▍| 3528/3741 [20:44:54<1:15:40, 21.32s/it] +2025-05-11 14:11:07 - ERROR - stderr - 94%|█████████▍| 3529/3741 [20:45:13<1:13:30, 20.81s/it] +2025-05-11 14:11:07 - ERROR - stderr - +2025-05-11 14:11:07 - ERROR - stderr - +2025-05-11 14:11:07 - INFO - stdout - {'loss': 0.4619, 'grad_norm': 0.8488009572029114, 'learning_rate': 1.6803003147490727e-07, 'epoch': 2.83} +2025-05-11 14:11:07 - ERROR - stderr - 94%|█████████▍| 3529/3741 [20:45:13<1:13:30, 20.81s/it] +2025-05-11 14:11:29 - ERROR - stderr - 94%|█████████▍| 3530/3741 [20:45:36<1:14:47, 21.27s/it] +2025-05-11 14:11:29 - ERROR - stderr - +2025-05-11 14:11:29 - ERROR - stderr - +2025-05-11 14:11:29 - INFO - stdout - {'loss': 0.4584, 'grad_norm': 0.8665116429328918, 'learning_rate': 1.6645298318261449e-07, 'epoch': 2.83} +2025-05-11 14:11:29 - ERROR - stderr - 94%|█████████▍| 3530/3741 [20:45:36<1:14:47, 21.27s/it] +2025-05-11 14:11:49 - ERROR - stderr - 94%|█████████▍| 3531/3741 [20:45:56<1:12:56, 20.84s/it] +2025-05-11 14:11:49 - ERROR - stderr - +2025-05-11 14:11:49 - ERROR - stderr - +2025-05-11 14:11:49 - INFO - stdout - {'loss': 0.4742, 'grad_norm': 0.8897413015365601, 'learning_rate': 1.648833084187673e-07, 'epoch': 2.83} +2025-05-11 14:11:49 - ERROR - stderr - 94%|█████████▍| 3531/3741 [20:45:56<1:12:56, 20.84s/it] +2025-05-11 14:12:12 - ERROR - stderr - 94%|█████████▍| 3532/3741 [20:46:18<1:14:04, 21.27s/it] +2025-05-11 14:12:12 - ERROR - stderr - +2025-05-11 14:12:12 - ERROR - stderr - +2025-05-11 14:12:12 - INFO - stdout - {'loss': 0.4718, 'grad_norm': 0.8791276216506958, 'learning_rate': 1.6332100836036425e-07, 'epoch': 2.83} +2025-05-11 14:12:12 - ERROR - stderr - 94%|█████████▍| 3532/3741 [20:46:18<1:14:04, 21.27s/it] +2025-05-11 14:12:31 - ERROR - stderr - 94%|█████████▍| 3533/3741 [20:46:37<1:11:54, 20.74s/it] +2025-05-11 14:12:31 - ERROR - stderr - +2025-05-11 14:12:31 - ERROR - stderr - +2025-05-11 14:12:31 - INFO - stdout - {'loss': 0.4715, 'grad_norm': 0.8625409603118896, 'learning_rate': 1.617660841788682e-07, 'epoch': 2.83} +2025-05-11 14:12:31 - ERROR - stderr - 94%|█████████▍| 3533/3741 [20:46:37<1:11:54, 20.74s/it] +2025-05-11 14:12:54 - ERROR - stderr - 94%|█████████▍| 3534/3741 [20:47:00<1:13:28, 21.30s/it] +2025-05-11 14:12:54 - ERROR - stderr - +2025-05-11 14:12:54 - ERROR - stderr - +2025-05-11 14:12:54 - INFO - stdout - {'loss': 0.5021, 'grad_norm': 0.887173593044281, 'learning_rate': 1.602185370402154e-07, 'epoch': 2.83} +2025-05-11 14:12:54 - ERROR - stderr - 94%|█████████▍| 3534/3741 [20:47:00<1:13:28, 21.30s/it] +2025-05-11 14:13:13 - ERROR - stderr - 94%|█████████▍| 3535/3741 [20:47:20<1:11:26, 20.81s/it] +2025-05-11 14:13:13 - ERROR - stderr - +2025-05-11 14:13:13 - ERROR - stderr - +2025-05-11 14:13:13 - INFO - stdout - {'loss': 0.5014, 'grad_norm': 0.9261402487754822, 'learning_rate': 1.5867836810481095e-07, 'epoch': 2.83} +2025-05-11 14:13:13 - ERROR - stderr - 94%|█████████▍| 3535/3741 [20:47:20<1:11:26, 20.81s/it] +2025-05-11 14:13:36 - ERROR - stderr - 95%|█████████▍| 3536/3741 [20:47:42<1:12:39, 21.27s/it] +2025-05-11 14:13:36 - ERROR - stderr - +2025-05-11 14:13:36 - ERROR - stderr - +2025-05-11 14:13:36 - INFO - stdout - {'loss': 0.4643, 'grad_norm': 0.8724820017814636, 'learning_rate': 1.5714557852752222e-07, 'epoch': 2.84} +2025-05-11 14:13:36 - ERROR - stderr - 95%|█████████▍| 3536/3741 [20:47:42<1:12:39, 21.27s/it] +2025-05-11 14:13:56 - ERROR - stderr - 95%|█████████▍| 3537/3741 [20:48:02<1:11:10, 20.93s/it] +2025-05-11 14:13:56 - ERROR - stderr - +2025-05-11 14:13:56 - ERROR - stderr - +2025-05-11 14:13:56 - INFO - stdout - {'loss': 0.4687, 'grad_norm': 0.8310959339141846, 'learning_rate': 1.5562016945769088e-07, 'epoch': 2.84} +2025-05-11 14:13:56 - ERROR - stderr - 95%|█████████▍| 3537/3741 [20:48:02<1:11:10, 20.93s/it] +2025-05-11 14:14:19 - ERROR - stderr - 95%|█████████▍| 3538/3741 [20:48:25<1:13:17, 21.66s/it] +2025-05-11 14:14:19 - ERROR - stderr - +2025-05-11 14:14:19 - ERROR - stderr - +2025-05-11 14:14:19 - INFO - stdout - {'loss': 0.4681, 'grad_norm': 0.8877079486846924, 'learning_rate': 1.5410214203911754e-07, 'epoch': 2.84} +2025-05-11 14:14:19 - ERROR - stderr - 95%|█████████▍| 3538/3741 [20:48:26<1:13:17, 21.66s/it] +2025-05-11 14:14:39 - ERROR - stderr - 95%|█████████▍| 3539/3741 [20:48:45<1:10:48, 21.03s/it] +2025-05-11 14:14:39 - ERROR - stderr - +2025-05-11 14:14:39 - ERROR - stderr - +2025-05-11 14:14:39 - INFO - stdout - {'loss': 0.4639, 'grad_norm': 0.8687597513198853, 'learning_rate': 1.5259149741007284e-07, 'epoch': 2.84} +2025-05-11 14:14:39 - ERROR - stderr - 95%|█████████▍| 3539/3741 [20:48:45<1:10:48, 21.03s/it] +2025-05-11 14:15:01 - ERROR - stderr - 95%|█████████▍| 3540/3741 [20:49:07<1:11:34, 21.36s/it] +2025-05-11 14:15:01 - ERROR - stderr - +2025-05-11 14:15:01 - ERROR - stderr - +2025-05-11 14:15:01 - INFO - stdout - {'loss': 0.4792, 'grad_norm': 0.8550201058387756, 'learning_rate': 1.5108823670328954e-07, 'epoch': 2.84} +2025-05-11 14:15:01 - ERROR - stderr - 95%|█████████▍| 3540/3741 [20:49:07<1:11:34, 21.36s/it] +2025-05-11 14:15:20 - ERROR - stderr - 95%|█████████▍| 3541/3741 [20:49:27<1:09:25, 20.83s/it] +2025-05-11 14:15:20 - ERROR - stderr - +2025-05-11 14:15:20 - ERROR - stderr - +2025-05-11 14:15:20 - INFO - stdout - {'loss': 0.4758, 'grad_norm': 0.9106943011283875, 'learning_rate': 1.4959236104596265e-07, 'epoch': 2.84} +2025-05-11 14:15:20 - ERROR - stderr - 95%|█████████▍| 3541/3741 [20:49:27<1:09:25, 20.83s/it] +2025-05-11 14:15:44 - ERROR - stderr - 95%|█████████▍| 3542/3741 [20:49:50<1:11:34, 21.58s/it] +2025-05-11 14:15:44 - ERROR - stderr - +2025-05-11 14:15:44 - ERROR - stderr - +2025-05-11 14:15:44 - INFO - stdout - {'loss': 0.4608, 'grad_norm': 0.8858274221420288, 'learning_rate': 1.4810387155975158e-07, 'epoch': 2.84} +2025-05-11 14:15:44 - ERROR - stderr - 95%|█████████▍| 3542/3741 [20:49:50<1:11:34, 21.58s/it] +2025-05-11 14:16:03 - ERROR - stderr - 95%|█████████▍| 3543/3741 [20:50:10<1:09:21, 21.02s/it] +2025-05-11 14:16:04 - ERROR - stderr - +2025-05-11 14:16:04 - ERROR - stderr - +2025-05-11 14:16:04 - INFO - stdout - {'loss': 0.4688, 'grad_norm': 0.8574510216712952, 'learning_rate': 1.466227693607747e-07, 'epoch': 2.84} +2025-05-11 14:16:04 - ERROR - stderr - 95%|█████████▍| 3543/3741 [20:50:10<1:09:21, 21.02s/it] +2025-05-11 14:16:27 - ERROR - stderr - 95%|█████████▍| 3544/3741 [20:50:33<1:11:09, 21.67s/it] +2025-05-11 14:16:27 - ERROR - stderr - +2025-05-11 14:16:27 - ERROR - stderr - +2025-05-11 14:16:27 - INFO - stdout - {'loss': 0.4763, 'grad_norm': 0.8784916400909424, 'learning_rate': 1.4514905555961578e-07, 'epoch': 2.84} +2025-05-11 14:16:27 - ERROR - stderr - 95%|█████████▍| 3544/3741 [20:50:33<1:11:09, 21.67s/it] +2025-05-11 14:16:47 - ERROR - stderr - 95%|█████████��| 3545/3741 [20:50:53<1:08:59, 21.12s/it] +2025-05-11 14:16:47 - ERROR - stderr - +2025-05-11 14:16:47 - ERROR - stderr - +2025-05-11 14:16:47 - INFO - stdout - {'loss': 0.4472, 'grad_norm': 0.863102912902832, 'learning_rate': 1.4368273126131428e-07, 'epoch': 2.84} +2025-05-11 14:16:47 - ERROR - stderr - 95%|█████████▍| 3545/3741 [20:50:53<1:08:59, 21.12s/it] +2025-05-11 14:17:09 - ERROR - stderr - 95%|█████████▍| 3546/3741 [20:51:16<1:10:12, 21.60s/it] +2025-05-11 14:17:09 - ERROR - stderr - +2025-05-11 14:17:09 - ERROR - stderr - +2025-05-11 14:17:09 - INFO - stdout - {'loss': 0.4649, 'grad_norm': 0.8521451354026794, 'learning_rate': 1.4222379756536841e-07, 'epoch': 2.84} +2025-05-11 14:17:09 - ERROR - stderr - 95%|█████████▍| 3546/3741 [20:51:16<1:10:12, 21.60s/it] +2025-05-11 14:17:29 - ERROR - stderr - 95%|█████████▍| 3547/3741 [20:51:36<1:08:20, 21.13s/it] +2025-05-11 14:17:29 - ERROR - stderr - +2025-05-11 14:17:29 - ERROR - stderr - +2025-05-11 14:17:29 - INFO - stdout - {'loss': 0.4707, 'grad_norm': 0.8406957983970642, 'learning_rate': 1.4077225556573872e-07, 'epoch': 2.84} +2025-05-11 14:17:29 - ERROR - stderr - 95%|█████████▍| 3547/3741 [20:51:36<1:08:20, 21.13s/it] +2025-05-11 14:17:52 - ERROR - stderr - 95%|█████████▍| 3548/3741 [20:51:59<1:09:53, 21.73s/it] +2025-05-11 14:17:52 - ERROR - stderr - +2025-05-11 14:17:52 - ERROR - stderr - +2025-05-11 14:17:52 - INFO - stdout - {'loss': 0.4524, 'grad_norm': 0.8615586757659912, 'learning_rate': 1.3932810635083893e-07, 'epoch': 2.85} +2025-05-11 14:17:52 - ERROR - stderr - 95%|█████████▍| 3548/3741 [20:51:59<1:09:53, 21.73s/it] +2025-05-11 14:18:13 - ERROR - stderr - 95%|█████████▍| 3549/3741 [20:52:19<1:07:59, 21.24s/it] +2025-05-11 14:18:13 - ERROR - stderr - +2025-05-11 14:18:13 - ERROR - stderr - +2025-05-11 14:18:13 - INFO - stdout - {'loss': 0.4781, 'grad_norm': 0.8704116344451904, 'learning_rate': 1.378913510035429e-07, 'epoch': 2.85} +2025-05-11 14:18:13 - ERROR - stderr - 95%|█████████▍| 3549/3741 [20:52:19<1:07:59, 21.24s/it] +2025-05-11 14:18:35 - ERROR - stderr - 95%|█████████▍| 3550/3741 [20:52:42<1:09:09, 21.73s/it] +2025-05-11 14:18:35 - ERROR - stderr - +2025-05-11 14:18:35 - ERROR - stderr - +2025-05-11 14:18:35 - INFO - stdout - {'loss': 0.4628, 'grad_norm': 0.8566069602966309, 'learning_rate': 1.3646199060117881e-07, 'epoch': 2.85} +2025-05-11 14:18:35 - ERROR - stderr - 95%|█████████▍| 3550/3741 [20:52:42<1:09:09, 21.73s/it] +2025-05-11 14:18:55 - ERROR - stderr - 95%|█████████▍| 3551/3741 [20:53:02<1:07:04, 21.18s/it] +2025-05-11 14:18:55 - ERROR - stderr - +2025-05-11 14:18:55 - ERROR - stderr - +2025-05-11 14:18:55 - INFO - stdout - {'loss': 0.4675, 'grad_norm': 0.8848322629928589, 'learning_rate': 1.3504002621552937e-07, 'epoch': 2.85} +2025-05-11 14:18:55 - ERROR - stderr - 95%|█████████▍| 3551/3741 [20:53:02<1:07:04, 21.18s/it] +2025-05-11 14:19:18 - ERROR - stderr - 95%|█████████▍| 3552/3741 [20:53:24<1:07:58, 21.58s/it] +2025-05-11 14:19:18 - ERROR - stderr - +2025-05-11 14:19:18 - ERROR - stderr - +2025-05-11 14:19:18 - INFO - stdout - {'loss': 0.4589, 'grad_norm': 0.8653082251548767, 'learning_rate': 1.3362545891283052e-07, 'epoch': 2.85} +2025-05-11 14:19:18 - ERROR - stderr - 95%|█████████▍| 3552/3741 [20:53:24<1:07:58, 21.58s/it] +2025-05-11 14:19:38 - ERROR - stderr - 95%|█████████▍| 3553/3741 [20:53:44<1:06:07, 21.10s/it] +2025-05-11 14:19:38 - ERROR - stderr - +2025-05-11 14:19:38 - ERROR - stderr - +2025-05-11 14:19:38 - INFO - stdout - {'loss': 0.4574, 'grad_norm': 0.8225123286247253, 'learning_rate': 1.3221828975377382e-07, 'epoch': 2.85} +2025-05-11 14:19:38 - ERROR - stderr - 95%|█████████▍| 3553/3741 [20:53:44<1:06:07, 21.10s/it] +2025-05-11 14:20:01 - ERROR - stderr - 95%|█████████▌| 3554/3741 [20:54:07<1:07:27, 21.64s/it] +2025-05-11 14:20:01 - ERROR - stderr - +2025-05-11 14:20:01 - ERROR - stderr - +2025-05-11 14:20:01 - INFO - stdout - {'loss': 0.4877, 'grad_norm': 0.838966965675354, 'learning_rate': 1.3081851979350412e-07, 'epoch': 2.85} +2025-05-11 14:20:01 - ERROR - stderr - 95%|█████████▌| 3554/3741 [20:54:07<1:07:27, 21.64s/it] +2025-05-11 14:20:21 - ERROR - stderr - 95%|█████████▌| 3555/3741 [20:54:27<1:05:30, 21.13s/it] +2025-05-11 14:20:21 - ERROR - stderr - +2025-05-11 14:20:21 - ERROR - stderr - +2025-05-11 14:20:21 - INFO - stdout - {'loss': 0.4686, 'grad_norm': 0.8537912368774414, 'learning_rate': 1.294261500816152e-07, 'epoch': 2.85} +2025-05-11 14:20:21 - ERROR - stderr - 95%|█████████▌| 3555/3741 [20:54:27<1:05:30, 21.13s/it] +2025-05-11 14:20:44 - ERROR - stderr - 95%|█████████▌| 3556/3741 [20:54:50<1:07:04, 21.75s/it] +2025-05-11 14:20:44 - ERROR - stderr - +2025-05-11 14:20:44 - ERROR - stderr - +2025-05-11 14:20:44 - INFO - stdout - {'loss': 0.4968, 'grad_norm': 0.9012706279754639, 'learning_rate': 1.2804118166215297e-07, 'epoch': 2.85} +2025-05-11 14:20:44 - ERROR - stderr - 95%|█████████▌| 3556/3741 [20:54:50<1:07:04, 21.75s/it] +2025-05-11 14:21:03 - ERROR - stderr - 95%|█████████▌| 3557/3741 [20:55:10<1:04:47, 21.13s/it] +2025-05-11 14:21:04 - ERROR - stderr - +2025-05-11 14:21:04 - ERROR - stderr - +2025-05-11 14:21:04 - INFO - stdout - {'loss': 0.474, 'grad_norm': 0.8772068619728088, 'learning_rate': 1.266636155736145e-07, 'epoch': 2.85} +2025-05-11 14:21:04 - ERROR - stderr - 95%|█████████▌| 3557/3741 [20:55:10<1:04:47, 21.13s/it] +2025-05-11 14:21:26 - ERROR - stderr - 95%|█████████▌| 3558/3741 [20:55:32<1:05:21, 21.43s/it] +2025-05-11 14:21:26 - ERROR - stderr - +2025-05-11 14:21:26 - ERROR - stderr - +2025-05-11 14:21:26 - INFO - stdout - {'loss': 0.4907, 'grad_norm': 0.8455215692520142, 'learning_rate': 1.252934528489458e-07, 'epoch': 2.85} +2025-05-11 14:21:26 - ERROR - stderr - 95%|█████████▌| 3558/3741 [20:55:32<1:05:21, 21.43s/it] +2025-05-11 14:21:45 - ERROR - stderr - 95%|█████████▌| 3559/3741 [20:55:52<1:03:19, 20.88s/it] +2025-05-11 14:21:45 - ERROR - stderr - +2025-05-11 14:21:45 - ERROR - stderr - +2025-05-11 14:21:45 - INFO - stdout - {'loss': 0.5056, 'grad_norm': 0.9360870718955994, 'learning_rate': 1.2393069451554163e-07, 'epoch': 2.85} +2025-05-11 14:21:45 - ERROR - stderr - 95%|█████████▌| 3559/3741 [20:55:52<1:03:19, 20.88s/it] +2025-05-11 14:22:08 - ERROR - stderr - 95%|█████████▌| 3560/3741 [20:56:14<1:04:17, 21.31s/it] +2025-05-11 14:22:08 - ERROR - stderr - +2025-05-11 14:22:08 - ERROR - stderr - +2025-05-11 14:22:08 - INFO - stdout - {'loss': 0.4777, 'grad_norm': 0.8860448598861694, 'learning_rate': 1.2257534159524353e-07, 'epoch': 2.85} +2025-05-11 14:22:08 - ERROR - stderr - 95%|█████████▌| 3560/3741 [20:56:14<1:04:17, 21.31s/it] +2025-05-11 14:22:27 - ERROR - stderr - 95%|█████████▌| 3561/3741 [20:56:33<1:02:20, 20.78s/it] +2025-05-11 14:22:27 - ERROR - stderr - +2025-05-11 14:22:27 - ERROR - stderr - +2025-05-11 14:22:27 - INFO - stdout - {'loss': 0.4508, 'grad_norm': 0.8781315088272095, 'learning_rate': 1.21227395104343e-07, 'epoch': 2.86} +2025-05-11 14:22:27 - ERROR - stderr - 95%|█████████▌| 3561/3741 [20:56:33<1:02:20, 20.78s/it] +2025-05-11 14:22:49 - ERROR - stderr - 95%|█████████▌| 3562/3741 [20:56:56<1:03:21, 21.23s/it] +2025-05-11 14:22:49 - ERROR - stderr - +2025-05-11 14:22:49 - ERROR - stderr - +2025-05-11 14:22:49 - INFO - stdout - {'loss': 0.4514, 'grad_norm': 0.8404563069343567, 'learning_rate': 1.1988685605357486e-07, 'epoch': 2.86} +2025-05-11 14:22:49 - ERROR - stderr - 95%|█████████▌| 3562/3741 [20:56:56<1:03:21, 21.23s/it] +2025-05-11 14:23:09 - ERROR - stderr - 95%|█████████▌| 3563/3741 [20:57:15<1:01:19, 20.67s/it] +2025-05-11 14:23:09 - ERROR - stderr - +2025-05-11 14:23:09 - ERROR - stderr - +2025-05-11 14:23:09 - INFO - stdout - {'loss': 0.4616, 'grad_norm': 0.8559548258781433, 'learning_rate': 1.1855372544812172e-07, 'epoch': 2.86} +2025-05-11 14:23:09 - ERROR - stderr - 95%|█████████▌| 3563/3741 [20:57:15<1:01:19, 20.67s/it] +2025-05-11 14:23:32 - ERROR - stderr - 95%|█████████▌| 3564/3741 [20:57:38<1:02:55, 21.33s/it] +2025-05-11 14:23:32 - ERROR - stderr - +2025-05-11 14:23:32 - ERROR - stderr - +2025-05-11 14:23:32 - INFO - stdout - {'loss': 0.4811, 'grad_norm': 0.9076322913169861, 'learning_rate': 1.172280042876106e-07, 'epoch': 2.86} +2025-05-11 14:23:32 - ERROR - stderr - 95%|█████████▌| 3564/3741 [20:57:38<1:02:55, 21.33s/it] +2025-05-11 14:23:52 - ERROR - stderr - 95%|█████████▌| 3565/3741 [20:57:59<1:01:56, 21.12s/it] +2025-05-11 14:23:52 - ERROR - stderr - +2025-05-11 14:23:52 - ERROR - stderr - +2025-05-11 14:23:52 - INFO - stdout - {'loss': 0.4823, 'grad_norm': 0.9246238470077515, 'learning_rate': 1.1590969356611081e-07, 'epoch': 2.86} +2025-05-11 14:23:52 - ERROR - stderr - 95%|█████████▌| 3565/3741 [20:57:59<1:01:56, 21.12s/it] +2025-05-11 14:24:15 - ERROR - stderr - 95%|█████████▌| 3566/3741 [20:58:22<1:03:20, 21.72s/it] +2025-05-11 14:24:15 - ERROR - stderr - +2025-05-11 14:24:15 - ERROR - stderr - +2025-05-11 14:24:15 - INFO - stdout - {'loss': 0.483, 'grad_norm': 0.9094520211219788, 'learning_rate': 1.1459879427213827e-07, 'epoch': 2.86} +2025-05-11 14:24:15 - ERROR - stderr - 95%|█████████▌| 3566/3741 [20:58:22<1:03:20, 21.72s/it] +2025-05-11 14:24:37 - ERROR - stderr - 95%|█████████▌| 3567/3741 [20:58:43<1:02:45, 21.64s/it] +2025-05-11 14:24:37 - ERROR - stderr - +2025-05-11 14:24:37 - ERROR - stderr - +2025-05-11 14:24:37 - INFO - stdout - {'loss': 0.5114, 'grad_norm': 0.8826652765274048, 'learning_rate': 1.1329530738865003e-07, 'epoch': 2.86} +2025-05-11 14:24:37 - ERROR - stderr - 95%|█████████▌| 3567/3741 [20:58:43<1:02:45, 21.64s/it] +2025-05-11 14:25:01 - ERROR - stderr - 95%|█████████▌| 3568/3741 [20:59:07<1:04:36, 22.41s/it] +2025-05-11 14:25:01 - ERROR - stderr - +2025-05-11 14:25:01 - ERROR - stderr - +2025-05-11 14:25:01 - INFO - stdout - {'loss': 0.4441, 'grad_norm': 0.8294800519943237, 'learning_rate': 1.1199923389304201e-07, 'epoch': 2.86} +2025-05-11 14:25:01 - ERROR - stderr - 95%|█████████▌| 3568/3741 [20:59:07<1:04:36, 22.41s/it] +2025-05-11 14:25:24 - ERROR - stderr - 95%|█████████▌| 3569/3741 [20:59:30<1:04:21, 22.45s/it] +2025-05-11 14:25:24 - ERROR - stderr - +2025-05-11 14:25:24 - ERROR - stderr - +2025-05-11 14:25:24 - INFO - stdout - {'loss': 0.4362, 'grad_norm': 0.8477868437767029, 'learning_rate': 1.1071057475715797e-07, 'epoch': 2.86} +2025-05-11 14:25:24 - ERROR - stderr - 95%|█████████▌| 3569/3741 [20:59:30<1:04:21, 22.45s/it] +2025-05-11 14:25:46 - ERROR - stderr - 95%|█████████▌| 3570/3741 [20:59:52<1:03:40, 22.34s/it] +2025-05-11 14:25:46 - ERROR - stderr - +2025-05-11 14:25:46 - ERROR - stderr - +2025-05-11 14:25:46 - INFO - stdout - {'loss': 0.4741, 'grad_norm': 0.8606266379356384, 'learning_rate': 1.0942933094727715e-07, 'epoch': 2.86} +2025-05-11 14:25:46 - ERROR - stderr - 95%|█████████▌| 3570/3741 [20:59:52<1:03:40, 22.34s/it] +2025-05-11 14:26:08 - ERROR - stderr - 95%|█████████▌| 3571/3741 [21:00:14<1:03:02, 22.25s/it] +2025-05-11 14:26:08 - ERROR - stderr - +2025-05-11 14:26:08 - ERROR - stderr - +2025-05-11 14:26:08 - INFO - stdout - {'loss': 0.4655, 'grad_norm': 0.877900242805481, 'learning_rate': 1.0815550342411885e-07, 'epoch': 2.86} +2025-05-11 14:26:08 - ERROR - stderr - 95%|█████████▌| 3571/3741 [21:00:14<1:03:02, 22.25s/it] +2025-05-11 14:26:30 - ERROR - stderr - 95%|█████████▌| 3572/3741 [21:00:36<1:02:25, 22.17s/it] +2025-05-11 14:26:30 - ERROR - stderr - +2025-05-11 14:26:30 - ERROR - stderr - +2025-05-11 14:26:30 - INFO - stdout - {'loss': 0.4838, 'grad_norm': 0.8586121201515198, 'learning_rate': 1.0688909314284346e-07, 'epoch': 2.86} +2025-05-11 14:26:30 - ERROR - stderr - 95%|█████████▌| 3572/3741 [21:00:36<1:02:25, 22.17s/it] +2025-05-11 14:26:52 - ERROR - stderr - 96%|█████████▌| 3573/3741 [21:00:58<1:01:57, 22.13s/it] +2025-05-11 14:26:52 - ERROR - stderr - +2025-05-11 14:26:52 - ERROR - stderr - +2025-05-11 14:26:52 - INFO - stdout - {'loss': 0.4737, 'grad_norm': 0.8372088074684143, 'learning_rate': 1.0563010105304694e-07, 'epoch': 2.87} +2025-05-11 14:26:52 - ERROR - stderr - 96%|█████████▌| 3573/3741 [21:00:58<1:01:57, 22.13s/it] +2025-05-11 14:27:14 - ERROR - stderr - 96%|█████████▌| 3574/3741 [21:01:20<1:01:25, 22.07s/it] +2025-05-11 14:27:14 - ERROR - stderr - +2025-05-11 14:27:14 - ERROR - stderr - +2025-05-11 14:27:14 - INFO - stdout - {'loss': 0.4736, 'grad_norm': 0.8549312353134155, 'learning_rate': 1.0437852809876636e-07, 'epoch': 2.87} +2025-05-11 14:27:14 - ERROR - stderr - 96%|█████████▌| 3574/3741 [21:01:20<1:01:25, 22.07s/it] +2025-05-11 14:27:36 - ERROR - stderr - 96%|█████████▌| 3575/3741 [21:01:42<1:01:00, 22.05s/it] +2025-05-11 14:27:36 - ERROR - stderr - +2025-05-11 14:27:36 - ERROR - stderr - +2025-05-11 14:27:36 - INFO - stdout - {'loss': 0.4589, 'grad_norm': 0.8639695644378662, 'learning_rate': 1.0313437521847325e-07, 'epoch': 2.87} +2025-05-11 14:27:36 - ERROR - stderr - 96%|█████████▌| 3575/3741 [21:01:42<1:01:00, 22.05s/it] +2025-05-11 14:27:57 - ERROR - stderr - 96%|█████████▌| 3576/3741 [21:02:04<1:00:19, 21.94s/it] +2025-05-11 14:27:57 - ERROR - stderr - +2025-05-11 14:27:57 - ERROR - stderr - +2025-05-11 14:27:57 - INFO - stdout - {'loss': 0.4866, 'grad_norm': 0.8854097723960876, 'learning_rate': 1.0189764334507579e-07, 'epoch': 2.87} +2025-05-11 14:27:57 - ERROR - stderr - 96%|█████████▌| 3576/3741 [21:02:04<1:00:19, 21.94s/it] +2025-05-11 14:28:21 - ERROR - stderr - 96%|█████████▌| 3577/3741 [21:02:27<1:01:15, 22.41s/it] +2025-05-11 14:28:21 - ERROR - stderr - +2025-05-11 14:28:21 - ERROR - stderr - +2025-05-11 14:28:21 - INFO - stdout - {'loss': 0.4556, 'grad_norm': 0.8394895792007446, 'learning_rate': 1.0066833340591664e-07, 'epoch': 2.87} +2025-05-11 14:28:21 - ERROR - stderr - 96%|█████████▌| 3577/3741 [21:02:27<1:01:15, 22.41s/it] +2025-05-11 14:28:43 - ERROR - stderr - 96%|█████████▌| 3578/3741 [21:02:49<1:00:21, 22.22s/it] +2025-05-11 14:28:43 - ERROR - stderr - +2025-05-11 14:28:43 - ERROR - stderr - +2025-05-11 14:28:43 - INFO - stdout - {'loss': 0.4882, 'grad_norm': 0.8737897276878357, 'learning_rate': 9.944644632277512e-08, 'epoch': 2.87} +2025-05-11 14:28:43 - ERROR - stderr - 96%|█████████▌| 3578/3741 [21:02:49<1:00:21, 22.22s/it] +2025-05-11 14:29:07 - ERROR - stderr - 96%|█████████▌| 3579/3741 [21:03:13<1:01:41, 22.85s/it] +2025-05-11 14:29:07 - ERROR - stderr - +2025-05-11 14:29:07 - ERROR - stderr - +2025-05-11 14:29:07 - INFO - stdout - {'loss': 0.463, 'grad_norm': 0.9116494655609131, 'learning_rate': 9.823198301186387e-08, 'epoch': 2.87} +2025-05-11 14:29:07 - ERROR - stderr - 96%|█████████▌| 3579/3741 [21:03:13<1:01:41, 22.85s/it] +2025-05-11 14:29:29 - ERROR - stderr - 96%|█████████▌| 3580/3741 [21:03:35<1:00:47, 22.65s/it] +2025-05-11 14:29:29 - ERROR - stderr - +2025-05-11 14:29:29 - ERROR - stderr - +2025-05-11 14:29:29 - INFO - stdout - {'loss': 0.4651, 'grad_norm': 0.8393471837043762, 'learning_rate': 9.702494438383003e-08, 'epoch': 2.87} +2025-05-11 14:29:29 - ERROR - stderr - 96%|█████████▌| 3580/3741 [21:03:35<1:00:47, 22.65s/it] +2025-05-11 14:29:51 - ERROR - stderr - 96%|█████████▌| 3581/3741 [21:03:57<59:53, 22.46s/it] +2025-05-11 14:29:51 - ERROR - stderr - +2025-05-11 14:29:51 - ERROR - stderr - +2025-05-11 14:29:51 - INFO - stdout - {'loss': 0.4699, 'grad_norm': 0.8796486258506775, 'learning_rate': 9.582533134374849e-08, 'epoch': 2.87} +2025-05-11 14:29:51 - ERROR - stderr - 96%|█████████▌| 3581/3741 [21:03:57<59:53, 22.46s/it] +2025-05-11 14:30:13 - ERROR - stderr - 96%|█████████▌| 3582/3741 [21:04:19<58:57, 22.25s/it] +2025-05-11 14:30:13 - ERROR - stderr - +2025-05-11 14:30:13 - ERROR - stderr - +2025-05-11 14:30:13 - INFO - stdout - {'loss': 0.4686, 'grad_norm': 0.8335583209991455, 'learning_rate': 9.463314479113416e-08, 'epoch': 2.87} +2025-05-11 14:30:13 - ERROR - stderr - 96%|█████████▌| 3582/3741 [21:04:19<58:57, 22.25s/it] +2025-05-11 14:30:35 - ERROR - stderr - 96%|█████████▌| 3583/3741 [21:04:41<58:10, 22.09s/it] +2025-05-11 14:30:35 - ERROR - stderr - +2025-05-11 14:30:35 - ERROR - stderr - +2025-05-11 14:30:35 - INFO - stdout - {'loss': 0.4928, 'grad_norm': 0.8907720446586609, 'learning_rate': 9.344838561992642e-08, 'epoch': 2.87} +2025-05-11 14:30:35 - ERROR - stderr - 96%|█████████▌| 3583/3741 [21:04:41<58:10, 22.09s/it] +2025-05-11 14:30:56 - ERROR - stderr - 96%|█████████▌| 3584/3741 [21:05:03<57:39, 22.04s/it] +2025-05-11 14:30:57 - ERROR - stderr - +2025-05-11 14:30:57 - ERROR - stderr - +2025-05-11 14:30:57 - INFO - stdout - {'loss': 0.4846, 'grad_norm': 0.8920674920082092, 'learning_rate': 9.227105471849795e-08, 'epoch': 2.87} +2025-05-11 14:30:57 - ERROR - stderr - 96%|█████████▌| 3584/3741 [21:05:03<57:39, 22.04s/it] +2025-05-11 14:31:18 - ERROR - stderr - 96%|█████████▌| 3585/3741 [21:05:24<56:56, 21.90s/it] +2025-05-11 14:31:18 - ERROR - stderr - +2025-05-11 14:31:18 - ERROR - stderr - +2025-05-11 14:31:18 - INFO - stdout - {'loss': 0.4715, 'grad_norm': 0.8768170475959778, 'learning_rate': 9.110115296965482e-08, 'epoch': 2.87} +2025-05-11 14:31:18 - ERROR - stderr - 96%|█████████▌| 3585/3741 [21:05:24<56:56, 21.90s/it] +2025-05-11 14:31:40 - ERROR - stderr - 96%|█████████▌| 3586/3741 [21:05:47<56:47, 21.98s/it] +2025-05-11 14:31:40 - ERROR - stderr - +2025-05-11 14:31:40 - ERROR - stderr - +2025-05-11 14:31:40 - INFO - stdout - {'loss': 0.4579, 'grad_norm': 0.8467321395874023, 'learning_rate': 8.993868125062533e-08, 'epoch': 2.88} +2025-05-11 14:31:40 - ERROR - stderr - 96%|█████████▌| 3586/3741 [21:05:47<56:47, 21.98s/it] +2025-05-11 14:32:02 - ERROR - stderr - 96%|█████████▌| 3587/3741 [21:06:08<56:11, 21.89s/it] +2025-05-11 14:32:02 - ERROR - stderr - +2025-05-11 14:32:02 - ERROR - stderr - +2025-05-11 14:32:02 - INFO - stdout - {'loss': 0.4825, 'grad_norm': 0.9148172736167908, 'learning_rate': 8.87836404330722e-08, 'epoch': 2.88} +2025-05-11 14:32:02 - ERROR - stderr - 96%|█████████▌| 3587/3741 [21:06:08<56:11, 21.89s/it] +2025-05-11 14:32:26 - ERROR - stderr - 96%|█████████▌| 3588/3741 [21:06:33<57:42, 22.63s/it] +2025-05-11 14:32:26 - ERROR - stderr - +2025-05-11 14:32:26 - ERROR - stderr - +2025-05-11 14:32:26 - INFO - stdout - {'loss': 0.4647, 'grad_norm': 0.8339017629623413, 'learning_rate': 8.763603138308485e-08, 'epoch': 2.88} +2025-05-11 14:32:26 - ERROR - stderr - 96%|█████████▌| 3588/3741 [21:06:33<57:42, 22.63s/it] +2025-05-11 14:32:48 - ERROR - stderr - 96%|█████████▌| 3589/3741 [21:06:54<56:42, 22.38s/it] +2025-05-11 14:32:48 - ERROR - stderr - +2025-05-11 14:32:48 - ERROR - stderr - +2025-05-11 14:32:48 - INFO - stdout - {'loss': 0.4491, 'grad_norm': 0.8492835760116577, 'learning_rate': 8.64958549611783e-08, 'epoch': 2.88} +2025-05-11 14:32:48 - ERROR - stderr - 96%|█████████▌| 3589/3741 [21:06:54<56:42, 22.38s/it] +2025-05-11 14:33:11 - ERROR - stderr - 96%|█████████▌| 3590/3741 [21:07:18<57:04, 22.68s/it] +2025-05-11 14:33:11 - ERROR - stderr - +2025-05-11 14:33:11 - ERROR - stderr - +2025-05-11 14:33:11 - INFO - stdout - {'loss': 0.484, 'grad_norm': 0.8415125608444214, 'learning_rate': 8.536311202229641e-08, 'epoch': 2.88} +2025-05-11 14:33:11 - ERROR - stderr - 96%|█████████▌| 3590/3741 [21:07:18<57:04, 22.68s/it] +2025-05-11 14:33:34 - ERROR - stderr - 96%|█████████▌| 3591/3741 [21:07:40<56:31, 22.61s/it] +2025-05-11 14:33:34 - ERROR - stderr - +2025-05-11 14:33:34 - ERROR - stderr - +2025-05-11 14:33:34 - INFO - stdout - {'loss': 0.4564, 'grad_norm': 0.8640526533126831, 'learning_rate': 8.423780341580756e-08, 'epoch': 2.88} +2025-05-11 14:33:34 - ERROR - stderr - 96%|█████████▌| 3591/3741 [21:07:40<56:31, 22.61s/it] +2025-05-11 14:33:56 - ERROR - stderr - 96%|█████████▌| 3592/3741 [21:08:02<55:39, 22.41s/it] +2025-05-11 14:33:56 - ERROR - stderr - +2025-05-11 14:33:56 - ERROR - stderr - +2025-05-11 14:33:56 - INFO - stdout - {'loss': 0.4632, 'grad_norm': 0.8569619655609131, 'learning_rate': 8.311992998550789e-08, 'epoch': 2.88} +2025-05-11 14:33:56 - ERROR - stderr - 96%|█████████▌| 3592/3741 [21:08:02<55:39, 22.41s/it] +2025-05-11 14:34:17 - ERROR - stderr - 96%|█████████▌| 3593/3741 [21:08:24<54:36, 22.14s/it] +2025-05-11 14:34:17 - ERROR - stderr - +2025-05-11 14:34:17 - ERROR - stderr - +2025-05-11 14:34:17 - INFO - stdout - {'loss': 0.4823, 'grad_norm': 0.8628082871437073, 'learning_rate': 8.200949256961687e-08, 'epoch': 2.88} +2025-05-11 14:34:17 - ERROR - stderr - 96%|█████████▌| 3593/3741 [21:08:24<54:36, 22.14s/it] +2025-05-11 14:34:39 - ERROR - stderr - 96%|█████████▌| 3594/3741 [21:08:45<54:00, 22.04s/it] +2025-05-11 14:34:39 - ERROR - stderr - +2025-05-11 14:34:39 - ERROR - stderr - +2025-05-11 14:34:39 - INFO - stdout - {'loss': 0.4679, 'grad_norm': 0.8405731916427612, 'learning_rate': 8.090649200077627e-08, 'epoch': 2.88} +2025-05-11 14:34:39 - ERROR - stderr - 96%|█████████▌| 3594/3741 [21:08:46<54:00, 22.04s/it] +2025-05-11 14:35:01 - ERROR - stderr - 96%|█████████▌| 3595/3741 [21:09:07<53:33, 22.01s/it] +2025-05-11 14:35:01 - ERROR - stderr - +2025-05-11 14:35:01 - ERROR - stderr - +2025-05-11 14:35:01 - INFO - stdout - {'loss': 0.4371, 'grad_norm': 0.831079363822937, 'learning_rate': 7.98109291060567e-08, 'epoch': 2.88} +2025-05-11 14:35:01 - ERROR - stderr - 96%|█████████▌| 3595/3741 [21:09:07<53:33, 22.01s/it] +2025-05-11 14:35:23 - ERROR - stderr - 96%|█████████▌| 3596/3741 [21:09:29<52:53, 21.89s/it] +2025-05-11 14:35:23 - ERROR - stderr - +2025-05-11 14:35:23 - ERROR - stderr - +2025-05-11 14:35:23 - INFO - stdout - {'loss': 0.4744, 'grad_norm': 0.8602555990219116, 'learning_rate': 7.872280470694549e-08, 'epoch': 2.88} +2025-05-11 14:35:23 - ERROR - stderr - 96%|█████████▌| 3596/3741 [21:09:29<52:53, 21.89s/it] +2025-05-11 14:35:45 - ERROR - stderr - 96%|█████████▌| 3597/3741 [21:09:51<52:30, 21.88s/it] +2025-05-11 14:35:45 - ERROR - stderr - +2025-05-11 14:35:45 - ERROR - stderr - +2025-05-11 14:35:45 - INFO - stdout - {'loss': 0.4517, 'grad_norm': 0.8295644521713257, 'learning_rate': 7.764211961935664e-08, 'epoch': 2.88} +2025-05-11 14:35:45 - ERROR - stderr - 96%|█████████▌| 3597/3741 [21:09:51<52:30, 21.88s/it] +2025-05-11 14:36:06 - ERROR - stderr - 96%|█████████▌| 3598/3741 [21:10:13<52:06, 21.86s/it] +2025-05-11 14:36:06 - ERROR - stderr - +2025-05-11 14:36:06 - ERROR - stderr - +2025-05-11 14:36:06 - INFO - stdout - {'loss': 0.4617, 'grad_norm': 0.8858298659324646, 'learning_rate': 7.656887465362528e-08, 'epoch': 2.89} +2025-05-11 14:36:06 - ERROR - stderr - 96%|█████████▌| 3598/3741 [21:10:13<52:06, 21.86s/it] +2025-05-11 14:36:30 - ERROR - stderr - 96%|█████████▌| 3599/3741 [21:10:36<52:50, 22.33s/it] +2025-05-11 14:36:30 - ERROR - stderr - +2025-05-11 14:36:30 - ERROR - stderr - +2025-05-11 14:36:30 - INFO - stdout - {'loss': 0.474, 'grad_norm': 0.8486381769180298, 'learning_rate': 7.550307061450546e-08, 'epoch': 2.89} +2025-05-11 14:36:30 - ERROR - stderr - 96%|█████████▌| 3599/3741 [21:10:36<52:50, 22.33s/it] +2025-05-11 14:36:52 - ERROR - stderr - 96%|█████████▌| 3600/3741 [21:10:58<52:13, 22.22s/it] +2025-05-11 14:36:52 - ERROR - stderr - +2025-05-11 14:36:52 - ERROR - stderr - +2025-05-11 14:36:52 - INFO - stdout - {'loss': 0.4681, 'grad_norm': 0.8799077272415161, 'learning_rate': 7.444470830117456e-08, 'epoch': 2.89} +2025-05-11 14:36:52 - ERROR - stderr - 96%|█████████▌| 3600/3741 [21:10:58<52:13, 22.22s/it] +2025-05-11 14:37:14 - ERROR - stderr - 96%|█████████▋| 3601/3741 [21:11:21<52:04, 22.32s/it] +2025-05-11 14:37:14 - ERROR - stderr - +2025-05-11 14:37:14 - ERROR - stderr - +2025-05-11 14:37:14 - INFO - stdout - {'loss': 0.4839, 'grad_norm': 0.8846973776817322, 'learning_rate': 7.339378850722889e-08, 'epoch': 2.89} +2025-05-11 14:37:14 - ERROR - stderr - 96%|█████████▋| 3601/3741 [21:11:21<52:04, 22.32s/it] +2025-05-11 14:37:36 - ERROR - stderr - 96%|█████████▋| 3602/3741 [21:11:43<51:27, 22.21s/it] +2025-05-11 14:37:36 - ERROR - stderr - +2025-05-11 14:37:36 - ERROR - stderr - +2025-05-11 14:37:36 - INFO - stdout - {'loss': 0.4527, 'grad_norm': 0.8882151246070862, 'learning_rate': 7.235031202068255e-08, 'epoch': 2.89} +2025-05-11 14:37:36 - ERROR - stderr - 96%|█████████▋| 3602/3741 [21:11:43<51:27, 22.21s/it] +2025-05-11 14:37:58 - ERROR - stderr - 96%|█████████▋| 3603/3741 [21:12:05<50:55, 22.14s/it] +2025-05-11 14:37:58 - ERROR - stderr - +2025-05-11 14:37:58 - ERROR - stderr - +2025-05-11 14:37:58 - INFO - stdout - {'loss': 0.5004, 'grad_norm': 0.8758078813552856, 'learning_rate': 7.131427962397076e-08, 'epoch': 2.89} +2025-05-11 14:37:58 - ERROR - stderr - 96%|█████████▋| 3603/3741 [21:12:05<50:55, 22.14s/it] +2025-05-11 14:38:21 - ERROR - stderr - 96%|█████████▋| 3604/3741 [21:12:27<50:38, 22.18s/it] +2025-05-11 14:38:21 - ERROR - stderr - +2025-05-11 14:38:21 - ERROR - stderr - +2025-05-11 14:38:21 - INFO - stdout - {'loss': 0.4439, 'grad_norm': 0.9011175632476807, 'learning_rate': 7.028569209394653e-08, 'epoch': 2.89} +2025-05-11 14:38:21 - ERROR - stderr - 96%|█████████▋| 3604/3741 [21:12:27<50:38, 22.18s/it] +2025-05-11 14:38:42 - ERROR - stderr - 96%|█████████▋| 3605/3741 [21:12:49<50:06, 22.10s/it] +2025-05-11 14:38:42 - ERROR - stderr - +2025-05-11 14:38:42 - ERROR - stderr - +2025-05-11 14:38:42 - INFO - stdout - {'loss': 0.5111, 'grad_norm': 0.8999505043029785, 'learning_rate': 6.92645502018785e-08, 'epoch': 2.89} +2025-05-11 14:38:42 - ERROR - stderr - 96%|█████████▋| 3605/3741 [21:12:49<50:06, 22.10s/it] +2025-05-11 14:39:05 - ERROR - stderr - 96%|█████████▋| 3606/3741 [21:13:11<49:42, 22.09s/it] +2025-05-11 14:39:05 - ERROR - stderr - +2025-05-11 14:39:05 - ERROR - stderr - +2025-05-11 14:39:05 - INFO - stdout - {'loss': 0.4621, 'grad_norm': 0.9070544242858887, 'learning_rate': 6.825085471345416e-08, 'epoch': 2.89} +2025-05-11 14:39:05 - ERROR - stderr - 96%|█████████▋| 3606/3741 [21:13:11<49:42, 22.09s/it] +2025-05-11 14:39:27 - ERROR - stderr - 96%|█████████▋| 3607/3741 [21:13:33<49:18, 22.08s/it] +2025-05-11 14:39:27 - ERROR - stderr - +2025-05-11 14:39:27 - ERROR - stderr - +2025-05-11 14:39:27 - INFO - stdout - {'loss': 0.4679, 'grad_norm': 0.8917433619499207, 'learning_rate': 6.724460638877661e-08, 'epoch': 2.89} +2025-05-11 14:39:27 - ERROR - stderr - 96%|█████████▋| 3607/3741 [21:13:33<49:18, 22.08s/it] +2025-05-11 14:39:50 - ERROR - stderr - 96%|█████████▋| 3608/3741 [21:13:56<49:44, 22.44s/it] +2025-05-11 14:39:50 - ERROR - stderr - +2025-05-11 14:39:50 - ERROR - stderr - +2025-05-11 14:39:50 - INFO - stdout - {'loss': 0.4559, 'grad_norm': 0.8720741868019104, 'learning_rate': 6.624580598236563e-08, 'epoch': 2.89} +2025-05-11 14:39:50 - ERROR - stderr - 96%|█████████▋| 3608/3741 [21:13:56<49:44, 22.44s/it] +2025-05-11 14:40:13 - ERROR - stderr - 96%|█████████▋| 3609/3741 [21:14:19<49:31, 22.51s/it] +2025-05-11 14:40:13 - ERROR - stderr - +2025-05-11 14:40:13 - ERROR - stderr - +2025-05-11 14:40:13 - INFO - stdout - {'loss': 0.4542, 'grad_norm': 0.8378724455833435, 'learning_rate': 6.525445424315546e-08, 'epoch': 2.89} +2025-05-11 14:40:13 - ERROR - stderr - 96%|█████████▋| 3609/3741 [21:14:19<49:31, 22.51s/it] +2025-05-11 14:40:35 - ERROR - stderr - 96%|█████████▋| 3610/3741 [21:14:41<49:04, 22.48s/it] +2025-05-11 14:40:35 - ERROR - stderr - +2025-05-11 14:40:35 - ERROR - stderr - +2025-05-11 14:40:35 - INFO - stdout - {'loss': 0.4639, 'grad_norm': 0.9167259335517883, 'learning_rate': 6.427055191449483e-08, 'epoch': 2.89} +2025-05-11 14:40:35 - ERROR - stderr - 96%|█████████▋| 3610/3741 [21:14:41<49:04, 22.48s/it] +2025-05-11 14:40:57 - ERROR - stderr - 97%|█████████▋| 3611/3741 [21:15:04<48:37, 22.44s/it] +2025-05-11 14:40:57 - ERROR - stderr - +2025-05-11 14:40:57 - ERROR - stderr - +2025-05-11 14:40:57 - INFO - stdout - {'loss': 0.4829, 'grad_norm': 0.9500483274459839, 'learning_rate': 6.329409973414913e-08, 'epoch': 2.9} +2025-05-11 14:40:57 - ERROR - stderr - 97%|███████��█▋| 3611/3741 [21:15:04<48:37, 22.44s/it] +2025-05-11 14:41:19 - ERROR - stderr - 97%|█████████▋| 3612/3741 [21:15:25<47:51, 22.26s/it] +2025-05-11 14:41:19 - ERROR - stderr - +2025-05-11 14:41:19 - ERROR - stderr - +2025-05-11 14:41:19 - INFO - stdout - {'loss': 0.4823, 'grad_norm': 0.8519693613052368, 'learning_rate': 6.23250984342938e-08, 'epoch': 2.9} +2025-05-11 14:41:19 - ERROR - stderr - 97%|█████████▋| 3612/3741 [21:15:25<47:51, 22.26s/it] +2025-05-11 14:41:41 - ERROR - stderr - 97%|█████████▋| 3613/3741 [21:15:47<47:16, 22.16s/it] +2025-05-11 14:41:41 - ERROR - stderr - +2025-05-11 14:41:41 - ERROR - stderr - +2025-05-11 14:41:41 - INFO - stdout - {'loss': 0.4694, 'grad_norm': 0.8825336694717407, 'learning_rate': 6.136354874151874e-08, 'epoch': 2.9} +2025-05-11 14:41:41 - ERROR - stderr - 97%|█████████▋| 3613/3741 [21:15:47<47:16, 22.16s/it] +2025-05-11 14:41:42 - INFO - stdout - WARNING: tokenization mismatch: 1 vs. 3133. (ignored) +2025-05-11 14:42:03 - ERROR - stderr - 97%|█████████▋| 3614/3741 [21:16:10<46:53, 22.15s/it] +2025-05-11 14:42:03 - ERROR - stderr - +2025-05-11 14:42:03 - ERROR - stderr - +2025-05-11 14:42:03 - INFO - stdout - {'loss': 0.468, 'grad_norm': 0.8310946226119995, 'learning_rate': 6.04094513768283e-08, 'epoch': 2.9} +2025-05-11 14:42:03 - ERROR - stderr - 97%|█████████▋| 3614/3741 [21:16:10<46:53, 22.15s/it] +2025-05-11 14:42:27 - ERROR - stderr - 97%|█████████▋| 3615/3741 [21:16:34<47:51, 22.79s/it] +2025-05-11 14:42:27 - ERROR - stderr - +2025-05-11 14:42:27 - ERROR - stderr - +2025-05-11 14:42:27 - INFO - stdout - {'loss': 0.4555, 'grad_norm': 0.7865362763404846, 'learning_rate': 5.9462807055635787e-08, 'epoch': 2.9} +2025-05-11 14:42:27 - ERROR - stderr - 97%|█████████▋| 3615/3741 [21:16:34<47:51, 22.79s/it] +2025-05-11 14:42:49 - ERROR - stderr - 97%|█████████▋| 3616/3741 [21:16:55<46:44, 22.44s/it] +2025-05-11 14:42:49 - ERROR - stderr - +2025-05-11 14:42:49 - ERROR - stderr - +2025-05-11 14:42:49 - INFO - stdout - {'loss': 0.4533, 'grad_norm': 0.8779739141464233, 'learning_rate': 5.852361648776672e-08, 'epoch': 2.9} +2025-05-11 14:42:49 - ERROR - stderr - 97%|█████████▋| 3616/3741 [21:16:55<46:44, 22.44s/it] +2025-05-11 14:43:11 - ERROR - stderr - 97%|█████████▋| 3617/3741 [21:17:17<45:54, 22.21s/it] +2025-05-11 14:43:11 - ERROR - stderr - +2025-05-11 14:43:11 - ERROR - stderr - +2025-05-11 14:43:11 - INFO - stdout - {'loss': 0.4784, 'grad_norm': 0.9257890582084656, 'learning_rate': 5.7591880377459995e-08, 'epoch': 2.9} +2025-05-11 14:43:11 - ERROR - stderr - 97%|█████████▋| 3617/3741 [21:17:17<45:54, 22.21s/it] +2025-05-11 14:43:33 - ERROR - stderr - 97%|█████████▋| 3618/3741 [21:17:39<45:31, 22.21s/it] +2025-05-11 14:43:33 - ERROR - stderr - +2025-05-11 14:43:33 - ERROR - stderr - +2025-05-11 14:43:33 - INFO - stdout - {'loss': 0.4619, 'grad_norm': 0.8655847907066345, 'learning_rate': 5.666759942336231e-08, 'epoch': 2.9} +2025-05-11 14:43:33 - ERROR - stderr - 97%|█████████▋| 3618/3741 [21:17:39<45:31, 22.21s/it] +2025-05-11 14:43:54 - ERROR - stderr - 97%|█████████▋| 3619/3741 [21:18:01<44:43, 22.00s/it] +2025-05-11 14:43:54 - ERROR - stderr - +2025-05-11 14:43:54 - ERROR - stderr - +2025-05-11 14:43:54 - INFO - stdout - {'loss': 0.4821, 'grad_norm': 0.9047146439552307, 'learning_rate': 5.5750774318531486e-08, 'epoch': 2.9} +2025-05-11 14:43:54 - ERROR - stderr - 97%|█████████▋| 3619/3741 [21:18:01<44:43, 22.00s/it] +2025-05-11 14:44:16 - ERROR - stderr - 97%|█████████▋| 3620/3741 [21:18:23<44:21, 22.00s/it] +2025-05-11 14:44:17 - ERROR - stderr - +2025-05-11 14:44:17 - ERROR - stderr - +2025-05-11 14:44:17 - INFO - stdout - {'loss': 0.4506, 'grad_norm': 0.8486894965171814, 'learning_rate': 5.4841405750433175e-08, 'epoch': 2.9} +2025-05-11 14:44:17 - ERROR - stderr - 97%|█████████▋| 3620/3741 [21:18:23<44:21, 22.00s/it] +2025-05-11 14:44:38 - ERROR - stderr - 97%|█████████▋| 3621/3741 [21:18:45<43:56, 21.97s/it] +2025-05-11 14:44:38 - ERROR - stderr - +2025-05-11 14:44:38 - ERROR - stderr - +2025-05-11 14:44:38 - INFO - stdout - {'loss': 0.4725, 'grad_norm': 0.9041795134544373, 'learning_rate': 5.393949440094415e-08, 'epoch': 2.9} +2025-05-11 14:44:38 - ERROR - stderr - 97%|█████████▋| 3621/3741 [21:18:45<43:56, 21.97s/it] +2025-05-11 14:45:01 - ERROR - stderr - 97%|█████████▋| 3622/3741 [21:19:08<44:05, 22.23s/it] +2025-05-11 14:45:01 - ERROR - stderr - +2025-05-11 14:45:01 - ERROR - stderr - +2025-05-11 14:45:01 - INFO - stdout - {'loss': 0.5029, 'grad_norm': 0.8865926265716553, 'learning_rate': 5.304504094634677e-08, 'epoch': 2.9} +2025-05-11 14:45:01 - ERROR - stderr - 97%|███████���█▋| 3622/3741 [21:19:08<44:05, 22.23s/it] +2025-05-11 14:45:23 - ERROR - stderr - 97%|█████████▋| 3623/3741 [21:19:30<43:41, 22.21s/it] +2025-05-11 14:45:23 - ERROR - stderr - +2025-05-11 14:45:23 - ERROR - stderr - +2025-05-11 14:45:23 - INFO - stdout - {'loss': 0.4617, 'grad_norm': 0.8497272729873657, 'learning_rate': 5.2158046057333434e-08, 'epoch': 2.91} +2025-05-11 14:45:23 - ERROR - stderr - 97%|█████████▋| 3623/3741 [21:19:30<43:41, 22.21s/it] +2025-05-11 14:45:47 - ERROR - stderr - 97%|█████████▋| 3624/3741 [21:19:54<44:24, 22.78s/it] +2025-05-11 14:45:48 - ERROR - stderr - +2025-05-11 14:45:48 - ERROR - stderr - +2025-05-11 14:45:48 - INFO - stdout - {'loss': 0.4697, 'grad_norm': 0.8616043925285339, 'learning_rate': 5.1278510399004334e-08, 'epoch': 2.91} +2025-05-11 14:45:48 - ERROR - stderr - 97%|█████████▋| 3624/3741 [21:19:54<44:24, 22.78s/it] +2025-05-11 14:46:09 - ERROR - stderr - 97%|█████████▋| 3625/3741 [21:20:16<43:32, 22.52s/it] +2025-05-11 14:46:09 - ERROR - stderr - +2025-05-11 14:46:09 - ERROR - stderr - +2025-05-11 14:46:09 - INFO - stdout - {'loss': 0.4575, 'grad_norm': 0.8522729873657227, 'learning_rate': 5.040643463086303e-08, 'epoch': 2.91} +2025-05-11 14:46:09 - ERROR - stderr - 97%|█████████▋| 3625/3741 [21:20:16<43:32, 22.52s/it] +2025-05-11 14:46:32 - ERROR - stderr - 97%|█████████▋| 3626/3741 [21:20:38<43:01, 22.44s/it] +2025-05-11 14:46:32 - ERROR - stderr - +2025-05-11 14:46:32 - ERROR - stderr - +2025-05-11 14:46:32 - INFO - stdout - {'loss': 0.4719, 'grad_norm': 0.8534807562828064, 'learning_rate': 4.954181940682201e-08, 'epoch': 2.91} +2025-05-11 14:46:32 - ERROR - stderr - 97%|█████████▋| 3626/3741 [21:20:38<43:01, 22.44s/it] +2025-05-11 14:46:54 - ERROR - stderr - 97%|█████████▋| 3627/3741 [21:21:00<42:37, 22.44s/it] +2025-05-11 14:46:54 - ERROR - stderr - +2025-05-11 14:46:54 - ERROR - stderr - +2025-05-11 14:46:54 - INFO - stdout - {'loss': 0.4726, 'grad_norm': 0.8463496565818787, 'learning_rate': 4.8684665375201553e-08, 'epoch': 2.91} +2025-05-11 14:46:54 - ERROR - stderr - 97%|█████████▋| 3627/3741 [21:21:00<42:37, 22.44s/it] +2025-05-11 14:47:16 - ERROR - stderr - 97%|█████████▋| 3628/3741 [21:21:22<41:54, 22.25s/it] +2025-05-11 14:47:16 - ERROR - stderr - +2025-05-11 14:47:16 - ERROR - stderr - +2025-05-11 14:47:16 - INFO - stdout - {'loss': 0.4646, 'grad_norm': 0.8361274003982544, 'learning_rate': 4.7834973178721986e-08, 'epoch': 2.91} +2025-05-11 14:47:16 - ERROR - stderr - 97%|█████████▋| 3628/3741 [21:21:22<41:54, 22.25s/it] +2025-05-11 14:47:38 - ERROR - stderr - 97%|█████████▋| 3629/3741 [21:21:44<41:18, 22.13s/it] +2025-05-11 14:47:38 - ERROR - stderr - +2025-05-11 14:47:38 - ERROR - stderr - +2025-05-11 14:47:38 - INFO - stdout - {'loss': 0.4596, 'grad_norm': 0.8507230877876282, 'learning_rate': 4.6992743454513654e-08, 'epoch': 2.91} +2025-05-11 14:47:38 - ERROR - stderr - 97%|█████████▋| 3629/3741 [21:21:44<41:18, 22.13s/it] +2025-05-11 14:48:00 - ERROR - stderr - 97%|█████████▋| 3630/3741 [21:22:06<40:52, 22.09s/it] +2025-05-11 14:48:00 - ERROR - stderr - +2025-05-11 14:48:00 - ERROR - stderr - +2025-05-11 14:48:00 - INFO - stdout - {'loss': 0.4887, 'grad_norm': 0.8449583649635315, 'learning_rate': 4.615797683410694e-08, 'epoch': 2.91} +2025-05-11 14:48:00 - ERROR - stderr - 97%|█████████▋| 3630/3741 [21:22:06<40:52, 22.09s/it] +2025-05-11 14:48:23 - ERROR - stderr - 97%|█████████▋| 3631/3741 [21:22:30<41:20, 22.55s/it] +2025-05-11 14:48:23 - ERROR - stderr - +2025-05-11 14:48:23 - ERROR - stderr - +2025-05-11 14:48:23 - INFO - stdout - {'loss': 0.4702, 'grad_norm': 0.8460783362388611, 'learning_rate': 4.533067394344115e-08, 'epoch': 2.91} +2025-05-11 14:48:23 - ERROR - stderr - 97%|█████████▋| 3631/3741 [21:22:30<41:20, 22.55s/it] +2025-05-11 14:48:45 - ERROR - stderr - 97%|█████████▋| 3632/3741 [21:22:51<40:27, 22.27s/it] +2025-05-11 14:48:45 - ERROR - stderr - +2025-05-11 14:48:45 - ERROR - stderr - +2025-05-11 14:48:45 - INFO - stdout - {'loss': 0.478, 'grad_norm': 0.89262855052948, 'learning_rate': 4.4510835402853394e-08, 'epoch': 2.91} +2025-05-11 14:48:45 - ERROR - stderr - 97%|█████████▋| 3632/3741 [21:22:51<40:27, 22.27s/it] +2025-05-11 14:49:06 - ERROR - stderr - 97%|█████████▋| 3633/3741 [21:23:13<39:37, 22.02s/it] +2025-05-11 14:49:06 - ERROR - stderr - +2025-05-11 14:49:06 - ERROR - stderr - +2025-05-11 14:49:06 - INFO - stdout - {'loss': 0.4603, 'grad_norm': 0.8806936144828796, 'learning_rate': 4.369846182708748e-08, 'epoch': 2.91} +2025-05-11 14:49:06 - ERROR - stderr - 97%|█████████▋| 3633/3741 [21:23:13<39:37, 22.02s/it] +2025-05-11 14:49:26 - ERROR - stderr - 97%|█████████▋| 3634/3741 [21:23:33<38:03, 21.34s/it] +2025-05-11 14:49:26 - ERROR - stderr - +2025-05-11 14:49:26 - ERROR - stderr - +2025-05-11 14:49:26 - INFO - stdout - {'loss': 0.4593, 'grad_norm': 0.9069940447807312, 'learning_rate': 4.289355382529059e-08, 'epoch': 2.91} +2025-05-11 14:49:26 - ERROR - stderr - 97%|█████████▋| 3634/3741 [21:23:33<38:03, 21.34s/it] +2025-05-11 14:49:46 - ERROR - stderr - 97%|█████████▋| 3635/3741 [21:23:53<37:00, 20.95s/it] +2025-05-11 14:49:46 - ERROR - stderr - +2025-05-11 14:49:46 - ERROR - stderr - +2025-05-11 14:49:46 - INFO - stdout - {'loss': 0.4879, 'grad_norm': 0.9094383716583252, 'learning_rate': 4.2096112001006604e-08, 'epoch': 2.91} +2025-05-11 14:49:46 - ERROR - stderr - 97%|█████████▋| 3635/3741 [21:23:53<37:00, 20.95s/it] +2025-05-11 14:50:06 - ERROR - stderr - 97%|█████████▋| 3636/3741 [21:24:13<36:17, 20.74s/it] +2025-05-11 14:50:06 - ERROR - stderr - +2025-05-11 14:50:06 - ERROR - stderr - +2025-05-11 14:50:06 - INFO - stdout - {'loss': 0.4808, 'grad_norm': 0.8786039352416992, 'learning_rate': 4.1306136952187214e-08, 'epoch': 2.92} +2025-05-11 14:50:06 - ERROR - stderr - 97%|█████████▋| 3636/3741 [21:24:13<36:17, 20.74s/it] +2025-05-11 14:50:26 - ERROR - stderr - 97%|█████████▋| 3637/3741 [21:24:33<35:30, 20.49s/it] +2025-05-11 14:50:26 - ERROR - stderr - +2025-05-11 14:50:26 - ERROR - stderr - +2025-05-11 14:50:26 - INFO - stdout - {'loss': 0.4875, 'grad_norm': 0.9403489828109741, 'learning_rate': 4.052362927118303e-08, 'epoch': 2.92} +2025-05-11 14:50:26 - ERROR - stderr - 97%|█████████▋| 3637/3741 [21:24:33<35:30, 20.49s/it] +2025-05-11 14:50:46 - ERROR - stderr - 97%|█████████▋| 3638/3741 [21:24:53<34:53, 20.32s/it] +2025-05-11 14:50:46 - ERROR - stderr - +2025-05-11 14:50:46 - ERROR - stderr - +2025-05-11 14:50:46 - INFO - stdout - {'loss': 0.45, 'grad_norm': 0.7999061346054077, 'learning_rate': 3.974858954474248e-08, 'epoch': 2.92} +2025-05-11 14:50:46 - ERROR - stderr - 97%|█████████▋| 3638/3741 [21:24:53<34:53, 20.32s/it] +2025-05-11 14:51:07 - ERROR - stderr - 97%|█████████▋| 3639/3741 [21:25:13<34:43, 20.43s/it] +2025-05-11 14:51:07 - ERROR - stderr - +2025-05-11 14:51:07 - ERROR - stderr - +2025-05-11 14:51:07 - INFO - stdout - {'loss': 0.4521, 'grad_norm': 0.8469072580337524, 'learning_rate': 3.898101835401846e-08, 'epoch': 2.92} +2025-05-11 14:51:07 - ERROR - stderr - 97%|█████████▋| 3639/3741 [21:25:13<34:43, 20.43s/it] +2025-05-11 14:51:27 - ERROR - stderr - 97%|█████████▋| 3640/3741 [21:25:33<34:06, 20.26s/it] +2025-05-11 14:51:27 - ERROR - stderr - +2025-05-11 14:51:27 - ERROR - stderr - +2025-05-11 14:51:27 - INFO - stdout - {'loss': 0.48, 'grad_norm': 0.8923320770263672, 'learning_rate': 3.82209162745617e-08, 'epoch': 2.92} +2025-05-11 14:51:27 - ERROR - stderr - 97%|█████████▋| 3640/3741 [21:25:33<34:06, 20.26s/it] +2025-05-11 14:51:47 - ERROR - stderr - 97%|█████████▋| 3641/3741 [21:25:53<33:29, 20.10s/it] +2025-05-11 14:51:47 - ERROR - stderr - +2025-05-11 14:51:47 - ERROR - stderr - +2025-05-11 14:51:47 - INFO - stdout - {'loss': 0.4469, 'grad_norm': 0.8438341021537781, 'learning_rate': 3.746828387632184e-08, 'epoch': 2.92} +2025-05-11 14:51:47 - ERROR - stderr - 97%|█████████▋| 3641/3741 [21:25:53<33:29, 20.10s/it] +2025-05-11 14:52:06 - ERROR - stderr - 97%|█████████▋| 3642/3741 [21:26:13<32:56, 19.96s/it] +2025-05-11 14:52:06 - ERROR - stderr - +2025-05-11 14:52:06 - ERROR - stderr - +2025-05-11 14:52:06 - INFO - stdout - {'loss': 0.4609, 'grad_norm': 0.8810524940490723, 'learning_rate': 3.672312172365078e-08, 'epoch': 2.92} +2025-05-11 14:52:06 - ERROR - stderr - 97%|█████████▋| 3642/3741 [21:26:13<32:56, 19.96s/it] +2025-05-11 14:52:26 - ERROR - stderr - 97%|█████████▋| 3643/3741 [21:26:32<32:28, 19.88s/it] +2025-05-11 14:52:26 - ERROR - stderr - +2025-05-11 14:52:26 - ERROR - stderr - +2025-05-11 14:52:26 - INFO - stdout - {'loss': 0.4814, 'grad_norm': 0.902431309223175, 'learning_rate': 3.598543037529378e-08, 'epoch': 2.92} +2025-05-11 14:52:26 - ERROR - stderr - 97%|█████████▋| 3643/3741 [21:26:32<32:28, 19.88s/it] +2025-05-11 14:52:46 - ERROR - stderr - 97%|█████████▋| 3644/3741 [21:26:52<32:03, 19.83s/it] +2025-05-11 14:52:46 - ERROR - stderr - +2025-05-11 14:52:46 - ERROR - stderr - +2025-05-11 14:52:46 - INFO - stdout - {'loss': 0.465, 'grad_norm': 0.8674122095108032, 'learning_rate': 3.525521038439728e-08, 'epoch': 2.92} +2025-05-11 14:52:46 - ERROR - stderr - 97%|█████████▋| 3644/3741 [21:26:52<32:03, 19.83s/it] +2025-05-11 14:53:05 - ERROR - stderr - 97%|█████████▋| 3645/3741 [21:27:11<31:34, 19.74s/it] +2025-05-11 14:53:05 - ERROR - stderr - +2025-05-11 14:53:05 - ERROR - stderr - +2025-05-11 14:53:05 - INFO - stdout - {'loss': 0.4554, 'grad_norm': 0.8674834370613098, 'learning_rate': 3.4532462298506596e-08, 'epoch': 2.92} +2025-05-11 14:53:05 - ERROR - stderr - 97%|█████████▋| 3645/3741 [21:27:11<31:34, 19.74s/it] +2025-05-11 14:53:25 - ERROR - stderr - 97%|█████████▋| 3646/3741 [21:27:31<31:10, 19.68s/it] +2025-05-11 14:53:25 - ERROR - stderr - +2025-05-11 14:53:25 - ERROR - stderr - +2025-05-11 14:53:25 - INFO - stdout - {'loss': 0.4579, 'grad_norm': 0.8510974049568176, 'learning_rate': 3.3817186659560466e-08, 'epoch': 2.92} +2025-05-11 14:53:25 - ERROR - stderr - 97%|█████████▋| 3646/3741 [21:27:31<31:10, 19.68s/it] +2025-05-11 14:53:45 - ERROR - stderr - 97%|█████████▋| 3647/3741 [21:27:51<30:53, 19.72s/it] +2025-05-11 14:53:45 - ERROR - stderr - +2025-05-11 14:53:45 - ERROR - stderr - +2025-05-11 14:53:45 - INFO - stdout - {'loss': 0.4764, 'grad_norm': 0.8944957852363586, 'learning_rate': 3.3109384003899844e-08, 'epoch': 2.92} +2025-05-11 14:53:45 - ERROR - stderr - 97%|█████████▋| 3647/3741 [21:27:51<30:53, 19.72s/it] +2025-05-11 14:54:04 - ERROR - stderr - 98%|█████████▊| 3648/3741 [21:28:10<30:30, 19.68s/it] +2025-05-11 14:54:04 - ERROR - stderr - +2025-05-11 14:54:04 - ERROR - stderr - +2025-05-11 14:54:04 - INFO - stdout - {'loss': 0.4763, 'grad_norm': 0.8609873056411743, 'learning_rate': 3.2409054862256875e-08, 'epoch': 2.93} +2025-05-11 14:54:04 - ERROR - stderr - 98%|█████████▊| 3648/3741 [21:28:10<30:30, 19.68s/it] +2025-05-11 14:54:24 - ERROR - stderr - 98%|█████████▊| 3649/3741 [21:28:30<30:19, 19.78s/it] +2025-05-11 14:54:24 - ERROR - stderr - +2025-05-11 14:54:24 - ERROR - stderr - +2025-05-11 14:54:24 - INFO - stdout - {'loss': 0.4574, 'grad_norm': 0.8213399648666382, 'learning_rate': 3.17161997597637e-08, 'epoch': 2.93} +2025-05-11 14:54:24 - ERROR - stderr - 98%|█████████▊| 3649/3741 [21:28:30<30:19, 19.78s/it] +2025-05-11 14:54:44 - ERROR - stderr - 98%|█████████▊| 3650/3741 [21:28:50<29:51, 19.69s/it] +2025-05-11 14:54:44 - ERROR - stderr - +2025-05-11 14:54:44 - ERROR - stderr - +2025-05-11 14:54:44 - INFO - stdout - {'loss': 0.467, 'grad_norm': 0.8279301524162292, 'learning_rate': 3.103081921594586e-08, 'epoch': 2.93} +2025-05-11 14:54:44 - ERROR - stderr - 98%|█████████▊| 3650/3741 [21:28:50<29:51, 19.69s/it] +2025-05-11 14:55:04 - ERROR - stderr - 98%|█████████▊| 3651/3741 [21:29:10<29:45, 19.84s/it] +2025-05-11 14:55:04 - ERROR - stderr - +2025-05-11 14:55:04 - ERROR - stderr - +2025-05-11 14:55:04 - INFO - stdout - {'loss': 0.4496, 'grad_norm': 0.8031629323959351, 'learning_rate': 3.03529137447256e-08, 'epoch': 2.93} +2025-05-11 14:55:04 - ERROR - stderr - 98%|█████████▊| 3651/3741 [21:29:10<29:45, 19.84s/it] +2025-05-11 14:55:23 - ERROR - stderr - 98%|█████████▊| 3652/3741 [21:29:30<29:16, 19.73s/it] +2025-05-11 14:55:23 - ERROR - stderr - +2025-05-11 14:55:23 - ERROR - stderr - +2025-05-11 14:55:23 - INFO - stdout - {'loss': 0.4587, 'grad_norm': 0.8145686984062195, 'learning_rate': 2.968248385441852e-08, 'epoch': 2.93} +2025-05-11 14:55:23 - ERROR - stderr - 98%|█████████▊| 3652/3741 [21:29:30<29:16, 19.73s/it] +2025-05-11 14:55:43 - ERROR - stderr - 98%|█████████▊| 3653/3741 [21:29:49<28:54, 19.71s/it] +2025-05-11 14:55:43 - ERROR - stderr - +2025-05-11 14:55:43 - ERROR - stderr - +2025-05-11 14:55:43 - INFO - stdout - {'loss': 0.4765, 'grad_norm': 0.8566755652427673, 'learning_rate': 2.9019530047736944e-08, 'epoch': 2.93} +2025-05-11 14:55:43 - ERROR - stderr - 98%|█████████▊| 3653/3741 [21:29:49<28:54, 19.71s/it] +2025-05-11 14:56:03 - ERROR - stderr - 98%|█████████▊| 3654/3741 [21:30:09<28:36, 19.73s/it] +2025-05-11 14:56:03 - ERROR - stderr - +2025-05-11 14:56:03 - ERROR - stderr - +2025-05-11 14:56:03 - INFO - stdout - {'loss': 0.4975, 'grad_norm': 0.8937522768974304, 'learning_rate': 2.836405282178656e-08, 'epoch': 2.93} +2025-05-11 14:56:03 - ERROR - stderr - 98%|█████████▊| 3654/3741 [21:30:09<28:36, 19.73s/it] +2025-05-11 14:56:22 - ERROR - stderr - 98%|█████████▊| 3655/3741 [21:30:28<28:06, 19.62s/it] +2025-05-11 14:56:22 - ERROR - stderr - +2025-05-11 14:56:22 - ERROR - stderr - +2025-05-11 14:56:22 - INFO - stdout - {'loss': 0.4558, 'grad_norm': 0.8584054708480835, 'learning_rate': 2.7716052668064208e-08, 'epoch': 2.93} +2025-05-11 14:56:22 - ERROR - stderr - 98%|█████████▊| 3655/3741 [21:30:28<28:06, 19.62s/it] +2025-05-11 14:56:42 - ERROR - stderr - 98%|█████████▊| 3656/3741 [21:30:48<27:45, 19.59s/it] +2025-05-11 14:56:42 - ERROR - stderr - +2025-05-11 14:56:42 - ERROR - stderr - +2025-05-11 14:56:42 - INFO - stdout - {'loss': 0.491, 'grad_norm': 0.8949685096740723, 'learning_rate': 2.707553007246455e-08, 'epoch': 2.93} +2025-05-11 14:56:42 - ERROR - stderr - 98%|█████████▊| 3656/3741 [21:30:48<27:45, 19.59s/it] +2025-05-11 14:57:01 - ERROR - stderr - 98%|█████████▊| 3657/3741 [21:31:07<27:22, 19.55s/it] +2025-05-11 14:57:01 - ERROR - stderr - +2025-05-11 14:57:01 - ERROR - stderr - +2025-05-11 14:57:01 - INFO - stdout - {'loss': 0.4682, 'grad_norm': 0.8435682654380798, 'learning_rate': 2.6442485515273397e-08, 'epoch': 2.93} +2025-05-11 14:57:01 - ERROR - stderr - 98%|█████████▊| 3657/3741 [21:31:07<27:22, 19.55s/it] +2025-05-11 14:57:21 - ERROR - stderr - 98%|█████████▊| 3658/3741 [21:31:27<27:10, 19.64s/it] +2025-05-11 14:57:21 - ERROR - stderr - +2025-05-11 14:57:21 - ERROR - stderr - +2025-05-11 14:57:21 - INFO - stdout - {'loss': 0.4771, 'grad_norm': 0.8290677070617676, 'learning_rate': 2.581691947116771e-08, 'epoch': 2.93} +2025-05-11 14:57:21 - ERROR - stderr - 98%|█████████▊| 3658/3741 [21:31:27<27:10, 19.64s/it] +2025-05-11 14:57:40 - ERROR - stderr - 98%|█████████▊| 3659/3741 [21:31:47<26:46, 19.59s/it] +2025-05-11 14:57:40 - ERROR - stderr - +2025-05-11 14:57:40 - ERROR - stderr - +2025-05-11 14:57:40 - INFO - stdout - {'loss': 0.4779, 'grad_norm': 0.8999835252761841, 'learning_rate': 2.5198832409218944e-08, 'epoch': 2.93} +2025-05-11 14:57:40 - ERROR - stderr - 98%|█████████▊| 3659/3741 [21:31:47<26:46, 19.59s/it] +2025-05-11 14:58:00 - ERROR - stderr - 98%|█████████▊| 3660/3741 [21:32:06<26:23, 19.54s/it] +2025-05-11 14:58:00 - ERROR - stderr - +2025-05-11 14:58:00 - ERROR - stderr - +2025-05-11 14:58:00 - INFO - stdout - {'loss': 0.4586, 'grad_norm': 0.8274721503257751, 'learning_rate': 2.458822479288969e-08, 'epoch': 2.94} +2025-05-11 14:58:00 - ERROR - stderr - 98%|█████████▊| 3660/3741 [21:32:06<26:23, 19.54s/it] +2025-05-11 14:58:19 - ERROR - stderr - 98%|█████████▊| 3661/3741 [21:32:26<26:02, 19.53s/it] +2025-05-11 14:58:19 - ERROR - stderr - +2025-05-11 14:58:19 - ERROR - stderr - +2025-05-11 14:58:19 - INFO - stdout - {'loss': 0.4738, 'grad_norm': 0.8735244274139404, 'learning_rate': 2.3985097080033715e-08, 'epoch': 2.94} +2025-05-11 14:58:19 - ERROR - stderr - 98%|█████████▊| 3661/3741 [21:32:26<26:02, 19.53s/it] +2025-05-11 14:58:39 - ERROR - stderr - 98%|█████████▊| 3662/3741 [21:32:45<25:42, 19.52s/it] +2025-05-11 14:58:39 - ERROR - stderr - +2025-05-11 14:58:39 - ERROR - stderr - +2025-05-11 14:58:39 - INFO - stdout - {'loss': 0.4773, 'grad_norm': 0.8940444588661194, 'learning_rate': 2.3389449722898137e-08, 'epoch': 2.94} +2025-05-11 14:58:39 - ERROR - stderr - 98%|█████████▊| 3662/3741 [21:32:45<25:42, 19.52s/it] +2025-05-11 14:58:58 - ERROR - stderr - 98%|█████████▊| 3663/3741 [21:33:04<25:17, 19.45s/it] +2025-05-11 14:58:58 - ERROR - stderr - +2025-05-11 14:58:58 - ERROR - stderr - +2025-05-11 14:58:58 - INFO - stdout - {'loss': 0.4798, 'grad_norm': 0.9149758219718933, 'learning_rate': 2.2801283168119028e-08, 'epoch': 2.94} +2025-05-11 14:58:58 - ERROR - stderr - 98%|█████████▊| 3663/3741 [21:33:04<25:17, 19.45s/it] +2025-05-11 14:59:18 - ERROR - stderr - 98%|█████████▊| 3664/3741 [21:33:24<24:58, 19.46s/it] +2025-05-11 14:59:18 - ERROR - stderr - +2025-05-11 14:59:18 - ERROR - stderr - +2025-05-11 14:59:18 - INFO - stdout - {'loss': 0.4672, 'grad_norm': 0.8782206773757935, 'learning_rate': 2.222059785672359e-08, 'epoch': 2.94} +2025-05-11 14:59:18 - ERROR - stderr - 98%|█████████▊| 3664/3741 [21:33:24<24:58, 19.46s/it] +2025-05-11 14:59:37 - ERROR - stderr - 98%|█████████▊| 3665/3741 [21:33:44<24:45, 19.55s/it] +2025-05-11 14:59:37 - ERROR - stderr - +2025-05-11 14:59:37 - ERROR - stderr - +2025-05-11 14:59:37 - INFO - stdout - {'loss': 0.483, 'grad_norm': 0.8763145804405212, 'learning_rate': 2.1647394224129092e-08, 'epoch': 2.94} +2025-05-11 14:59:37 - ERROR - stderr - 98%|█████████▊| 3665/3741 [21:33:44<24:45, 19.55s/it] +2025-05-11 14:59:57 - ERROR - stderr - 98%|█████████▊| 3666/3741 [21:34:03<24:28, 19.58s/it] +2025-05-11 14:59:57 - ERROR - stderr - +2025-05-11 14:59:57 - ERROR - stderr - +2025-05-11 14:59:57 - INFO - stdout - {'loss': 0.4818, 'grad_norm': 0.8692938089370728, 'learning_rate': 2.108167270014394e-08, 'epoch': 2.94} +2025-05-11 14:59:57 - ERROR - stderr - 98%|█████████▊| 3666/3741 [21:34:03<24:28, 19.58s/it] +2025-05-11 15:00:17 - ERROR - stderr - 98%|█████████▊| 3667/3741 [21:34:24<24:23, 19.77s/it] +2025-05-11 15:00:17 - ERROR - stderr - +2025-05-11 15:00:17 - ERROR - stderr - +2025-05-11 15:00:17 - INFO - stdout - {'loss': 0.4554, 'grad_norm': 0.8743387460708618, 'learning_rate': 2.052343370896437e-08, 'epoch': 2.94} +2025-05-11 15:00:17 - ERROR - stderr - 98%|█████████▊| 3667/3741 [21:34:24<24:23, 19.77s/it] +2025-05-11 15:00:37 - ERROR - stderr - 98%|█████████▊| 3668/3741 [21:34:43<24:03, 19.77s/it] +2025-05-11 15:00:37 - ERROR - stderr - +2025-05-11 15:00:37 - ERROR - stderr - +2025-05-11 15:00:37 - INFO - stdout - {'loss': 0.4854, 'grad_norm': 0.8664737343788147, 'learning_rate': 1.9972677669177766e-08, 'epoch': 2.94} +2025-05-11 15:00:37 - ERROR - stderr - 98%|█████████▊| 3668/3741 [21:34:43<24:03, 19.77s/it] +2025-05-11 15:00:57 - ERROR - stderr - 98%|█████████▊| 3669/3741 [21:35:03<23:47, 19.83s/it] +2025-05-11 15:00:57 - ERROR - stderr - +2025-05-11 15:00:57 - ERROR - stderr - +2025-05-11 15:00:57 - INFO - stdout - {'loss': 0.4664, 'grad_norm': 0.8775107860565186, 'learning_rate': 1.942940499376045e-08, 'epoch': 2.94} +2025-05-11 15:00:57 - ERROR - stderr - 98%|█████████▊| 3669/3741 [21:35:03<23:47, 19.83s/it] +2025-05-11 15:00:57 - INFO - stdout - WARNING: tokenization mismatch: 1 vs. 3126. (ignored) +2025-05-11 15:01:16 - ERROR - stderr - 98%|█████████▊| 3670/3741 [21:35:23<23:16, 19.67s/it] +2025-05-11 15:01:16 - ERROR - stderr - +2025-05-11 15:01:16 - ERROR - stderr - +2025-05-11 15:01:16 - INFO - stdout - {'loss': 0.4388, 'grad_norm': 0.8614839315414429, 'learning_rate': 1.889361609007434e-08, 'epoch': 2.94} +2025-05-11 15:01:16 - ERROR - stderr - 98%|█████████▊| 3670/3741 [21:35:23<23:16, 19.67s/it] +2025-05-11 15:01:36 - ERROR - stderr - 98%|█████████▊| 3671/3741 [21:35:42<22:58, 19.69s/it] +2025-05-11 15:01:36 - ERROR - stderr - +2025-05-11 15:01:36 - ERROR - stderr - +2025-05-11 15:01:36 - INFO - stdout - {'loss': 0.4858, 'grad_norm': 0.9102433323860168, 'learning_rate': 1.836531135987474e-08, 'epoch': 2.94} +2025-05-11 15:01:36 - ERROR - stderr - 98%|█████████▊| 3671/3741 [21:35:42<22:58, 19.69s/it] +2025-05-11 15:01:56 - ERROR - stderr - 98%|█████████▊| 3672/3741 [21:36:02<22:47, 19.81s/it] +2025-05-11 15:01:56 - ERROR - stderr - +2025-05-11 15:01:56 - ERROR - stderr - +2025-05-11 15:01:56 - INFO - stdout - {'loss': 0.4674, 'grad_norm': 0.8594423532485962, 'learning_rate': 1.7844491199301428e-08, 'epoch': 2.94} +2025-05-11 15:01:56 - ERROR - stderr - 98%|█████████▊| 3672/3741 [21:36:02<22:47, 19.81s/it] +2025-05-11 15:02:16 - ERROR - stderr - 98%|█████████▊| 3673/3741 [21:36:22<22:22, 19.74s/it] +2025-05-11 15:02:16 - ERROR - stderr - +2025-05-11 15:02:16 - ERROR - stderr - +2025-05-11 15:02:16 - INFO - stdout - {'loss': 0.4674, 'grad_norm': 0.9068805575370789, 'learning_rate': 1.733115599888202e-08, 'epoch': 2.95} +2025-05-11 15:02:16 - ERROR - stderr - 98%|█████████▊| 3673/3741 [21:36:22<22:22, 19.74s/it] +2025-05-11 15:02:36 - ERROR - stderr - 98%|█████████▊| 3674/3741 [21:36:42<22:12, 19.88s/it] +2025-05-11 15:02:36 - ERROR - stderr - +2025-05-11 15:02:36 - ERROR - stderr - +2025-05-11 15:02:36 - INFO - stdout - {'loss': 0.5031, 'grad_norm': 0.9371825456619263, 'learning_rate': 1.682530614353528e-08, 'epoch': 2.95} +2025-05-11 15:02:36 - ERROR - stderr - 98%|█████████▊| 3674/3741 [21:36:42<22:12, 19.88s/it] +2025-05-11 15:02:56 - ERROR - stderr - 98%|█████████▊| 3675/3741 [21:37:02<21:56, 19.94s/it] +2025-05-11 15:02:56 - ERROR - stderr - +2025-05-11 15:02:56 - ERROR - stderr - +2025-05-11 15:02:56 - INFO - stdout - {'loss': 0.4942, 'grad_norm': 0.9196876287460327, 'learning_rate': 1.6326942012562242e-08, 'epoch': 2.95} +2025-05-11 15:02:56 - ERROR - stderr - 98%|█████████▊| 3675/3741 [21:37:02<21:56, 19.94s/it] +2025-05-11 15:03:16 - ERROR - stderr - 98%|█████████▊| 3676/3741 [21:37:22<21:39, 19.99s/it] +2025-05-11 15:03:16 - ERROR - stderr - +2025-05-11 15:03:16 - ERROR - stderr - +2025-05-11 15:03:16 - INFO - stdout - {'loss': 0.4588, 'grad_norm': 0.7983100414276123, 'learning_rate': 1.5836063979656202e-08, 'epoch': 2.95} +2025-05-11 15:03:16 - ERROR - stderr - 98%|█████████▊| 3676/3741 [21:37:22<21:39, 19.99s/it] +2025-05-11 15:03:36 - ERROR - stderr - 98%|█████████▊| 3677/3741 [21:37:42<21:19, 19.99s/it] +2025-05-11 15:03:36 - ERROR - stderr - +2025-05-11 15:03:36 - ERROR - stderr - +2025-05-11 15:03:36 - INFO - stdout - {'loss': 0.4615, 'grad_norm': 0.8527657389640808, 'learning_rate': 1.535267241289051e-08, 'epoch': 2.95} +2025-05-11 15:03:36 - ERROR - stderr - 98%|█████████▊| 3677/3741 [21:37:42<21:19, 19.99s/it] +2025-05-11 15:03:56 - ERROR - stderr - 98%|█████████▊| 3678/3741 [21:38:02<20:51, 19.87s/it] +2025-05-11 15:03:56 - ERROR - stderr - +2025-05-11 15:03:56 - ERROR - stderr - +2025-05-11 15:03:56 - INFO - stdout - {'loss': 0.4497, 'grad_norm': 0.8166870474815369, 'learning_rate': 1.4876767674730786e-08, 'epoch': 2.95} +2025-05-11 15:03:56 - ERROR - stderr - 98%|█████████▊| 3678/3741 [21:38:02<20:51, 19.87s/it] +2025-05-11 15:04:15 - ERROR - stderr - 98%|█████████▊| 3679/3741 [21:38:22<20:28, 19.81s/it] +2025-05-11 15:04:15 - ERROR - stderr - +2025-05-11 15:04:15 - ERROR - stderr - +2025-05-11 15:04:15 - INFO - stdout - {'loss': 0.4559, 'grad_norm': 0.9895662665367126, 'learning_rate': 1.4408350122027126e-08, 'epoch': 2.95} +2025-05-11 15:04:15 - ERROR - stderr - 98%|█████████▊| 3679/3741 [21:38:22<20:28, 19.81s/it] +2025-05-11 15:04:35 - ERROR - stderr - 98%|█████████▊| 3680/3741 [21:38:41<20:04, 19.75s/it] +2025-05-11 15:04:35 - ERROR - stderr - +2025-05-11 15:04:35 - ERROR - stderr - +2025-05-11 15:04:35 - INFO - stdout - {'loss': 0.4756, 'grad_norm': 0.8738000988960266, 'learning_rate': 1.3947420106013021e-08, 'epoch': 2.95} +2025-05-11 15:04:35 - ERROR - stderr - 98%|█████████▊| 3680/3741 [21:38:41<20:04, 19.75s/it] +2025-05-11 15:04:55 - ERROR - stderr - 98%|█████████▊| 3681/3741 [21:39:01<19:44, 19.74s/it] +2025-05-11 15:04:55 - ERROR - stderr - +2025-05-11 15:04:55 - ERROR - stderr - +2025-05-11 15:04:55 - INFO - stdout - {'loss': 0.4423, 'grad_norm': 0.8730235695838928, 'learning_rate': 1.3493977972312e-08, 'epoch': 2.95} +2025-05-11 15:04:55 - ERROR - stderr - 98%|█████████▊| 3681/3741 [21:39:01<19:44, 19.74s/it] +2025-05-11 15:05:14 - ERROR - stderr - 98%|█████████▊| 3682/3741 [21:39:20<19:19, 19.65s/it] +2025-05-11 15:05:14 - ERROR - stderr - +2025-05-11 15:05:14 - ERROR - stderr - +2025-05-11 15:05:14 - INFO - stdout - {'loss': 0.4899, 'grad_norm': 0.9113731980323792, 'learning_rate': 1.3048024060928754e-08, 'epoch': 2.95} +2025-05-11 15:05:14 - ERROR - stderr - 98%|█████████▊| 3682/3741 [21:39:20<19:19, 19.65s/it] +2025-05-11 15:05:34 - ERROR - stderr - 98%|█████████▊| 3683/3741 [21:39:40<19:07, 19.78s/it] +2025-05-11 15:05:34 - ERROR - stderr - +2025-05-11 15:05:34 - ERROR - stderr - +2025-05-11 15:05:34 - INFO - stdout - {'loss': 0.4661, 'grad_norm': 0.8570337891578674, 'learning_rate': 1.2609558706253578e-08, 'epoch': 2.95} +2025-05-11 15:05:34 - ERROR - stderr - 98%|█████████▊| 3683/3741 [21:39:40<19:07, 19.78s/it] +2025-05-11 15:05:54 - ERROR - stderr - 98%|█████████▊| 3684/3741 [21:40:00<18:40, 19.67s/it] +2025-05-11 15:05:54 - ERROR - stderr - +2025-05-11 15:05:54 - ERROR - stderr - +2025-05-11 15:05:54 - INFO - stdout - {'loss': 0.4826, 'grad_norm': 0.8690351843833923, 'learning_rate': 1.2178582237065695e-08, 'epoch': 2.95} +2025-05-11 15:05:54 - ERROR - stderr - 98%|█████████▊| 3684/3741 [21:40:00<18:40, 19.67s/it] +2025-05-11 15:06:14 - ERROR - stderr - 99%|█████████▊| 3685/3741 [21:40:20<18:33, 19.88s/it] +2025-05-11 15:06:14 - ERROR - stderr - +2025-05-11 15:06:14 - ERROR - stderr - +2025-05-11 15:06:14 - INFO - stdout - {'loss': 0.4757, 'grad_norm': 0.8450669050216675, 'learning_rate': 1.1755094976523273e-08, 'epoch': 2.96} +2025-05-11 15:06:14 - ERROR - stderr - 99%|█████████▊| 3685/3741 [21:40:20<18:33, 19.88s/it] +2025-05-11 15:06:34 - ERROR - stderr - 99%|█████████▊| 3686/3741 [21:40:40<18:11, 19.84s/it] +2025-05-11 15:06:34 - ERROR - stderr - +2025-05-11 15:06:34 - ERROR - stderr - +2025-05-11 15:06:34 - INFO - stdout - {'loss': 0.449, 'grad_norm': 0.8436766266822815, 'learning_rate': 1.1339097242173414e-08, 'epoch': 2.96} +2025-05-11 15:06:34 - ERROR - stderr - 99%|█████████▊| 3686/3741 [21:40:40<18:11, 19.84s/it] +2025-05-11 15:06:54 - ERROR - stderr - 99%|█████████▊| 3687/3741 [21:41:00<18:00, 20.00s/it] +2025-05-11 15:06:54 - ERROR - stderr - +2025-05-11 15:06:54 - ERROR - stderr - +2025-05-11 15:06:54 - INFO - stdout - {'loss': 0.4733, 'grad_norm': 0.9269471168518066, 'learning_rate': 1.0930589345944376e-08, 'epoch': 2.96} +2025-05-11 15:06:54 - ERROR - stderr - 99%|█████████▊| 3687/3741 [21:41:00<18:00, 20.00s/it] +2025-05-11 15:07:13 - ERROR - stderr - 99%|█████████▊| 3688/3741 [21:41:20<17:29, 19.81s/it] +2025-05-11 15:07:13 - ERROR - stderr - +2025-05-11 15:07:13 - ERROR - stderr - +2025-05-11 15:07:13 - INFO - stdout - {'loss': 0.4396, 'grad_norm': 0.8610197305679321, 'learning_rate': 1.0529571594150023e-08, 'epoch': 2.96} +2025-05-11 15:07:13 - ERROR - stderr - 99%|█████████▊| 3688/3741 [21:41:20<17:29, 19.81s/it] +2025-05-11 15:07:34 - ERROR - stderr - 99%|█████████▊| 3689/3741 [21:41:41<17:26, 20.13s/it] +2025-05-11 15:07:34 - ERROR - stderr - +2025-05-11 15:07:34 - ERROR - stderr - +2025-05-11 15:07:34 - INFO - stdout - {'loss': 0.4536, 'grad_norm': 0.879664957523346, 'learning_rate': 1.013604428748538e-08, 'epoch': 2.96} +2025-05-11 15:07:34 - ERROR - stderr - 99%|█████████▊| 3689/3741 [21:41:41<17:26, 20.13s/it] +2025-05-11 15:07:54 - ERROR - stderr - 99%|█████████▊| 3690/3741 [21:42:00<16:56, 19.92s/it] +2025-05-11 15:07:54 - ERROR - stderr - +2025-05-11 15:07:54 - ERROR - stderr - +2025-05-11 15:07:54 - INFO - stdout - {'loss': 0.489, 'grad_norm': 0.9061784148216248, 'learning_rate': 9.750007721032184e-09, 'epoch': 2.96} +2025-05-11 15:07:54 - ERROR - stderr - 99%|█████████▊| 3690/3741 [21:42:00<16:56, 19.92s/it] +2025-05-11 15:08:14 - ERROR - stderr - 99%|█████████▊| 3691/3741 [21:42:20<16:35, 19.90s/it] +2025-05-11 15:08:14 - ERROR - stderr - +2025-05-11 15:08:14 - ERROR - stderr - +2025-05-11 15:08:14 - INFO - stdout - {'loss': 0.4883, 'grad_norm': 0.8848870992660522, 'learning_rate': 9.371462184254443e-09, 'epoch': 2.96} +2025-05-11 15:08:14 - ERROR - stderr - 99%|█████████▊| 3691/3741 [21:42:20<16:35, 19.90s/it] +2025-05-11 15:08:14 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5807 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 15:08:14 - WARNING - transformers.tokenization_utils_base - Token indices sequence length is longer than the specified maximum sequence length for this model (5807 > 4096). Running this sequence through the model will result in indexing errors +2025-05-11 15:08:33 - ERROR - stderr - 99%|█████████▊| 3692/3741 [21:42:40<16:13, 19.87s/it] +2025-05-11 15:08:33 - ERROR - stderr - +2025-05-11 15:08:33 - ERROR - stderr - +2025-05-11 15:08:33 - INFO - stdout - {'loss': 0.4915, 'grad_norm': 0.8711914420127869, 'learning_rate': 9.000407960996216e-09, 'epoch': 2.96} +2025-05-11 15:08:33 - ERROR - stderr - 99%|█████████▊| 3692/3741 [21:42:40<16:13, 19.87s/it] +2025-05-11 15:08:58 - ERROR - stderr - 99%|█████████▊| 3693/3741 [21:43:05<17:05, 21.36s/it] +2025-05-11 15:08:58 - ERROR - stderr - +2025-05-11 15:08:58 - ERROR - stderr - +2025-05-11 15:08:58 - INFO - stdout - {'loss': 0.4564, 'grad_norm': 0.8504834175109863, 'learning_rate': 8.636845329488274e-09, 'epoch': 2.96} +2025-05-11 15:08:58 - ERROR - stderr - 99%|█████████▊| 3693/3741 [21:43:05<17:05, 21.36s/it] +2025-05-11 15:09:18 - ERROR - stderr - 99%|█████████▊| 3694/3741 [21:43:24<16:21, 20.89s/it] +2025-05-11 15:09:18 - ERROR - stderr - +2025-05-11 15:09:18 - ERROR - stderr - +2025-05-11 15:09:18 - INFO - stdout - {'loss': 0.4749, 'grad_norm': 0.8460233807563782, 'learning_rate': 8.280774562342552e-09, 'epoch': 2.96} +2025-05-11 15:09:18 - ERROR - stderr - 99%|█████████▊| 3694/3741 [21:43:24<16:21, 20.89s/it] +2025-05-11 15:09:38 - ERROR - stderr - 99%|█████████▉| 3695/3741 [21:43:44<15:44, 20.53s/it] +2025-05-11 15:09:38 - ERROR - stderr - +2025-05-11 15:09:38 - ERROR - stderr - +2025-05-11 15:09:38 - INFO - stdout - {'loss': 0.4831, 'grad_norm': 0.9107170104980469, 'learning_rate': 7.932195926552144e-09, 'epoch': 2.96} +2025-05-11 15:09:38 - ERROR - stderr - 99%|█████████▉| 3695/3741 [21:43:44<15:44, 20.53s/it] +2025-05-11 15:09:57 - ERROR - stderr - 99%|█████████▉| 3696/3741 [21:44:04<15:12, 20.28s/it] +2025-05-11 15:09:57 - ERROR - stderr - +2025-05-11 15:09:57 - ERROR - stderr - +2025-05-11 15:09:57 - INFO - stdout - {'loss': 0.5077, 'grad_norm': 0.9115839600563049, 'learning_rate': 7.591109683492415e-09, 'epoch': 2.96} +2025-05-11 15:09:57 - ERROR - stderr - 99%|█████████▉| 3696/3741 [21:44:04<15:12, 20.28s/it] +2025-05-11 15:10:17 - ERROR - stderr - 99%|█████████▉| 3697/3741 [21:44:23<14:46, 20.14s/it] +2025-05-11 15:10:17 - ERROR - stderr - +2025-05-11 15:10:17 - ERROR - stderr - +2025-05-11 15:10:17 - INFO - stdout - {'loss': 0.4558, 'grad_norm': 0.849886953830719, 'learning_rate': 7.257516088923222e-09, 'epoch': 2.96} +2025-05-11 15:10:17 - ERROR - stderr - 99%|█████████▉| 3697/3741 [21:44:24<14:46, 20.14s/it] +2025-05-11 15:10:37 - ERROR - stderr - 99%|█████████▉| 3698/3741 [21:44:43<14:21, 20.04s/it] +2025-05-11 15:10:37 - ERROR - stderr - +2025-05-11 15:10:37 - ERROR - stderr - +2025-05-11 15:10:37 - INFO - stdout - {'loss': 0.4509, 'grad_norm': 0.8572622537612915, 'learning_rate': 6.9314153929833646e-09, 'epoch': 2.97} +2025-05-11 15:10:37 - ERROR - stderr - 99%|█████████▉| 3698/3741 [21:44:43<14:21, 20.04s/it] +2025-05-11 15:10:57 - ERROR - stderr - 99%|█████████▉| 3699/3741 [21:45:03<13:57, 19.94s/it] +2025-05-11 15:10:57 - ERROR - stderr - +2025-05-11 15:10:57 - ERROR - stderr - +2025-05-11 15:10:57 - INFO - stdout - {'loss': 0.4814, 'grad_norm': 0.8848903179168701, 'learning_rate': 6.612807840195024e-09, 'epoch': 2.97} +2025-05-11 15:10:57 - ERROR - stderr - 99%|█████████▉| 3699/3741 [21:45:03<13:57, 19.94s/it] +2025-05-11 15:11:17 - ERROR - stderr - 99%|█████████▉| 3700/3741 [21:45:23<13:37, 19.93s/it] +2025-05-11 15:11:17 - ERROR - stderr - +2025-05-11 15:11:17 - ERROR - stderr - +2025-05-11 15:11:17 - INFO - stdout - {'loss': 0.4582, 'grad_norm': 0.811222493648529, 'learning_rate': 6.301693669459319e-09, 'epoch': 2.97} +2025-05-11 15:11:17 - ERROR - stderr - 99%|█████████▉| 3700/3741 [21:45:23<13:37, 19.93s/it] +2025-05-11 15:11:36 - ERROR - stderr - 99%|█████████▉| 3701/3741 [21:45:42<13:11, 19.80s/it] +2025-05-11 15:11:36 - ERROR - stderr - +2025-05-11 15:11:36 - ERROR - stderr - +2025-05-11 15:11:36 - INFO - stdout - {'loss': 0.4534, 'grad_norm': 0.8343945741653442, 'learning_rate': 5.998073114062975e-09, 'epoch': 2.97} +2025-05-11 15:11:36 - ERROR - stderr - 99%|█████████▉| 3701/3741 [21:45:42<13:11, 19.80s/it] +2025-05-11 15:11:56 - ERROR - stderr - 99%|█████████▉| 3702/3741 [21:46:02<12:49, 19.72s/it] +2025-05-11 15:11:56 - ERROR - stderr - +2025-05-11 15:11:56 - ERROR - stderr - +2025-05-11 15:11:56 - INFO - stdout - {'loss': 0.4938, 'grad_norm': 0.8955221176147461, 'learning_rate': 5.701946401668324e-09, 'epoch': 2.97} +2025-05-11 15:11:56 - ERROR - stderr - 99%|█████████▉| 3702/3741 [21:46:02<12:49, 19.72s/it] +2025-05-11 15:12:16 - ERROR - stderr - 99%|█████████▉| 3703/3741 [21:46:22<12:34, 19.86s/it] +2025-05-11 15:12:16 - ERROR - stderr - +2025-05-11 15:12:16 - ERROR - stderr - +2025-05-11 15:12:16 - INFO - stdout - {'loss': 0.4815, 'grad_norm': 0.9045392274856567, 'learning_rate': 5.413313754322192e-09, 'epoch': 2.97} +2025-05-11 15:12:16 - ERROR - stderr - 99%|█████████▉| 3703/3741 [21:46:22<12:34, 19.86s/it] +2025-05-11 15:12:35 - ERROR - stderr - 99%|█████████▉| 3704/3741 [21:46:42<12:11, 19.76s/it] +2025-05-11 15:12:35 - ERROR - stderr - +2025-05-11 15:12:35 - ERROR - stderr - +2025-05-11 15:12:35 - INFO - stdout - {'loss': 0.4714, 'grad_norm': 0.8312162756919861, 'learning_rate': 5.132175388452565e-09, 'epoch': 2.97} +2025-05-11 15:12:35 - ERROR - stderr - 99%|█████████▉| 3704/3741 [21:46:42<12:11, 19.76s/it] +2025-05-11 15:12:56 - ERROR - stderr - 99%|█████████▉| 3705/3741 [21:47:02<11:59, 19.98s/it] +2025-05-11 15:12:56 - ERROR - stderr - +2025-05-11 15:12:56 - ERROR - stderr - +2025-05-11 15:12:56 - INFO - stdout - {'loss': 0.4735, 'grad_norm': 0.8867566585540771, 'learning_rate': 4.858531514864151e-09, 'epoch': 2.97} +2025-05-11 15:12:56 - ERROR - stderr - 99%|█████████▉| 3705/3741 [21:47:02<11:59, 19.98s/it] +2025-05-11 15:13:16 - ERROR - stderr - 99%|█████████▉| 3706/3741 [21:47:22<11:40, 20.02s/it] +2025-05-11 15:13:16 - ERROR - stderr - +2025-05-11 15:13:16 - ERROR - stderr - +2025-05-11 15:13:16 - INFO - stdout - {'loss': 0.4656, 'grad_norm': 0.9006369709968567, 'learning_rate': 4.592382338746148e-09, 'epoch': 2.97} +2025-05-11 15:13:16 - ERROR - stderr - 99%|█████████▉| 3706/3741 [21:47:22<11:40, 20.02s/it] +2025-05-11 15:13:37 - ERROR - stderr - 99%|█████████▉| 3707/3741 [21:47:43<11:32, 20.37s/it] +2025-05-11 15:13:37 - ERROR - stderr - +2025-05-11 15:13:37 - ERROR - stderr - +2025-05-11 15:13:37 - INFO - stdout - {'loss': 0.4582, 'grad_norm': 0.8595499396324158, 'learning_rate': 4.3337280596655876e-09, 'epoch': 2.97} +2025-05-11 15:13:37 - ERROR - stderr - 99%|█████████▉| 3707/3741 [21:47:43<11:32, 20.37s/it] +2025-05-11 15:13:57 - ERROR - stderr - 99%|█████████▉| 3708/3741 [21:48:03<11:05, 20.16s/it] +2025-05-11 15:13:57 - ERROR - stderr - +2025-05-11 15:13:57 - ERROR - stderr - +2025-05-11 15:13:57 - INFO - stdout - {'loss': 0.4734, 'grad_norm': 0.927030622959137, 'learning_rate': 4.082568871570658e-09, 'epoch': 2.97} +2025-05-11 15:13:57 - ERROR - stderr - 99%|█████████▉| 3708/3741 [21:48:03<11:05, 20.16s/it] +2025-05-11 15:14:18 - ERROR - stderr - 99%|█████████▉| 3709/3741 [21:48:25<10:57, 20.56s/it] +2025-05-11 15:14:18 - ERROR - stderr - +2025-05-11 15:14:18 - ERROR - stderr - +2025-05-11 15:14:18 - INFO - stdout - {'loss': 0.4759, 'grad_norm': 0.8537117838859558, 'learning_rate': 3.838904962788492e-09, 'epoch': 2.97} +2025-05-11 15:14:18 - ERROR - stderr - 99%|█████████▉| 3709/3741 [21:48:25<10:57, 20.56s/it] +2025-05-11 15:14:38 - ERROR - stderr - 99%|█████████▉| 3710/3741 [21:48:44<10:29, 20.29s/it] +2025-05-11 15:14:38 - ERROR - stderr - +2025-05-11 15:14:38 - ERROR - stderr - +2025-05-11 15:14:38 - INFO - stdout - {'loss': 0.4711, 'grad_norm': 0.8765459656715393, 'learning_rate': 3.602736516027383e-09, 'epoch': 2.98} +2025-05-11 15:14:38 - ERROR - stderr - 99%|█████████▉| 3710/3741 [21:48:44<10:29, 20.29s/it] +2025-05-11 15:15:00 - ERROR - stderr - 99%|█████████▉| 3711/3741 [21:49:06<10:21, 20.73s/it] +2025-05-11 15:15:00 - ERROR - stderr - +2025-05-11 15:15:00 - ERROR - stderr - +2025-05-11 15:15:00 - INFO - stdout - {'loss': 0.4654, 'grad_norm': 0.8582831025123596, 'learning_rate': 3.374063708373454e-09, 'epoch': 2.98} +2025-05-11 15:15:00 - ERROR - stderr - 99%|█████████▉| 3711/3741 [21:49:06<10:21, 20.73s/it] +2025-05-11 15:15:19 - ERROR - stderr - 99%|█████████▉| 3712/3741 [21:49:26<09:52, 20.44s/it] +2025-05-11 15:15:19 - ERROR - stderr - +2025-05-11 15:15:19 - ERROR - stderr - +2025-05-11 15:15:19 - INFO - stdout - {'loss': 0.4866, 'grad_norm': 0.9093928337097168, 'learning_rate': 3.15288671129399e-09, 'epoch': 2.98} +2025-05-11 15:15:19 - ERROR - stderr - 99%|█████████▉| 3712/3741 [21:49:26<09:52, 20.44s/it] +2025-05-11 15:15:42 - ERROR - stderr - 99%|█████████▉| 3713/3741 [21:49:48<09:50, 21.08s/it] +2025-05-11 15:15:42 - ERROR - stderr - +2025-05-11 15:15:42 - ERROR - stderr - +2025-05-11 15:15:42 - INFO - stdout - {'loss': 0.4787, 'grad_norm': 0.9072393178939819, 'learning_rate': 2.9392056906352162e-09, 'epoch': 2.98} +2025-05-11 15:15:42 - ERROR - stderr - 99%|█████████▉| 3713/3741 [21:49:48<09:50, 21.08s/it] +2025-05-11 15:16:01 - ERROR - stderr - 99%|█████████▉| 3714/3741 [21:50:08<09:15, 20.57s/it] +2025-05-11 15:16:01 - ERROR - stderr - +2025-05-11 15:16:01 - ERROR - stderr - +2025-05-11 15:16:01 - INFO - stdout - {'loss': 0.4809, 'grad_norm': 0.8780279159545898, 'learning_rate': 2.7330208066222996e-09, 'epoch': 2.98} +2025-05-11 15:16:01 - ERROR - stderr - 99%|█████████▉| 3714/3741 [21:50:08<09:15, 20.57s/it] +2025-05-11 15:16:23 - ERROR - stderr - 99%|█████████▉| 3715/3741 [21:50:29<09:03, 20.92s/it] +2025-05-11 15:16:23 - ERROR - stderr - +2025-05-11 15:16:23 - ERROR - stderr - +2025-05-11 15:16:23 - INFO - stdout - {'loss': 0.4603, 'grad_norm': 0.8489347100257874, 'learning_rate': 2.5343322138593472e-09, 'epoch': 2.98} +2025-05-11 15:16:23 - ERROR - stderr - 99%|█████████▉| 3715/3741 [21:50:29<09:03, 20.92s/it] +2025-05-11 15:16:43 - ERROR - stderr - 99%|█████████▉| 3716/3741 [21:50:49<08:33, 20.54s/it] +2025-05-11 15:16:43 - ERROR - stderr - +2025-05-11 15:16:43 - ERROR - stderr - +2025-05-11 15:16:43 - INFO - stdout - {'loss': 0.5009, 'grad_norm': 0.9047530889511108, 'learning_rate': 2.3431400613305176e-09, 'epoch': 2.98} +2025-05-11 15:16:43 - ERROR - stderr - 99%|█████████▉| 3716/3741 [21:50:49<08:33, 20.54s/it] +2025-05-11 15:17:03 - ERROR - stderr - 99%|█████████▉| 3717/3741 [21:51:09<08:10, 20.43s/it] +2025-05-11 15:17:03 - ERROR - stderr - +2025-05-11 15:17:03 - ERROR - stderr - +2025-05-11 15:17:03 - INFO - stdout - {'loss': 0.454, 'grad_norm': 0.8335652947425842, 'learning_rate': 2.1594444923978e-09, 'epoch': 2.98} +2025-05-11 15:17:03 - ERROR - stderr - 99%|█████████▉| 3717/3741 [21:51:09<08:10, 20.43s/it] +2025-05-11 15:17:23 - ERROR - stderr - 99%|█████████▉| 3718/3741 [21:51:29<07:44, 20.19s/it] +2025-05-11 15:17:23 - ERROR - stderr - +2025-05-11 15:17:23 - ERROR - stderr - +2025-05-11 15:17:23 - INFO - stdout - {'loss': 0.471, 'grad_norm': 0.8720241189002991, 'learning_rate': 1.983245644802123e-09, 'epoch': 2.98} +2025-05-11 15:17:23 - ERROR - stderr - 99%|█████████▉| 3718/3741 [21:51:29<07:44, 20.19s/it] +2025-05-11 15:17:42 - ERROR - stderr - 99%|█████████▉| 3719/3741 [21:51:48<07:19, 19.99s/it] +2025-05-11 15:17:42 - ERROR - stderr - +2025-05-11 15:17:42 - ERROR - stderr - +2025-05-11 15:17:42 - INFO - stdout - {'loss': 0.4726, 'grad_norm': 0.8841453790664673, 'learning_rate': 1.8145436506633585e-09, 'epoch': 2.98} +2025-05-11 15:17:42 - ERROR - stderr - 99%|█████████▉| 3719/3741 [21:51:48<07:19, 19.99s/it] +2025-05-11 15:18:02 - ERROR - stderr - 99%|█████████▉| 3720/3741 [21:52:08<06:58, 19.91s/it] +2025-05-11 15:18:02 - ERROR - stderr - +2025-05-11 15:18:02 - ERROR - stderr - +2025-05-11 15:18:02 - INFO - stdout - {'loss': 0.4732, 'grad_norm': 0.8820040822029114, 'learning_rate': 1.6533386364814274e-09, 'epoch': 2.98} +2025-05-11 15:18:02 - ERROR - stderr - 99%|█████████▉| 3720/3741 [21:52:08<06:58, 19.91s/it] +2025-05-11 15:18:22 - ERROR - stderr - 99%|█████████▉| 3721/3741 [21:52:28<06:36, 19.83s/it] +2025-05-11 15:18:22 - ERROR - stderr - +2025-05-11 15:18:22 - ERROR - stderr - +2025-05-11 15:18:22 - INFO - stdout - {'loss': 0.4638, 'grad_norm': 0.8319164514541626, 'learning_rate': 1.4996307231307517e-09, 'epoch': 2.98} +2025-05-11 15:18:22 - ERROR - stderr - 99%|█████████▉| 3721/3741 [21:52:28<06:36, 19.83s/it] +2025-05-11 15:18:41 - ERROR - stderr - 99%|█████████▉| 3722/3741 [21:52:47<06:14, 19.69s/it] +2025-05-11 15:18:41 - ERROR - stderr - +2025-05-11 15:18:41 - ERROR - stderr - +2025-05-11 15:18:41 - INFO - stdout - {'loss': 0.4903, 'grad_norm': 0.8456513285636902, 'learning_rate': 1.3534200258691343e-09, 'epoch': 2.98} +2025-05-11 15:18:41 - ERROR - stderr - 99%|█████████▉| 3722/3741 [21:52:47<06:14, 19.69s/it] +2025-05-11 15:19:00 - ERROR - stderr - 100%|█████████▉| 3723/3741 [21:53:07<05:53, 19.64s/it] +2025-05-11 15:19:00 - ERROR - stderr - +2025-05-11 15:19:00 - ERROR - stderr - +2025-05-11 15:19:00 - INFO - stdout - {'loss': 0.4643, 'grad_norm': 0.8635490536689758, 'learning_rate': 1.2147066543288787e-09, 'epoch': 2.99} +2025-05-11 15:19:00 - ERROR - stderr - 100%|█████████▉| 3723/3741 [21:53:07<05:53, 19.64s/it] +2025-05-11 15:19:20 - ERROR - stderr - 100%|█████████▉| 3724/3741 [21:53:26<05:33, 19.59s/it] +2025-05-11 15:19:20 - ERROR - stderr - +2025-05-11 15:19:20 - ERROR - stderr - +2025-05-11 15:19:20 - INFO - stdout - {'loss': 0.4919, 'grad_norm': 0.8873840570449829, 'learning_rate': 1.0834907125223392e-09, 'epoch': 2.99} +2025-05-11 15:19:20 - ERROR - stderr - 100%|█████████▉| 3724/3741 [21:53:26<05:33, 19.59s/it] +2025-05-11 15:19:40 - ERROR - stderr - 100%|█████████▉| 3725/3741 [21:53:46<05:13, 19.60s/it] +2025-05-11 15:19:40 - ERROR - stderr - +2025-05-11 15:19:40 - ERROR - stderr - +2025-05-11 15:19:40 - INFO - stdout - {'loss': 0.4576, 'grad_norm': 0.8440971970558167, 'learning_rate': 9.59772298840811e-10, 'epoch': 2.99} +2025-05-11 15:19:40 - ERROR - stderr - 100%|█████████▉| 3725/3741 [21:53:46<05:13, 19.60s/it] +2025-05-11 15:19:59 - ERROR - stderr - 100%|█████████▉| 3726/3741 [21:54:05<04:54, 19.61s/it] +2025-05-11 15:19:59 - ERROR - stderr - +2025-05-11 15:19:59 - ERROR - stderr - +2025-05-11 15:19:59 - INFO - stdout - {'loss': 0.4615, 'grad_norm': 0.8713566064834595, 'learning_rate': 8.435515060500888e-10, 'epoch': 2.99} +2025-05-11 15:19:59 - ERROR - stderr - 100%|█████████▉| 3726/3741 [21:54:05<04:54, 19.61s/it] +2025-05-11 15:20:19 - ERROR - stderr - 100%|█████████▉| 3727/3741 [21:54:25<04:36, 19.74s/it] +2025-05-11 15:20:19 - ERROR - stderr - +2025-05-11 15:20:19 - ERROR - stderr - +2025-05-11 15:20:19 - INFO - stdout - {'loss': 0.4763, 'grad_norm': 0.877994954586029, 'learning_rate': 7.348284212993495e-10, 'epoch': 2.99} +2025-05-11 15:20:19 - ERROR - stderr - 100%|█████████▉| 3727/3741 [21:54:25<04:36, 19.74s/it] +2025-05-11 15:20:39 - ERROR - stderr - 100%|█████████▉| 3728/3741 [21:54:45<04:15, 19.66s/it] +2025-05-11 15:20:39 - ERROR - stderr - +2025-05-11 15:20:39 - ERROR - stderr - +2025-05-11 15:20:39 - INFO - stdout - {'loss': 0.4877, 'grad_norm': 0.8891599178314209, 'learning_rate': 6.336031261111597e-10, 'epoch': 2.99} +2025-05-11 15:20:39 - ERROR - stderr - 100%|█████████▉| 3728/3741 [21:54:45<04:15, 19.66s/it] +2025-05-11 15:20:58 - ERROR - stderr - 100%|█████████▉| 3729/3741 [21:55:05<03:55, 19.65s/it] +2025-05-11 15:20:58 - ERROR - stderr - +2025-05-11 15:20:58 - ERROR - stderr - +2025-05-11 15:20:58 - INFO - stdout - {'loss': 0.4856, 'grad_norm': 0.8762668967247009, 'learning_rate': 5.398756963881368e-10, 'epoch': 2.99} +2025-05-11 15:20:58 - ERROR - stderr - 100%|█████████▉| 3729/3741 [21:55:05<03:55, 19.65s/it] +2025-05-11 15:21:18 - ERROR - stderr - 100%|█████████▉| 3730/3741 [21:55:24<03:36, 19.71s/it] +2025-05-11 15:21:18 - ERROR - stderr - +2025-05-11 15:21:18 - ERROR - stderr - +2025-05-11 15:21:18 - INFO - stdout - {'loss': 0.4847, 'grad_norm': 0.8617690205574036, 'learning_rate': 4.5364620240961885e-10, 'epoch': 2.99} +2025-05-11 15:21:18 - ERROR - stderr - 100%|█████████▉| 3730/3741 [21:55:24<03:36, 19.71s/it] +2025-05-11 15:21:38 - ERROR - stderr - 100%|█████████▉| 3731/3741 [21:55:44<03:17, 19.73s/it] +2025-05-11 15:21:38 - ERROR - stderr - +2025-05-11 15:21:38 - ERROR - stderr - +2025-05-11 15:21:38 - INFO - stdout - {'loss': 0.4549, 'grad_norm': 0.8446850180625916, 'learning_rate': 3.749147088349947e-10, 'epoch': 2.99} +2025-05-11 15:21:38 - ERROR - stderr - 100%|█████████▉| 3731/3741 [21:55:44<03:17, 19.73s/it] +2025-05-11 15:21:58 - ERROR - stderr - 100%|█████████▉| 3732/3741 [21:56:04<02:58, 19.81s/it] +2025-05-11 15:21:58 - ERROR - stderr - +2025-05-11 15:21:58 - ERROR - stderr - +2025-05-11 15:21:58 - INFO - stdout - {'loss': 0.4741, 'grad_norm': 0.9086197018623352, 'learning_rate': 3.0368127469815324e-10, 'epoch': 2.99} +2025-05-11 15:21:58 - ERROR - stderr - 100%|█████████▉| 3732/3741 [21:56:04<02:58, 19.81s/it] +2025-05-11 15:22:18 - ERROR - stderr - 100%|█████████▉| 3733/3741 [21:56:24<02:38, 19.86s/it] +2025-05-11 15:22:18 - ERROR - stderr - +2025-05-11 15:22:18 - ERROR - stderr - +2025-05-11 15:22:18 - INFO - stdout - {'loss': 0.4584, 'grad_norm': 0.9534429907798767, 'learning_rate': 2.399459534130344e-10, 'epoch': 2.99} +2025-05-11 15:22:18 - ERROR - stderr - 100%|█████████▉| 3733/3741 [21:56:24<02:38, 19.86s/it] +2025-05-11 15:22:39 - ERROR - stderr - 100%|█████████▉| 3734/3741 [21:56:46<02:22, 20.38s/it] +2025-05-11 15:22:39 - ERROR - stderr - +2025-05-11 15:22:39 - ERROR - stderr - +2025-05-11 15:22:39 - INFO - stdout - {'loss': 0.4728, 'grad_norm': 0.8938235640525818, 'learning_rate': 1.8370879277140874e-10, 'epoch': 2.99} +2025-05-11 15:22:39 - ERROR - stderr - 100%|█████████▉| 3734/3741 [21:56:46<02:22, 20.38s/it] +2025-05-11 15:22:59 - ERROR - stderr - 100%|█████████▉| 3735/3741 [21:57:05<02:00, 20.13s/it] +2025-05-11 15:22:59 - ERROR - stderr - +2025-05-11 15:22:59 - ERROR - stderr - +2025-05-11 15:22:59 - INFO - stdout - {'loss': 0.4659, 'grad_norm': 0.8239015936851501, 'learning_rate': 1.3496983493954673e-10, 'epoch': 3.0} +2025-05-11 15:22:59 - ERROR - stderr - 100%|█████████▉| 3735/3741 [21:57:05<02:00, 20.13s/it] +2025-05-11 15:23:19 - ERROR - stderr - 100%|█████████▉| 3736/3741 [21:57:25<01:40, 20.13s/it] +2025-05-11 15:23:19 - ERROR - stderr - +2025-05-11 15:23:19 - ERROR - stderr - +2025-05-11 15:23:19 - INFO - stdout - {'loss': 0.4637, 'grad_norm': 0.8396766781806946, 'learning_rate': 9.372911646599037e-11, 'epoch': 3.0} +2025-05-11 15:23:19 - ERROR - stderr - 100%|█████████▉| 3736/3741 [21:57:25<01:40, 20.13s/it] +2025-05-11 15:23:39 - ERROR - stderr - 100%|█████████▉| 3737/3741 [21:57:45<01:20, 20.04s/it] +2025-05-11 15:23:39 - ERROR - stderr - +2025-05-11 15:23:39 - ERROR - stderr - +2025-05-11 15:23:39 - INFO - stdout - {'loss': 0.4648, 'grad_norm': 0.8573855757713318, 'learning_rate': 5.998666827378153e-11, 'epoch': 3.0} +2025-05-11 15:23:39 - ERROR - stderr - 100%|█████████▉| 3737/3741 [21:57:45<01:20, 20.04s/it] +2025-05-11 15:23:59 - ERROR - stderr - 100%|█████████▉| 3738/3741 [21:58:05<00:59, 19.91s/it] +2025-05-11 15:23:59 - ERROR - stderr - +2025-05-11 15:23:59 - ERROR - stderr - +2025-05-11 15:23:59 - INFO - stdout - {'loss': 0.4768, 'grad_norm': 0.8772425055503845, 'learning_rate': 3.3742515662682496e-11, 'epoch': 3.0} +2025-05-11 15:23:59 - ERROR - stderr - 100%|█████████▉| 3738/3741 [21:58:05<00:59, 19.91s/it] +2025-05-11 15:24:18 - ERROR - stderr - 100%|█████████▉| 3739/3741 [21:58:25<00:39, 19.91s/it] +2025-05-11 15:24:19 - ERROR - stderr - +2025-05-11 15:24:19 - ERROR - stderr - +2025-05-11 15:24:19 - INFO - stdout - {'loss': 0.4722, 'grad_norm': 0.9464996457099915, 'learning_rate': 1.4996678313616842e-11, 'epoch': 3.0} +2025-05-11 15:24:19 - ERROR - stderr - 100%|█████████▉| 3739/3741 [21:58:25<00:39, 19.91s/it] +2025-05-11 15:24:38 - ERROR - stderr - 100%|█████████▉| 3740/3741 [21:58:45<00:19, 19.90s/it] +2025-05-11 15:24:38 - ERROR - stderr - +2025-05-11 15:24:38 - ERROR - stderr - +2025-05-11 15:24:38 - INFO - stdout - {'loss': 0.4938, 'grad_norm': 0.9083192944526672, 'learning_rate': 3.749170280897829e-12, 'epoch': 3.0} +2025-05-11 15:24:38 - ERROR - stderr - 100%|█████████▉| 3740/3741 [21:58:45<00:19, 19.90s/it] +2025-05-11 15:24:57 - ERROR - stderr - 100%|██████████| 3741/3741 [21:59:03<00:00, 19.48s/it] +2025-05-11 15:24:57 - ERROR - stderr - +2025-05-11 15:24:57 - ERROR - stderr - +2025-05-11 15:24:57 - INFO - stdout - {'loss': 0.3898, 'grad_norm': 0.7624624967575073, 'learning_rate': 0.0, 'epoch': 3.0} +2025-05-11 15:24:57 - ERROR - stderr - 100%|██████████| 3741/3741 [21:59:03<00:00, 19.48s/it] +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - INFO - transformers.trainer - + +Training completed. Do not forget to share your model on huggingface.co/models =) + + +2025-05-11 15:24:57 - ERROR - stderr - +2025-05-11 15:24:57 - ERROR - stderr - +2025-05-11 15:24:57 - INFO - stdout - {'train_runtime': 79144.443, 'train_samples_per_second': 18.139, 'train_steps_per_second': 0.047, 'train_loss': 0.7324594869090414, 'epoch': 3.0} +2025-05-11 15:24:57 - ERROR - stderr - 100%|██████████| 3741/3741 [21:59:03<00:00, 19.48s/it] +2025-05-11 15:24:57 - ERROR - stderr - 100%|██████████| 3741/3741 [21:59:03<00:00, 21.16s/it] +2025-05-11 15:24:57 - ERROR - stderr - +2025-05-11 15:25:22 - INFO - transformers.trainer - Saving model checkpoint to outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516 +2025-05-11 15:25:22 - INFO - transformers.trainer - Saving model checkpoint to outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516 +2025-05-11 15:25:22 - INFO - transformers.configuration_utils - Configuration saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/config.json +2025-05-11 15:25:22 - INFO - transformers.configuration_utils - Configuration saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/config.json +2025-05-11 15:25:22 - INFO - transformers.generation.configuration_utils - Configuration saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/generation_config.json +2025-05-11 15:25:22 - INFO - transformers.generation.configuration_utils - Configuration saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/generation_config.json +2025-05-11 15:26:20 - INFO - transformers.modeling_utils - The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 11 checkpoint shards. You can find where each parameters has been saved in the index located at outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/model.safetensors.index.json. +2025-05-11 15:26:20 - INFO - transformers.modeling_utils - The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 11 checkpoint shards. You can find where each parameters has been saved in the index located at outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/model.safetensors.index.json. +2025-05-11 15:26:20 - INFO - transformers.tokenization_utils_base - tokenizer config file saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/tokenizer_config.json +2025-05-11 15:26:20 - INFO - transformers.tokenization_utils_base - tokenizer config file saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/tokenizer_config.json +2025-05-11 15:26:20 - INFO - transformers.tokenization_utils_base - Special tokens file saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/special_tokens_map.json +2025-05-11 15:26:20 - INFO - transformers.tokenization_utils_base - Special tokens file saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/special_tokens_map.json +2025-05-11 15:26:20 - INFO - transformers.tokenization_utils_base - added tokens file saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/added_tokens.json +2025-05-11 15:26:20 - INFO - transformers.tokenization_utils_base - added tokens file saved in outputs/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/LLaNA_13B_train_stage2_recipe3_shapenerf_objanerf_AUGMENTED_nf2seq_spatial516/added_tokens.json +2025-05-11 15:26:22 - INFO - wandb.sdk.mailbox.mailbox - Closing mailbox, abandoning 0 handles. +2025-05-11 15:26:22 - INFO - wandb.sdk.mailbox.mailbox - Closing mailbox, abandoning 0 handles.